http://www.dalsoft.com dco is a compiler post-processor asm->asm cc -c -O3 foo.c .c.o: $(CC) $(CFLAGS) -c $< cc -S -O3 foo.c dco foo.s -o ofoo.s mv ofoo.s foo.s cc -c foo.s rm foo.s .c.o: $(CC) $(CFLAGS) -S $< dco $*.s -o $*.s $(DOPTS) $(CC) $(CFLAGS) -c $*.s rm $*.s 1 http://www.dalsoft.com 2 http://www.dalsoft.com allows selective optimization; optimizes optimized code; takes full advantage of the options and features provided by the target processor. x86 offers the 'Single Instruction Multiple Data' (SIMD) instructions. The part of dco utilizing these instructions is called SIMDinator (pronounced seem-d-ney-ter). 3 http://www.dalsoft.com addsd mulsd addpd mulpd 4 http://www.dalsoft.com shufpd unpcklpd 5 http://www.dalsoft.com 6 http://www.dalsoft.com SIMDinator makes code faster Kernel# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 Geometric Mean gcc 4.1.2 gcc+dco -np gcc+dco/gcc -np/gcc -np/gcc+dco 4.96 2.38 5.93 4.66 5.2 4.53 4.87 5 4.6 4.94 5.78 5.18 4.57 4.71 3.72 5.61 5.01 4.7 5.81 4.53 4.88 4.88 4.17 4.85 3.32 2.32 3.55 4.12 2.07 3.9 3.12 5.4 3.95 3.87 1.52 4.98 5.56 4.71 3.72 5.31 4.99 3.96 4.1 4.43 4.61 6.21 4.09 0.77 4 2.32 3.84 3.8 2.07 3.63 3.96 3.88 4.23 3.38 1.52 4.39 4.58 4.26 3.72 5.29 4.99 3.95 4.1 4.43 4.61 4.86 4.09 0.77 33.06% 2.52% 40.13% 11.59% 60.19% 13.91% 35.93% -8.00% 14.13% 21.66% 73.70% 3.86% -21.66% 0.00% 0.00% 5.35% 0.40% 15.74% 29.43% 2.21% 5.53% -27.25% 1.92% 84.12% 19.35% 2.52% 35.24% 18.45% 60.19% 19.87% 18.69% 22.40% 8.04% 31.58% 73.70% 15.25% -0.22% 9.55% 0.00% 5.70% 0.40% 15.96% 29.43% 2.21% 5.53% 0.41% 1.92% 84.12% -20.48% 0.00% -8.17% 7.77% 0.00% 6.92% -26.92% 28.15% -7.09% 12.66% 0.00% 11.85% 17.63% 9.55% 0.00% 0.38% 0.00% 0.25% 0.00% 0.00% 0.00% 21.74% 0.00% 0.00% 4.75 3.64 3.53 23.37% 25.68% 3.02% 7 http://www.dalsoft.com 36% faster dco generated code compiler generated code .L1012: leal as1+32032(,%edx,8),%eax unpcklpd %xmm3,%xmm3 unpcklpd %xmm4,%xmm4 unpcklpd %xmm5,%xmm5 ___dcox86_wl_0_: movsd
-31992(%eax),%xmm2 addl $2,%edx movhpd -32000(%eax),%xmm2 addl $16,%eax cmpl %ecx,%edx movsd -32000(%eax),%xmm7 movhpd -32008(%eax),%xmm7 movsd -32032(%eax),%xmm6 movhpd -32040(%eax),%xmm6 movsd -8(%eax),%xmm1 movhpd -16(%eax),%xmm1 mulpd %xmm4,%xmm2 movsd 8000(%eax),%xmm0 movhpd 7992(%eax),%xmm0 addpd %xmm7,%xmm2 movsd -32024(%eax),%xmm7 movhpd -32032(%eax),%xmm7 mulpd %xmm3,%xmm6 mulpd %xmm4,%xmm2 mulpd %xmm3,%xmm1 addpd %xmm7,%xmm6 movsd -31992(%eax),%xmm7 movhpd -32000(%eax),%xmm7 addpd %xmm0,%xmm1 movsd -32040(%eax),%xmm0 movhpd -32048(%eax),%xmm0 mulpd %xmm3,%xmm6 mulpd %xmm3,%xmm1 addpd %xmm7,%xmm2 movsd -32016(%eax),%xmm7 movhpd -32024(%eax),%xmm7 addpd %xmm0,%xmm1 mulpd %xmm5,%xmm2 addpd %xmm7,%xmm6 addpd %xmm2,%xmm6 mulpd %xmm5,%xmm6 addpd %xmm6,%xmm1 movhpd %xmm1,-8024(%eax) movsd %xmm1,-8016(%eax) jne ___dcox86_wl_0_ .L1012: leal 1(%edx), %esi movapd %xmm3, %xmm6 mulsd as1+32032(,%edx,8), %xmm6 addsd as1+40040(,%edx,8), %xmm6 mulsd %xmm3, %xmm6 addsd as1(,%edx,8), %xmm6 movapd %xmm3, %xmm1 mulsd as1(,%esi,8), %xmm1 addsd as1+16(,%edx,8), %xmm1 mulsd %xmm3, %xmm1 addsd as1+24(,%edx,8), %xmm1 movapd %xmm4, %xmm0 mulsd as1+24(,%esi,8), %xmm0 addsd as1+32(,%esi,8), %xmm0 mulsd %xmm4, %xmm0 addsd as1+40(,%esi,8), %xmm0 mulsd %xmm5, %xmm0 addsd %xmm0, %xmm1 mulsd %xmm5, %xmm1 addsd %xmm1, %xmm6 movsd %xmm6, as1+24016(,%esi,8) addl $2, %edx movapd %xmm3, %xmm6 mulsd as1+32032(,%esi,8), %xmm6 addsd as1+40040(,%esi,8), %xmm6 mulsd %xmm3, %xmm6 addsd as1(,%esi,8), %xmm6 movapd %xmm3, %xmm7 mulsd as1(,%edx,8), %xmm7 addsd as1+16(,%esi,8), %xmm7 mulsd %xmm3, %xmm7 addsd as1+24(,%esi,8), %xmm7 movapd %xmm4, %xmm2 mulsd as1+24(,%edx,8), %xmm2 addsd as1+32(,%edx,8), %xmm2 mulsd %xmm4, %xmm2 addsd as1+40(,%edx,8), %xmm2 mulsd %xmm5, %xmm2 addsd %xmm2, %xmm7 mulsd %xmm5, %xmm7 addsd %xmm7, %xmm6 movsd %xmm6, as1+24016(,%edx,8) cmpl %ecx, %edx jne .L1012 8 http://www.dalsoft.com Kernel# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 Geometric Mean 
SIMDinator makes code slower gcc 4.1.2 gcc+dco -np gcc+dco/gcc -np/gcc -np/gcc+dco 4.96 2.38 5.93 4.66 5.2 4.53 4.87 5 4.6 4.94 5.78 5.18 4.57 4.71 3.72 5.61 5.01 4.7 5.81 4.53 4.88 4.88 4.17 4.85 3.32 2.32 3.55 4.12 2.07 3.9 3.12 5.4 3.95 3.87 1.52 4.98 5.56 4.71 3.72 5.31 4.99 3.96 4.1 4.43 4.61 6.21 4.09 0.77 4 2.32 3.84 3.8 2.07 3.63 3.96 3.88 4.23 3.38 1.52 4.39 4.58 4.26 3.72 5.29 4.99 3.95 4.1 4.43 4.61 4.86 4.09 0.77 33.06% 2.52% 40.13% 11.59% 60.19% 13.91% 35.93% -8.00% 14.13% 21.66% 73.70% 3.86% -21.66% 0.00% 0.00% 5.35% 0.40% 15.74% 29.43% 2.21% 5.53% -27.25% 1.92% 84.12% 19.35% 2.52% 35.24% 18.45% 60.19% 19.87% 18.69% 22.40% 8.04% 31.58% 73.70% 15.25% -0.22% 9.55% 0.00% 5.70% 0.40% 15.96% 29.43% 2.21% 5.53% 0.41% 1.92% 84.12% -20.48% 0.00% -8.17% 7.77% 0.00% 6.92% -26.92% 28.15% -7.09% 12.66% 0.00% 11.85% 17.63% 9.55% 0.00% 0.38% 0.00% 0.25% 0.00% 0.00% 0.00% 21.74% 0.00% 0.00% 4.75 3.64 3.53 23.37% 25.68% 3.02% 9 http://www.dalsoft.com Kernel# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 Geometric Mean SIMDinator prevents better code generation gcc 4.1.2 gcc+dco -np gcc+dco/gcc -np/gcc -np/gcc+dco 4.96 2.38 5.93 4.66 5.2 4.53 4.87 5 4.6 4.94 5.78 5.18 4.57 4.71 3.72 5.61 5.01 4.7 5.81 4.53 4.88 4.88 4.17 4.85 3.32 2.32 3.55 4.12 2.07 3.9 3.12 5.4 3.95 3.87 1.52 4.98 5.56 4.71 3.72 5.31 4.99 3.96 4.1 4.43 4.61 6.21 4.09 0.77 4 2.32 3.84 3.8 2.07 3.63 3.96 3.88 4.23 3.38 1.52 4.39 4.58 4.26 3.72 5.29 4.99 3.95 4.1 4.43 4.61 4.86 4.09 0.77 33.06% 2.52% 40.13% 11.59% 60.19% 13.91% 35.93% -8.00% 14.13% 21.66% 73.70% 3.86% -21.66% 0.00% 0.00% 5.35% 0.40% 15.74% 29.43% 2.21% 5.53% -27.25% 1.92% 84.12% 19.35% 2.52% 35.24% 18.45% 60.19% 19.87% 18.69% 22.40% 8.04% 31.58% 73.70% 15.25% -0.22% 9.55% 0.00% 5.70% 0.40% 15.96% 29.43% 2.21% 5.53% 0.41% 1.92% 84.12% -20.48% 0.00% -8.17% 7.77% 0.00% 6.92% -26.92% 28.15% -7.09% 12.66% 0.00% 11.85% 17.63% 9.55% 0.00% 0.38% 0.00% 0.25% 0.00% 0.00% 0.00% 21.74% 0.00% 0.00% 4.75 3.64 3.53 23.37% 25.68% 
3.02% 10 http://www.dalsoft.com faster 29.00% slower 12.50% prevents 25.00% 11 http://www.dalsoft.com Kernel# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 Geometric Mean gcc 4.1.2 gcc+dco icc gcc+dco/gcc icc/gcc icc/gcc+dco 4.96 2.38 5.93 4.66 5.2 4.53 4.87 5 4.6 4.94 5.78 5.18 4.57 4.71 3.72 5.61 5.01 4.7 5.81 4.53 4.88 4.17 4.85 3.32 2.32 3.55 3.8 2.07 3.63 3.12 3.88 3.95 3.38 1.52 4.39 4.58 4.26 3.72 5.29 4.99 3.95 4.1 4.43 4.61 4.09 0.77 2.96 2.28 2.54 4.63 2.38 3.87 2.57 3.25 4.97 4.32 1.65 4.13 4.61 2.3 3.67 5.66 4.86 3.45 6.77 4.38 1.05 4.67 1.66 33.06% 2.52% 40.13% 18.45% 60.19% 19.87% 35.93% 22.40% 14.13% 31.58% 73.70% 15.25% -0.22% 9.55% 0.00% 5.70% 0.40% 15.96% 29.43% 2.21% 5.53% 1.92% 84.12% 40.32% 4.20% 57.17% 0.64% 54.23% 14.57% 47.23% 35.00% -8.04% 12.55% 71.45% 20.27% -0.88% 51.17% 1.34% -0.89% 2.99% 26.60% -16.52% 3.31% 78.48% -11.99% 65.77% 10.84% 1.72% 28.45% -21.84% -14.98% -6.61% 17.63% 16.24% -25.82% -27.81% -8.55% 5.92% -0.66% 46.01% 1.34% -6.99% 2.61% 12.66% -65.12% 1.13% 77.22% -14.18% -115.58% 4.74 3.4 3.29 28.27% 30.56% 3.19% Number of cases icc 5 8 dco 10 12 http://www.dalsoft.com Compiler-generated information to aid optimization: aliasing; failed register allocations; alignment. 13 http://www.dalsoft.com Thank you! 14