Presentation

http://www.dalsoft.com
dco is a compiler post-processor
asm->asm
cc -c -O3 foo.c
.c.o:
$(CC) $(CFLAGS) -c $<
cc -S -O3 foo.c
dco foo.s -o ofoo.s
mv ofoo.s foo.s
cc -c foo.s
rm foo.s
.c.o:
$(CC) $(CFLAGS) -S $<
dco $*.s -o $*.s $(DOPTS)
$(CC) $(CFLAGS) -c $*.s
rm $*.s
1
http://www.dalsoft.com
2
http://www.dalsoft.com
allows selective optimization
optimizes optimized code
takes full advantage of the options and features
provided by the target processor
x86 offers 'Single Instruction, Multiple Data'
(SIMD) instructions. The part of dco that utilizes these
instructions is called
SIMDinator (pronounced seem-d-ney-ter)
3
http://www.dalsoft.com
addsd
mulsd
addpd
mulpd
4
http://www.dalsoft.com
shufpd
unpcklpd
5
http://www.dalsoft.com
6
http://www.dalsoft.com
SIMDinator makes code faster
Kernel#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Geometric
Mean
gcc 4.1.2
gcc+dco
-np
gcc+dco/gcc
-np/gcc
-np/gcc+dco
4.96
2.38
5.93
4.66
5.2
4.53
4.87
5
4.6
4.94
5.78
5.18
4.57
4.71
3.72
5.61
5.01
4.7
5.81
4.53
4.88
4.88
4.17
4.85
3.32
2.32
3.55
4.12
2.07
3.9
3.12
5.4
3.95
3.87
1.52
4.98
5.56
4.71
3.72
5.31
4.99
3.96
4.1
4.43
4.61
6.21
4.09
0.77
4
2.32
3.84
3.8
2.07
3.63
3.96
3.88
4.23
3.38
1.52
4.39
4.58
4.26
3.72
5.29
4.99
3.95
4.1
4.43
4.61
4.86
4.09
0.77
33.06%
2.52%
40.13%
11.59%
60.19%
13.91%
35.93%
-8.00%
14.13%
21.66%
73.70%
3.86%
-21.66%
0.00%
0.00%
5.35%
0.40%
15.74%
29.43%
2.21%
5.53%
-27.25%
1.92%
84.12%
19.35%
2.52%
35.24%
18.45%
60.19%
19.87%
18.69%
22.40%
8.04%
31.58%
73.70%
15.25%
-0.22%
9.55%
0.00%
5.70%
0.40%
15.96%
29.43%
2.21%
5.53%
0.41%
1.92%
84.12%
-20.48%
0.00%
-8.17%
7.77%
0.00%
6.92%
-26.92%
28.15%
-7.09%
12.66%
0.00%
11.85%
17.63%
9.55%
0.00%
0.38%
0.00%
0.25%
0.00%
0.00%
0.00%
21.74%
0.00%
0.00%
4.75
3.64
3.53
23.37%
25.68%
3.02%
7
http://www.dalsoft.com
36% faster
dco-generated code
compiler-generated code
.L1012:
leal as1+32032(,%edx,8),%eax
unpcklpd %xmm3,%xmm3
unpcklpd %xmm4,%xmm4
unpcklpd %xmm5,%xmm5
___dcox86_wl_0_:
movsd -31992(%eax),%xmm2
addl $2,%edx
movhpd -32000(%eax),%xmm2
addl $16,%eax
cmpl %ecx,%edx
movsd -32000(%eax),%xmm7
movhpd -32008(%eax),%xmm7
movsd -32032(%eax),%xmm6
movhpd -32040(%eax),%xmm6
movsd -8(%eax),%xmm1
movhpd -16(%eax),%xmm1
mulpd %xmm4,%xmm2
movsd 8000(%eax),%xmm0
movhpd 7992(%eax),%xmm0
addpd %xmm7,%xmm2
movsd -32024(%eax),%xmm7
movhpd -32032(%eax),%xmm7
mulpd %xmm3,%xmm6
mulpd %xmm4,%xmm2
mulpd %xmm3,%xmm1
addpd %xmm7,%xmm6
movsd -31992(%eax),%xmm7
movhpd -32000(%eax),%xmm7
addpd %xmm0,%xmm1
movsd -32040(%eax),%xmm0
movhpd -32048(%eax),%xmm0
mulpd %xmm3,%xmm6
mulpd %xmm3,%xmm1
addpd %xmm7,%xmm2
movsd -32016(%eax),%xmm7
movhpd -32024(%eax),%xmm7
addpd %xmm0,%xmm1
mulpd %xmm5,%xmm2
addpd %xmm7,%xmm6
addpd %xmm2,%xmm6
mulpd %xmm5,%xmm6
addpd %xmm6,%xmm1
movhpd %xmm1,-8024(%eax)
movsd %xmm1,-8016(%eax)
jne ___dcox86_wl_0_
.L1012:
leal 1(%edx), %esi
movapd %xmm3, %xmm6
mulsd as1+32032(,%edx,8), %xmm6
addsd as1+40040(,%edx,8), %xmm6
mulsd %xmm3, %xmm6
addsd as1(,%edx,8), %xmm6
movapd %xmm3, %xmm1
mulsd as1(,%esi,8), %xmm1
addsd as1+16(,%edx,8), %xmm1
mulsd %xmm3, %xmm1
addsd as1+24(,%edx,8), %xmm1
movapd %xmm4, %xmm0
mulsd as1+24(,%esi,8), %xmm0
addsd as1+32(,%esi,8), %xmm0
mulsd %xmm4, %xmm0
addsd as1+40(,%esi,8), %xmm0
mulsd %xmm5, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm5, %xmm1
addsd %xmm1, %xmm6
movsd %xmm6, as1+24016(,%esi,8)
addl $2, %edx
movapd %xmm3, %xmm6
mulsd as1+32032(,%esi,8), %xmm6
addsd as1+40040(,%esi,8), %xmm6
mulsd %xmm3, %xmm6
addsd as1(,%esi,8), %xmm6
movapd %xmm3, %xmm7
mulsd as1(,%edx,8), %xmm7
addsd as1+16(,%esi,8), %xmm7
mulsd %xmm3, %xmm7
addsd as1+24(,%esi,8), %xmm7
movapd %xmm4, %xmm2
mulsd as1+24(,%edx,8), %xmm2
addsd as1+32(,%edx,8), %xmm2
mulsd %xmm4, %xmm2
addsd as1+40(,%edx,8), %xmm2
mulsd %xmm5, %xmm2
addsd %xmm2, %xmm7
mulsd %xmm5, %xmm7
addsd %xmm7, %xmm6
movsd %xmm6, as1+24016(,%edx,8)
cmpl %ecx, %edx
jne .L1012
8
http://www.dalsoft.com
Kernel#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Geometric
Mean
SIMDinator makes code slower
gcc 4.1.2
gcc+dco
-np
gcc+dco/gcc
-np/gcc
-np/gcc+dco
4.96
2.38
5.93
4.66
5.2
4.53
4.87
5
4.6
4.94
5.78
5.18
4.57
4.71
3.72
5.61
5.01
4.7
5.81
4.53
4.88
4.88
4.17
4.85
3.32
2.32
3.55
4.12
2.07
3.9
3.12
5.4
3.95
3.87
1.52
4.98
5.56
4.71
3.72
5.31
4.99
3.96
4.1
4.43
4.61
6.21
4.09
0.77
4
2.32
3.84
3.8
2.07
3.63
3.96
3.88
4.23
3.38
1.52
4.39
4.58
4.26
3.72
5.29
4.99
3.95
4.1
4.43
4.61
4.86
4.09
0.77
33.06%
2.52%
40.13%
11.59%
60.19%
13.91%
35.93%
-8.00%
14.13%
21.66%
73.70%
3.86%
-21.66%
0.00%
0.00%
5.35%
0.40%
15.74%
29.43%
2.21%
5.53%
-27.25%
1.92%
84.12%
19.35%
2.52%
35.24%
18.45%
60.19%
19.87%
18.69%
22.40%
8.04%
31.58%
73.70%
15.25%
-0.22%
9.55%
0.00%
5.70%
0.40%
15.96%
29.43%
2.21%
5.53%
0.41%
1.92%
84.12%
-20.48%
0.00%
-8.17%
7.77%
0.00%
6.92%
-26.92%
28.15%
-7.09%
12.66%
0.00%
11.85%
17.63%
9.55%
0.00%
0.38%
0.00%
0.25%
0.00%
0.00%
0.00%
21.74%
0.00%
0.00%
4.75
3.64
3.53
23.37%
25.68%
3.02%
9
http://www.dalsoft.com
Kernel#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
Geometric
Mean
SIMDinator prevents better code generation
gcc 4.1.2
gcc+dco
-np
gcc+dco/gcc
-np/gcc
-np/gcc+dco
4.96
2.38
5.93
4.66
5.2
4.53
4.87
5
4.6
4.94
5.78
5.18
4.57
4.71
3.72
5.61
5.01
4.7
5.81
4.53
4.88
4.88
4.17
4.85
3.32
2.32
3.55
4.12
2.07
3.9
3.12
5.4
3.95
3.87
1.52
4.98
5.56
4.71
3.72
5.31
4.99
3.96
4.1
4.43
4.61
6.21
4.09
0.77
4
2.32
3.84
3.8
2.07
3.63
3.96
3.88
4.23
3.38
1.52
4.39
4.58
4.26
3.72
5.29
4.99
3.95
4.1
4.43
4.61
4.86
4.09
0.77
33.06%
2.52%
40.13%
11.59%
60.19%
13.91%
35.93%
-8.00%
14.13%
21.66%
73.70%
3.86%
-21.66%
0.00%
0.00%
5.35%
0.40%
15.74%
29.43%
2.21%
5.53%
-27.25%
1.92%
84.12%
19.35%
2.52%
35.24%
18.45%
60.19%
19.87%
18.69%
22.40%
8.04%
31.58%
73.70%
15.25%
-0.22%
9.55%
0.00%
5.70%
0.40%
15.96%
29.43%
2.21%
5.53%
0.41%
1.92%
84.12%
-20.48%
0.00%
-8.17%
7.77%
0.00%
6.92%
-26.92%
28.15%
-7.09%
12.66%
0.00%
11.85%
17.63%
9.55%
0.00%
0.38%
0.00%
0.25%
0.00%
0.00%
0.00%
21.74%
0.00%
0.00%
4.75
3.64
3.53
23.37%
25.68%
3.02%
10
http://www.dalsoft.com
faster 29.00%
slower 12.50%
prevents 25.00%
11
http://www.dalsoft.com
Kernel#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
23
24
Geometric
Mean
gcc 4.1.2
gcc+dco
icc
gcc+dco/gcc
icc/gcc
icc/gcc+dco
4.96
2.38
5.93
4.66
5.2
4.53
4.87
5
4.6
4.94
5.78
5.18
4.57
4.71
3.72
5.61
5.01
4.7
5.81
4.53
4.88
4.17
4.85
3.32
2.32
3.55
3.8
2.07
3.63
3.12
3.88
3.95
3.38
1.52
4.39
4.58
4.26
3.72
5.29
4.99
3.95
4.1
4.43
4.61
4.09
0.77
2.96
2.28
2.54
4.63
2.38
3.87
2.57
3.25
4.97
4.32
1.65
4.13
4.61
2.3
3.67
5.66
4.86
3.45
6.77
4.38
1.05
4.67
1.66
33.06%
2.52%
40.13%
18.45%
60.19%
19.87%
35.93%
22.40%
14.13%
31.58%
73.70%
15.25%
-0.22%
9.55%
0.00%
5.70%
0.40%
15.96%
29.43%
2.21%
5.53%
1.92%
84.12%
40.32%
4.20%
57.17%
0.64%
54.23%
14.57%
47.23%
35.00%
-8.04%
12.55%
71.45%
20.27%
-0.88%
51.17%
1.34%
-0.89%
2.99%
26.60%
-16.52%
3.31%
78.48%
-11.99%
65.77%
10.84%
1.72%
28.45%
-21.84%
-14.98%
-6.61%
17.63%
16.24%
-25.82%
-27.81%
-8.55%
5.92%
-0.66%
46.01%
1.34%
-6.99%
2.61%
12.66%
-65.12%
1.13%
77.22%
-14.18%
-115.58%
4.74
3.4
3.29
28.27%
30.56%
3.19%
Number of cases
icc 5
8
dco
10
12
http://www.dalsoft.com
Compiler-generated information to aid optimization:
aliasing
failed register allocations
alignment
13
http://www.dalsoft.com
Thank you!
14