leal -1(,%edx),%ecx xorl %eax,%eax pxor %xmm1,%xmm1 addl $-2,%ecx jg ___dcox86_lu_0_ |
Code marked in this color establishes %ecx as the loop count and checks whether the loop has at least 4 iterations; if so, it jumps to ___dcox86_lu_0_ to execute the unrolled loop. |
subl $-3,%ecx jmp ___dcox86_wl_4_ |
Not enough iterations for the unrolled loop - readjust the count in %ecx and jump to ___dcox86_wl_4_ to execute the non-unrolled loop. |
___dcox86_lu_0_: pxor %xmm2,%xmm2 unpcklpd %xmm2,%xmm1 pxor %xmm0,%xmm0 movl %ebx,%ebp andl $15,%ebp jne ___dcox86_wl_3_ ___dcox86_wl_5_: movsd (%esi,%eax,8),%xmm5 addl $4,%eax movhpd -24(%esi,%eax,8),%xmm5 addl $-4,%ecx movsd -16(%esi,%eax,8),%xmm6 movhpd -8(%esi,%eax,8),%xmm6 mulpd -32(%ebx,%eax,8),%xmm5 mulpd -16(%ebx,%eax,8),%xmm6 addpd %xmm5,%xmm1 addpd %xmm6,%xmm0 jg ___dcox86_wl_5_ jmp ___dcox86_cc_7_ ___dcox86_wl_3_: movsd (%ebx,%eax,8),%xmm7 addl $4,%eax movhpd -24(%ebx,%eax,8),%xmm7 addl $-4,%ecx movsd -32(%esi,%eax,8),%xmm3 movhpd -24(%esi,%eax,8),%xmm3 movsd -16(%ebx,%eax,8),%xmm4 movhpd -8(%ebx,%eax,8),%xmm4 movsd -16(%esi,%eax,8),%xmm6 movhpd -8(%esi,%eax,8),%xmm6 mulpd %xmm3,%xmm7 mulpd %xmm6,%xmm4 addpd %xmm7,%xmm1 addpd %xmm4,%xmm0 jg ___dcox86_wl_3_ |
Unrolled loop. Note the optimizations performed by dco, such as the use of SIMD instructions and conditionally generated code that uses aligned memory access (e.g. mulpd -32(%ebx,%eax,8),%xmm5). |
___dcox86_cc_7_: addpd %xmm0,%xmm1 movhlps %xmm1,%xmm2 addsd %xmm2,%xmm1 subl $-3,%ecx jle ___dcox86_lu_2_ |
Exit from the unrolled loop. Code marked in this color checks whether any iterations (3 or fewer) are left to execute - note that %ecx contains the loop count; if no iterations are left, it jumps to ___dcox86_lu_2_. |
___dcox86_wl_4_: movsd (%ebx,%eax,8),%xmm0 addl $1,%eax mulsd -8(%esi,%eax,8),%xmm0 addl $-1,%ecx addsd %xmm0,%xmm1 jg ___dcox86_wl_4_ |
Non-unrolled loop - executed if iterations remain after the unrolled loop has run, or if it was established that the number of iterations in the loop is less than the unroll count. Note that this code is executed for a number of iterations less than the unroll count (3 or fewer). |
___dcox86_lu_2_: | Exit label. |
Kernel# | gcc 4.2.2 | dco | dcoU | dco/gcc | dcoU/gcc | dcoU/dco |
1 | 5.03 | 4.05 | 2.96 | 19.48% | 41.15% | 26.91% |
2 | 2.84 | 2.78 | 2.78 | 2.11% | 2.11% | 0.0% |
3 | 5.06 | 5.18 | 4.02 | -2.37% | 20.55% | 22.39% |
4 | 5.33 | 5.24 | 5.2 | 1.69% | 2.44% | 0.76% |
5 | 5.08 | 4.99 | 1.73 | 1.77% | 65.94% | 65.33% |
6 | 16.25 | 16.22 | 4.08 | 0.18% | 74.89% | 74.85% |
7 | 5.55 | 4.8 | 4.8 | 13.51% | 13.51% | 0.0% |
8 | 3.91 | 3.9 | 3.92 | 0.26% | -0.26% | -0.51% |
9 | 5.04 | 5.69 | 3.72 | -12.9% | 26.19% | 34.62% |
10 | 4.97 | 4.35 | 4.53 | 12.47% | 8.85% | -4.14% |
11 | 5.13 | 0.82 | 0.82 | 84.02% | 84.02% | 0.0% |
12 | 4.42 | 5.58 | 4.99 | -26.24% | -12.9% | 10.57% |
13 | 4.65 | 4.66 | 4.6 | -0.22% | 1.08% | 1.29% |
14 | 4.37 | 4.35 | 4.33 | 0.46% | 0.92% | 0.46% |
15 | 5.45 | 5.4 | 5.4 | 0.92% | 0.92% | 0.0% |
16 | 5.5 | 6.67 | 6.66 | -21.27% | -21.09% | 0.15% |
17 | 4.87 | 4.17 | 4.11 | 14.37% | 14.37% | 0.0% |
18 | 4.58 | 3.64 | 4.01 | 20.52% | 12.45% | -10.46% |
19 | 7.04 | 3.68 | 3.7 | 47.73% | 47.44% | -0.54% |
20 | 4.84 | 4.73 | 4.73 | 2.27% | 2.27% | 0.0% |
21 | 7.15 | 7.11 | 6.86 | 0.56% | 4.06% | 3.52% |
22 | 4.83 | 4.79 | 4.78 | 0.83% | 1.04% | 0.21% |
23 | 3.67 | 2.98 | 2.58 | 18.8% | 29.7% | 13.42% |
24 | 4.85 | 1.19 | 0.89 | 75.46% | 81.65% | 25.21% |
Geometric Mean | 5.13 | 4.25 | 3.62 | 17.15% | 29.56% | 14.98% |
Kernel# | gccU | gccU+dco | gcc+dcoU | gccU+dco/gccU | gcc+dcoU/gcc | gcc+dcoU/gccU+dco |
1 | 5.03 | 3.24 | 2.96 | 35.59% | 41.15% | 8.64% |
2 | 2.46 | 2.36 | 2.78 | 4.07% | 2.11% | -17.8% |
3 | 4.99 | 2.49 | 4.02 | 50.1% | 20.55% | -61.45% |
4 | 5.04 | 3.84 | 5.2 | 23.81% | 2.44% | -35.42% |
5 | 5.33 | 1.76 | 1.73 | 66.98% | 65.94% | 1.7% |
6 | 16.24 | 5.06 | 4.08 | 68.84% | 74.89% | 19.37% |
7 | 5.16 | 4.16 | 4.8 | 19.38% | 13.51% | -15.38% |
8 | 3.87 | 3.91 | 3.92 | -1.03% | -0.26% | -0.26% |
9 | 4.95 | 4.01 | 3.72 | 18.99% | 26.19% | 7.23% |
10 | 4.94 | 3.38 | 4.53 | 31.58% | 8.85% | -34.02% |
11 | 4.93 | 0.85 | 0.82 | 82.76% | 84.02% | 3.53% |
12 | 5.2 | 5.52 | 4.99 | -6.15% | -12.9% | 9.6% |
13 | 4.63 | 4.66 | 4.6 | -0.65% | 1.08% | 1.29% |
14 | 4.41 | 4.23 | 4.33 | 4.08% | 0.92% | -2.36% |
15 | 5.44 | 4.54 | 5.4 | 16.54% | 0.92% | -18.94% |
16 | 4.86 | 4.52 | 6.66 | 7.0% | -21.09% | -47.35% |
17 | 4.87 | 4.17 | 4.11 | 14.37% | 14.37% | 0.0% |
18 | 4.57 | 3.61 | 4.01 | 21.01% | 12.45% | -11.08% |
19 | 5.82 | 4.1 | 3.7 | 29.55% | 47.44% | 9.76% |
20 | 4.53 | 4.43 | 4.73 | 2.21% | 2.27% | -6.77% |
21 | 7.16 | 8.85 | 6.86 | -23.6% | 4.06% | 22.49% |
22 | 4.79 | 4.8 | 4.78 | -0.21% | 1.04% | 0.42% |
23 | 3.67 | 2.82 | 2.58 | 23.16% | 29.7% | 8.51% |
24 | 4.85 | 0.84 | 0.89 | 82.68% | 81.65% | -5.95% |
Geometric Mean | 5.02 | 3.44 | 3.62 | 31.47% | 29.56% | -5.12% |
 | gcc | -lu 1 | -lu 2 | -lu 3 | -lu 4 |
gcc | 2.9916 | 2.9975 | 1.7947 | 2.0497 | 1.7877 |
gccU | 2.9106 | 1.7927 | 1.3348 | 1.8267 | 1.3758 |