LLVM 2.2 vs. gcc 4.2

by syoyo

LLVM 2.2 is just released.

http://www.llvm.org/

Benchmark again. Here is my old post: LLVM 2.0 & gcc 4.2

I ran Himeno benchmark with following compilers on my Core2 Intel Mac 2.16 GHz, Leopard 10.5.2

– gcc 4.0.1 (from Apple’s Xcode)
– gcc 4.2.1
– LLVM 2.2

The graph of observed performance for 3 compilers,

llvm22_bench_01.png

and the graph showing the performance for LLVM2.2 and LLVM2.0
(The value of LLVM 2.0 is taken from my old post)

llvm22_bench_02.png

Here are procedures and outputs

llvm-gcc & LLVM 2.2


Macintosh: $ llvm-gcc -c -emit-llvm -O3 -DSMALL himenoBMTxps.c -o himenoBMTxps.bc
Macintosh: $ lli himenoBMTxps.bc
...
 Gosa : 1.981140e-05
 MFLOPS measured : 1283.698606	cpu : 39.254658
 Score based on Pentium III 600MHz : 15.654861

gcc-4.2.1


Macintosh: $ gcc-4.2.1 -O3 -DSMALL himenoBMTxps.c -o himenoBMTxps.gcc42.out
Macintosh: $ ./himenoBMTxps.gcc42.out
...
 Gosa : 1.966431e-05
 MFLOPS measured : 946.659033	cpu : 53.317495
 Score based on Pentium III 600MHz : 11.544622

gcc4.0


Macintosh: $ gcc -O3 -DSMALL himenoBMTxps.c -o himenoBMTxps.gcc40.out
Macintosh: $ ./himenoBMTxps.gcc40.out
...
 Gosa : 8.331492e-05
 MFLOPS measured : 612.135986	cpu : 56.467345
 Score based on Pentium III 600MHz : 7.465073

LLVM2.2(mid-end and x86 back-end) is so nice,
+35% improvement compared to gcc 4.2 and
+10% improvement was observed against LLVM 2.0.

Investigation

Where do these performance differences come from?
In the Himeno benchmark, the inner-most loop in jacobi() consumes
99.7% of the whole computation time.

I disassembled executables of this part,
and found LLVM 2.2 emits very efficient x86 assembly.
It is composed of move, add/sub and mul instructions.
These 3 instructions can be executed in parallel on Core2 CPU.
(Core2 has independent load/store unit, additive fp ALU and multiply fp ALU)

Here’s a disassembled code of inner-most loop in jacobi(), from 3 compilers.

LLVM2.2 generated code.


1.8%    mulss    +21847984(%ebp), %xmm3
0.1%    movss    +46412(%ebp), %xmm0
0.9%    mulss    +21881008(%ebp), %xmm0
1.8%    addss    %xmm3, %xmm0
0.1%    movss    +4406612(%ebp), %xmm3
0.8%    mulss    +21847472(%ebp), %xmm3
0.0%    addss    %xmm3, %xmm0
1.8%    movss    +21881524(%ebp), %xmm3
1.1%    subss    +21880492(%ebp), %xmm3
0.2%    subss    +21814444(%ebp), %xmm3
0.3%    addss    +21813412(%ebp), %xmm3
1.9%    mulss    +8766828(%ebp), %xmm3
1.6%    addss    %xmm3, %xmm0
0.9%    movss    +21847988(%ebp), %xmm3
0.0%    subss    +21846956(%ebp), %xmm3
1.4%    subss    +21847980(%ebp), %xmm3
0.0%    addss    +21846948(%ebp), %xmm3
0.4%    mulss    +10946928(%ebp), %xmm3
2.0%    addss    %xmm3, %xmm0
3.0%    movss    +21881012(%ebp), %xmm3
0.0%    subss    +21813932(%ebp), %xmm3
0.0%    subss    +21881004(%ebp), %xmm3
0.0%    addss    +21813924(%ebp), %xmm3
1.9%    mulss    +13127028(%ebp), %xmm3
2.1%    addss    %xmm3, %xmm0
2.7%    movss    +15307148(%ebp), %xmm3
0.7%    mulss    +21813928(%ebp), %xmm3
0.8%    addss    %xmm3, %xmm0
3.1%    movss    +17487248(%ebp), %xmm3
0.7%    mulss    +21846952(%ebp), %xmm3
0.1%    addss    %xmm3, %xmm0
4.2%    movss    +19667348(%ebp), %xmm3
0.7%    mulss    +21847464(%ebp), %xmm3
0.0%    addss    %xmm3, %xmm0
4.6%    addss    +24027596(%ebp), %xmm0
6.1%    mulss    +6586712(%ebp), %xmm0
8.5%    movss    +21847468(%ebp), %xmm3
0.0%    subss    %xmm3, %xmm0
5.5%    mulss    +26207724(%ebp), %xmm0
8.0%    movaps   %xmm1, %xmm4
        mulss    %xmm0, %xmm4
7.8%    addss    %xmm4, %xmm3
4.6%    addss    +24027596(%ebp), %xmm0
6.1%    mulss    +6586712(%ebp), %xmm0
8.5%    movss    +21847468(%ebp), %xmm3
0.0%    subss    %xmm3, %xmm0
5.5%    mulss    +26207724(%ebp), %xmm0
8.0%    movaps   %xmm1, %xmm4
        mulss    %xmm0, %xmm4
7.8%    addss    %xmm4, %xmm3
5.2%    movss    %xmm3, +28387852(%ebp)
1.9%    mulss    %xmm0, %xmm0
        addss    %xmm2, %xmm0
0.2%    addl     $4, %ebp
        incl     %ebx
1.6%    cmpl     %ebx, %esi
        movaps   %xmm0, %xmm2
2.1%    jg       0x00002395 

gcc 4.2 generated code.


        mulss    +2180104(%ecx), %xmm2
1.7%    movss    (%esi, %edx, 4), %xmm0
1.1%    mulss    +4(%ecx), %xmm1
0.5%    movl     -104(%ebp), %esi
0.0%    movss    %xmm0, -20(%ebp)
0.0%    mulss    +4360204(%ecx), %xmm0
1.7%    addss    %xmm2, %xmm1
0.0%    addss    %xmm0, %xmm1
0.1%    movss    -4(%eax, %esi), %xmm0
0.4%    movl     -116(%ebp), %esi
1.1%    subss    -4(%eax, %esi), %xmm0
0.1%    movl     -100(%ebp), %esi
0.0%    subss    -4(%eax, %esi), %xmm0
0.2%    movl     -108(%ebp), %esi
1.1%    addss    -4(%eax, %esi), %xmm0
0.1%    movl     -112(%ebp), %esi
0.1%    mulss    +4(%edi), %xmm0
0.7%    movss    (%esi, %edx, 4), %xmm2
1.1%    movl     -120(%ebp), %esi
0.0%    addss    %xmm0, %xmm1
0.2%    movss    (%esi, %edx, 4), %xmm0
0.0%    movl     -36(%ebp), %esi
1.3%    movss    %xmm0, -16(%ebp)
0.0%    movaps   %xmm2, %xmm0
0.1%    movss    (%esi, %edx, 4), %xmm5
0.0%    movl     -32(%ebp), %esi
1.2%    subss    -16(%ebp), %xmm0
0.1%    movss    (%esi, %edx, 4), %xmm7
0.1%    addl     $1, %edx
0.0%    movl     -112(%ebp), %esi
1.4%    subss    -8(%eax, %esi), %xmm0
4.7%    movl     -120(%ebp), %esi
0.1%    addss    -8(%eax, %esi), %xmm0
5.8%    movl     -36(%ebp), %esi
        mulss    +2180104(%edi), %xmm0
5.3%    addss    %xmm0, %xmm1
4.0%    movaps   %xmm5, %xmm0
        subss    %xmm7, %xmm0
        subss    -8(%eax, %esi), %xmm0
0.0%    movl     -32(%ebp), %esi
1.3%    addss    -8(%eax, %esi), %xmm0
0.1%    movl     -152(%ebp), %esi
        mulss    +4360204(%edi), %xmm0
2.6%    addl     $4, %edi
0.2%    mulss    +4(%esi), %xmm4
0.7%    mulss    +2180104(%esi), %xmm3
0.7%    addss    %xmm0, %xmm1
3.2%    movss    +4360204(%esi), %xmm0
0.6%    movl     -44(%ebp), %esi
0.0%    addss    %xmm4, %xmm1
3.4%    mulss    -8(%eax, %esi), %xmm0
0.1%    addss    %xmm3, %xmm1
3.8%    movl     -132(%ebp), %esi
0.0%    addss    %xmm0, %xmm1
3.8%    addss    -4(%eax, %esi), %xmm1
5.4%    movl     -136(%ebp), %esi
        mulss    +6540304(%ecx), %xmm1
5.1%    addl     $4, %ecx
        subss    %xmm6, %xmm1
3.9%    mulss    -4(%eax, %esi), %xmm1
5.6%    movaps   %xmm1, %xmm0
2.6%    mulss    %xmm1, %xmm0
6.1%    addss    -64(%ebp), %xmm0
4.0%    movss    %xmm0, -64(%ebp)
1.3%    movl     -140(%ebp), %esi
        mulss    -96(%ebp), %xmm1
        addb     $4, -152(%ebp)
1.3%    cmpl     -92(%ebp), %edx
        addss    %xmm6, %xmm1
        movss    %xmm1, -4(%eax, %esi)
0.0%    jnz

gcc4.0 generated code.


0.8%    movl     -60(%ebp), %eax
        inc      -56(%ebp)
0.8%    movl     -56(%ebp), %esi
0.0%    movl     -56(%ebp), %edi
0.0%    shll     $7, %eax
        addl     -60(%ebp), %eax
0.8%    addl     $2, %esi
        movl     %esi, -32(%ebp)
0.0%    movl     -136(%ebp), %esi
0.0%    incl     %edi
0.9%    movl     %eax, -80(%ebp)
0.0%    movl     -80(%ebp), %edx
0.0%    imull    $8385, -64(%ebp), %eax
0.0%    movl     %edi, -28(%ebp)
0.8%    leal     -8(%esi), %ecx
        imull    $8385, -144(%ebp), %esi
0.0%    addl     %eax, %edx
        movl     %eax, -84(%ebp)
0.7%    movl     -80(%ebp), %eax
        movl     %edx, -88(%ebp)
0.0%    addl     %edi, %edx
        movl     %edx, -92(%ebp)
0.8%    movl     -92(%ebp), %edi
0.0%    movl     -136(%ebp), %esi
0.0%    incl     %edi
0.9%    movl     %eax, -80(%ebp)
0.0%    movl     -80(%ebp), %edx
0.0%    imull    $8385, -64(%ebp), %eax
0.0%    movl     %edi, -28(%ebp)
0.8%    leal     -8(%esi), %ecx
        movl     %eax, -104(%ebp)
0.1%    leal     -4(%edx), %eax
        movl     -104(%ebp), %edx
0.8%    movss    +4(%edi), %xmm1
0.3%    movl     -84(%ebp), %edi
0.0%    mulss    (%eax, %edx, 4), %xmm1
0.7%    movl     -20(%ebp), %edx
0.7%    shll     $7, %edx
        addl     -20(%ebp), %edx
0.0%    addl     %edx, %edi
0.2%    movl     %edi, -108(%ebp)
	movl     %edi, -112(%ebp)
0.0%	movl     -36(%ebp), %edi
0.3%	movss    +2180104(%edi), %xmm0
1.3%	movl     -112(%ebp), %edi
0.0%	mulss    (%eax, %edi, 4), %xmm0
2.1%	movl     -36(%ebp), %edi
0.1%	movl     -88(%ebp), %eax
0.1%	addl     -32(%ebp), %eax
0.0%	addss    %xmm0, %xmm1
2.1%	movss    +4360204(%edi), %xmm0
0.4%	movl     -48(%ebp), %edi
0.1%	mulss    (%ecx, %eax, 4), %xmm0
1.2%	movl     -128(%ebp), %eax
0.4%	shll     $7, %edi
0.0%	addl     -48(%ebp), %edi
	addss    %xmm0, %xmm1
2.0%	subl     $8, %eax
0.0%	movl     %eax, -116(%ebp)
	leal     (%edx, %esi), %eax
	addl     -28(%ebp), %eax
0.8%	leal     (%edi, %esi), %esi
	addl     -28(%ebp), %esi
0.0%	movss    (%ecx, %eax, 4), %xmm0
0.3%	subss    (%ecx, %esi, 4), %xmm0
2.6%	imull    $8385, -44(%ebp), %esi
0.0%	addl     %esi, %edx
	addl     -28(%ebp), %edx
0.0%	leal     (%edi, %esi), %eax
0.8%	addl     -28(%ebp), %eax
0.0%	subss    (%ecx, %edx, 4), %xmm0
3.5%	movl     -92(%ebp), %edx
0.0%	addl     -84(%ebp), %edi
0.0%	addss    (%ecx, %eax, 4), %xmm0
2.5%	movl     -116(%ebp), %eax
	mulss    (%eax, %edx, 4), %xmm0
3.7%	movl     -32(%ebp), %eax
0.0%	movl     -108(%ebp), %edx
	addl     -32(%ebp), %edx
0.0%	addss    %xmm0, %xmm1
2.7%	addl     %edi, %eax
0.0%	movl     %eax, -160(%ebp)
	movss    (%ecx, %edx, 4), %xmm0
0.0%	movl     -112(%ebp), %edx
0.9%	subss    (%ecx, %eax, 4), %xmm0
0.2%	subss    (%ecx, %edx, 4), %xmm0
0.9%	movl     -56(%ebp), %edx
	leal     (%edi, %edx), %eax
0.4%	movl     -116(%ebp), %edx
0.0%	addss    (%ecx, %eax, 4), %xmm0
1.8%	movl     -96(%ebp), %eax
	mulss    +2180100(%eax, %edx), %xmm0
3.5%	movl     -100(%ebp), %eax
0.0%	addl     -32(%ebp), %eax
	addss    %xmm0, %xmm1
2.4%	movl     %eax, -120(%ebp)
0.0%	movl     -80(%ebp), %edx
	addl     -28(%ebp), %edi
	addl     %esi, %edx
0.7%	movl     -32(%ebp), %esi
0.0%	leal     (%edx, %esi), %eax
0.0%	movl     -120(%ebp), %esi
0.0%	movss    (%ecx, %esi, 4), %xmm0
0.7%	movl     -56(%ebp), %esi
0.0%	subss    (%ecx, %eax, 4), %xmm0
0.3%	movl     -104(%ebp), %eax
	subss    (%ecx, %eax, 4), %xmm0
2.2%	leal     (%edx, %esi), %eax
0.0%	movl     -116(%ebp), %esi
	addl     -28(%ebp), %edx
0.0%	addss    (%ecx, %eax, 4), %xmm0
2.3%	movl     -96(%ebp), %eax
	mulss    +4360200(%eax, %esi), %xmm0
3.8%	movl     -132(%ebp), %esi
	addss    %xmm0, %xmm1
2.5%	leal     -8(%esi), %eax
	movl     -92(%ebp), %esi
	movss    (%eax, %esi, 4), %xmm0
0.2%	movl     -96(%ebp), %esi
0.7%	mulss    (%ecx, %edx, 4), %xmm0
0.1%	movl     -96(%ebp), %edx
	addss    %xmm0, %xmm1
1.5%	movss    +2180100(%edx, %eax), %xmm0
0.3%	movl     -88(%ebp), %edx
	addl     -56(%ebp), %edx
	mulss    (%ecx, %edi, 4), %xmm0
1.0%	movl     -92(%ebp), %edi
0.0%	addss    %xmm0, %xmm1
2.0%	movss    +4360200(%esi, %eax), %xmm0
0.3%	movl     -148(%ebp), %eax
	movl     -152(%ebp), %esi
0.0%	mulss    (%ecx, %edx, 4), %xmm0
2.3%	movl     -36(%ebp), %edx
0.0%	movl     -140(%ebp), %ecx
1.0%	addss    %xmm0, %xmm1
1.5%	addss    -8(%eax, %edi, 4), %xmm1
2.8%	mulss    +6540304(%edx), %xmm1
3.5%	addl     $4, %edx
	subss    %xmm2, %xmm1
2.4%	mulss    -8(%ecx, %edi, 4), %xmm1
3.5%	movaps   %xmm1, %xmm0
1.7%	mulss    %xmm1, %xmm0
4.1%	mulss    %xmm4, %xmm1
	addss    %xmm0, %xmm3
2.4%	addss    %xmm1, %xmm2
	movss    %xmm2, -8(%esi, %edi, 4)
0.0%	movl     -76(%ebp), %edi
0.0%	cmpl     %edi, -56(%ebp)
1.0%	movl     %edx, -36(%ebp)
	jnz

Definitely, gcc4.0 generated code is redundant.
This is why gcc4.0 is slow.

gcc4.2 generated code contains some redundant instructions, e.g. movl and addl.

x86 Core2 is an OoO (Out-of-Order) architecture, so fewer instructions do not necessarily mean faster execution in general.
But in this case (gcc4.2 vs. LLVM2.2), I think the execution time scales well with the number of instructions.

For more accurate investigation, we need instruction pipeline simulator for Core2 architecture,
but Intel does not provide it! , whereas ATI did.
So I cannot investigate more on the performance.

Advertisements