From: "Alinabi" <alexander.the.average@gmail.com>
Subject: Re: GNAT compiler switches and optimization
Date: 22 Oct 2006 17:02:13 -0700
Date: 2006-10-22T17:02:13-07:00 [thread overview]
Message-ID: <1161561733.272140.281830@b28g2000cwb.googlegroups.com> (raw)
In-Reply-To: <a1gs04-toe.ln1@newserver.thecreems.com>
Ok, it appears that the score is Fortran: 13 instructions -- Ada: 17
instructions in the inner loop.
Again, this is compiled with gcc 4.0.3 with the following switches:
-g -march=opteron -mtune=opteron -mfpmath=sse -fomit-frame-pointer -O2
-fdump-tree-optimized
Fortran inner loop (13 instructions):
=============
3d4: 48 63 c1 movslq %ecx,%rax
3d7: ff c1 inc %ecx
3d9: 48 89 c2 mov %rax,%rdx
3dc: 49 0f af c2 imul %r10,%rax
3e0: 48 0f af d3 imul %rbx,%rdx
3e4: 4c 01 c8 add %r9,%rax
3e7: 48 01 f0 add %rsi,%rax
3ea: 48 8d 14 17 lea (%rdi,%rdx,1),%rdx
3ee: 44 39 c1 cmp %r8d,%ecx
3f1: f3 41 0f 10 04 83 movss (%r11,%rax,4),%xmm0
3f7: f3 41 0f 59 04 94 mulss (%r12,%rdx,4),%xmm0
3fd: f3 0f 58 c8 addss %xmm0,%xmm1
401: 75 d1 jne 3d4 <MAIN__+0x3d4>
Ada inner loop (17 instructions):
==============
a90: ff c6 inc %esi
a92: 48 63 d6 movslq %esi,%rdx
a95: 48 89 d0 mov %rdx,%rax
a98: 48 29 e8 sub %rbp,%rax
a9b: 4c 01 d8 add %r11,%rax
a9e: f3 41 0f 10 0c 84 movss (%r12,%rax,4),%xmm1
aa4: 4c 89 c0 mov %r8,%rax
aa7: 4c 29 d2 sub %r10,%rdx
aaa: 45 31 c9 xor %r9d,%r9d
aad: 48 83 c0 04 add $0x4,%rax
ab1: 49 0f 48 c1 cmovs %r9,%rax
ab5: 48 0f af d0 imul %rax,%rdx
ab9: f3 0f 10 04 17 movss (%rdi,%rdx,1),%xmm0
abe: f3 0f 59 c1 mulss %xmm1,%xmm0
ac2: f3 0f 58 d0 addss %xmm0,%xmm2
ac6: 39 de cmp %ebx,%esi
ac8: 75 c6 jne a90 <_ada_tst_array+0x3a0>
Now, since Jeffrey was right about me not being an assembly guru, here
is the assembly code for all three nested loops, so that you can
double-check it yourselves.
Fortran:
========
do I = 1,N
2f2: 8b 84 24 ac 01 00 00 mov 0x1ac(%rsp),%eax
2f9: f3 0f 11 44 24 28 movss %xmm0,0x28(%rsp)
2ff: 85 c0 test %eax,%eax
301: 0f 8e 28 01 00 00 jle 42f <MAIN__+0x42f>
do J = 1,N
sum = 0.0
do R = 1,N
sum = sum + A(I,R)*B(R,J)
307: 48 8b 8c 24 28 01 00 mov 0x128(%rsp),%rcx
30e: 00
30f: 48 8b 94 24 18 01 00 mov 0x118(%rsp),%rdx
316: 00
317: 44 8d 40 01 lea 0x1(%rax),%r8d
31b: 4c 8b a4 24 10 01 00 mov 0x110(%rsp),%r12
322: 00
323: 48 8b 9c 24 40 01 00 mov 0x140(%rsp),%rbx
32a: 00
32b: 4c 8b 9c 24 60 01 00 mov 0x160(%rsp),%r11
332: 00
333: 4c 8b 94 24 78 01 00 mov 0x178(%rsp),%r10
33a: 00
33b: 48 89 4c 24 10 mov %rcx,0x10(%rsp)
340: 48 89 54 24 18 mov %rdx,0x18(%rsp)
345: 48 8b 8c 24 90 01 00 mov 0x190(%rsp),%rcx
34c: 00
end do
C(I,J) = sum
34d: 48 8b 94 24 c0 00 00 mov 0xc0(%rsp),%rdx
354: 00
355: 4c 8b 8c 24 68 01 00 mov 0x168(%rsp),%r9
35c: 00
35d: 4c 8b bc 24 f0 00 00 mov 0xf0(%rsp),%r15
364: 00
365: 0f 57 d2 xorps %xmm2,%xmm2
368: c7 44 24 30 01 00 00 movl $0x1,0x30(%rsp)
36f: 00
370: 48 89 4c 24 20 mov %rcx,0x20(%rsp)
375: 48 89 54 24 48 mov %rdx,0x48(%rsp)
37a: 48 8b 8c 24 d8 00 00 mov 0xd8(%rsp),%rcx
381: 00
382: 48 8b 94 24 c8 00 00 mov 0xc8(%rsp),%rdx
389: 00
38a: 48 89 4c 24 40 mov %rcx,0x40(%rsp)
38f: 48 89 54 24 38 mov %rdx,0x38(%rsp)
394: 48 63 44 24 30 movslq 0x30(%rsp),%rax
399: 48 8b 54 24 10 mov 0x10(%rsp),%rdx
39e: 41 bd 01 00 00 00 mov $0x1,%r13d
3a4: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
3a9: 48 0f af d0 imul %rax,%rdx
3ad: 48 0f af 44 24 40 imul 0x40(%rsp),%rax
3b3: 48 8d 3c 0a lea (%rdx,%rcx,1),%rdi
3b7: 48 8b 54 24 38 mov 0x38(%rsp),%rdx
3bc: 4c 8d 34 10 lea (%rax,%rdx,1),%r14
3c0: 48 8b 74 24 20 mov 0x20(%rsp),%rsi
3c5: 49 63 ed movslq %r13d,%rbp
3c8: b9 01 00 00 00 mov $0x1,%ecx
3cd: 0f 28 ca movaps %xmm2,%xmm1
3d0: 48 0f af f5 imul %rbp,%rsi
3d4: 48 63 c1 movslq %ecx,%rax
3d7: ff c1 inc %ecx
3d9: 48 89 c2 mov %rax,%rdx
3dc: 49 0f af c2 imul %r10,%rax
3e0: 48 0f af d3 imul %rbx,%rdx
3e4: 4c 01 c8 add %r9,%rax
3e7: 48 01 f0 add %rsi,%rax
3ea: 48 8d 14 17 lea (%rdi,%rdx,1),%rdx
3ee: 44 39 c1 cmp %r8d,%ecx
3f1: f3 41 0f 10 04 83 movss (%r11,%rax,4),%xmm0
3f7: f3 41 0f 59 04 94 mulss (%r12,%rdx,4),%xmm0
3fd: f3 0f 58 c8 addss %xmm0,%xmm1
401: 75 d1 jne 3d4 <MAIN__+0x3d4>
403: 4c 89 f8 mov %r15,%rax
406: 48 8b 54 24 48 mov 0x48(%rsp),%rdx
40b: 41 ff c5 inc %r13d
40e: 48 0f af c5 imul %rbp,%rax
412: 41 39 cd cmp %ecx,%r13d
415: 49 8d 04 06 lea (%r14,%rax,1),%rax
419: f3 0f 11 0c 82 movss %xmm1,(%rdx,%rax,4)
41e: 75 a0 jne 3c0 <MAIN__+0x3c0>
420: ff 44 24 30 incl 0x30(%rsp)
424: 44 39 6c 24 30 cmp %r13d,0x30(%rsp)
429: 0f 85 65 ff ff ff jne 394 <MAIN__+0x394>
end do
end do
Ada:
=============
for I in A'range(1) loop
9a5: 41 8b 45 00 mov 0x0(%r13),%eax
9a9: 41 8b 55 04 mov 0x4(%r13),%edx
9ad: 39 d0 cmp %edx,%eax
9af: 0f 8f 8a 01 00 00 jg b3f <_ada_tst_array+0x44f>
for J in A'range(2) loop
Sum := 0.0;
for R in A'range(2) loop
Sum := Sum + A(I,R)*B(R,J);
9b5: 4c 8b bc 24 d0 00 00 mov 0xd0(%rsp),%r15
9bc: 00
end loop;
C(I,J) := Sum;
9bd: 48 8b 8c 24 c0 00 00 mov 0xc0(%rsp),%rcx
9c4: 00
9c5: 4c 8b a4 24 e0 00 00 mov 0xe0(%rsp),%r12
9cc: 00
9cd: 89 44 24 68 mov %eax,0x68(%rsp)
9d1: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
9d6: 48 89 8c 24 88 00 00 mov %rcx,0x88(%rsp)
9dd: 00
9de: 45 8b 75 08 mov 0x8(%r13),%r14d
9e2: 45 8b 6d 0c mov 0xc(%r13),%r13d
9e6: 44 89 6c 24 6c mov %r13d,0x6c(%rsp)
9eb: 4c 63 7c 24 6c movslq 0x6c(%rsp),%r15
9f0: 48 63 f8 movslq %eax,%rdi
9f3: 0f 57 e4 xorps %xmm4,%xmm4
9f6: 48 89 7c 24 20 mov %rdi,0x20(%rsp)
9fb: 49 63 ee movslq %r14d,%rbp
9fe: 89 54 24 2c mov %edx,0x2c(%rsp)
a02: 44 89 eb mov %r13d,%ebx
a05: 4c 89 7c 24 30 mov %r15,0x30(%rsp)
a0a: 44 3b 74 24 6c cmp 0x6c(%rsp),%r14d
a0f: 0f 8f 17 01 00 00 jg b2c <_ada_tst_array+0x43c>
a15: 48 8b 44 24 30 mov 0x30(%rsp),%rax
a1a: ba 00 00 00 00 mov $0x0,%edx
a1f: 45 89 f5 mov %r14d,%r13d
a22: 0f 28 dc movaps %xmm4,%xmm3
a25: 48 8b 4c 24 40 mov 0x40(%rsp),%rcx
a2a: 48 29 e8 sub %rbp,%rax
a2d: 48 8d 04 85 04 00 00 lea 0x4(,%rax,4),%rax
a34: 00
a35: 48 85 c0 test %rax,%rax
a38: 48 0f 48 c2 cmovs %rdx,%rax
a3c: 48 63 54 24 68 movslq 0x68(%rsp),%rdx
a41: 48 c1 f8 02 sar $0x2,%rax
a45: 48 89 54 24 70 mov %rdx,0x70(%rsp)
a4a: 48 2b 54 24 20 sub 0x20(%rsp),%rdx
a4f: 49 89 d3 mov %rdx,%r11
a52: 4c 0f af d8 imul %rax,%r11
a56: 8b 01 mov (%rcx),%eax
a58: 4c 63 d0 movslq %eax,%r10
a5b: 8b 41 0c mov 0xc(%rcx),%eax
a5e: 48 98 cltq
a60: 8b 51 08 mov 0x8(%rcx),%edx
a63: 48 63 d2 movslq %edx,%rdx
a66: 48 29 d0 sub %rdx,%rax
a69: 48 89 14 24 mov %rdx,(%rsp)
a6d: 4c 8d 04 85 00 00 00 lea 0x0(,%rax,4),%r8
a74: 00
a75: 49 63 cd movslq %r13d,%rcx
a78: 4c 8b 7c 24 50 mov 0x50(%rsp),%r15
a7d: 44 89 f6 mov %r14d,%esi
a80: 48 89 c8 mov %rcx,%rax
a83: 48 2b 04 24 sub (%rsp),%rax
a87: 0f 28 d3 movaps %xmm3,%xmm2
a8a: 49 8d 3c 87 lea (%r15,%rax,4),%rdi
a8e: eb 02 jmp a92 <_ada_tst_array+0x3a2>
a90: ff c6 inc %esi
a92: 48 63 d6 movslq %esi,%rdx
a95: 48 89 d0 mov %rdx,%rax
a98: 48 29 e8 sub %rbp,%rax
a9b: 4c 01 d8 add %r11,%rax
a9e: f3 41 0f 10 0c 84 movss (%r12,%rax,4),%xmm1
aa4: 4c 89 c0 mov %r8,%rax
aa7: 4c 29 d2 sub %r10,%rdx
aaa: 45 31 c9 xor %r9d,%r9d
aad: 48 83 c0 04 add $0x4,%rax
ab1: 49 0f 48 c1 cmovs %r9,%rax
ab5: 48 0f af d0 imul %rax,%rdx
ab9: f3 0f 10 04 17 movss (%rdi,%rdx,1),%xmm0
abe: f3 0f 59 c1 mulss %xmm1,%xmm0
ac2: f3 0f 58 d0 addss %xmm0,%xmm2
ac6: 39 de cmp %ebx,%esi
ac8: 75 c6 jne a90 <_ada_tst_array+0x3a0>
aca: 48 8b 54 24 48 mov 0x48(%rsp),%rdx
acf: 8b 02 mov (%rdx),%eax
ad1: 48 63 d0 movslq %eax,%rdx
ad4: 48 8b 7c 24 48 mov 0x48(%rsp),%rdi
ad9: 8b 47 0c mov 0xc(%rdi),%eax
adc: 48 63 f8 movslq %eax,%rdi
adf: 4c 8b 7c 24 48 mov 0x48(%rsp),%r15
ae4: 41 8b 47 08 mov 0x8(%r15),%eax
ae8: 48 98 cltq
aea: 4c 8b 7c 24 70 mov 0x70(%rsp),%r15
aef: 48 29 c7 sub %rax,%rdi
af2: 48 29 c1 sub %rax,%rcx
af5: 48 8d 04 bd 04 00 00 lea 0x4(,%rdi,4),%rax
afc: 00
afd: 49 29 d7 sub %rdx,%r15
b00: 48 85 c0 test %rax,%rax
b03: 4c 89 fa mov %r15,%rdx
b06: 49 0f 48 c1 cmovs %r9,%rax
b0a: 48 0f af d0 imul %rax,%rdx
b0e: 48 8b 84 24 88 00 00 mov 0x88(%rsp),%rax
b15: 00
b16: 48 8d 0c 88 lea (%rax,%rcx,4),%rcx
b1a: f3 0f 11 14 11 movss %xmm2,(%rcx,%rdx,1)
b1f: 41 39 f5 cmp %esi,%r13d
b22: 74 08 je b2c <_ada_tst_array+0x43c>
b24: 41 ff c5 inc %r13d
b27: e9 49 ff ff ff jmpq a75 <_ada_tst_array+0x385>
b2c: 8b 54 24 2c mov 0x2c(%rsp),%edx
b30: 39 54 24 68 cmp %edx,0x68(%rsp)
b34: 74 09 je b3f <_ada_tst_array+0x44f>
b36: ff 44 24 68 incl 0x68(%rsp)
b3a: e9 cb fe ff ff jmpq a0a <_ada_tst_array+0x31a>
end loop;
end loop;
Jeffrey Creem wrote:
> Alinabi wrote:
> > I ran your test programs compiled with gcc 4.0.3 and the following
> > optimizations:
> > COMMON_FLAGS=-g -march=opteron -mtune=opteron -mfpmath=sse
> > -fomit-frame-pointer -O2 -fdump-tree-optimized
> > and I cannot reproduce the large differences in performance everyone
> > else talks about. Here are the times I get:
> >
> > N Ada Fortran
> > ====================
> > 64 0.002029 0.000000
> > 128 0.016321 0.016000
> > 256 0.214143 0.204013
> > 512 3.125888 3.124195
> > 800 6.374982 5.864366
> > 1024 34.10479 35.22620
> > 2048 277.3071 283.2417
> >
>
> That is interesting. The question then becomes: has FORTRAN improved on
> the way to 4.2.0, or has Ada regressed?
>
> Try doing a make dis_all which should produce annotated assembly output.
> The Ada version can be a little daunting in the way we have set up the
> files since the generic instantiations at the top fill the .S files
> (whoops, looked like I named them .dis) with the generic instance.
>
> Even if you are not an assembly guru, if you start from the bottom of
> the files you can usually pretty quickly find that inner loop and
> compare the number of statements.
next prev parent reply other threads:[~2006-10-23 0:02 UTC|newest]
Thread overview: 68+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-10-20 10:47 GNAT compiler switches and optimization tkrauss
2006-10-20 11:04 ` Duncan Sands
2006-10-21 10:45 ` Stephen Leake
2006-10-20 11:42 ` Duncan Sands
2006-10-20 15:41 ` Martin Krischik
2006-10-20 12:09 ` Samuel Tardieu
2006-10-20 12:18 ` Samuel Tardieu
2006-10-20 12:12 ` Gautier
2006-10-20 12:35 ` Dmitry A. Kazakov
2006-10-20 15:53 ` Martin Krischik
2006-10-20 12:52 ` Gautier
2006-10-20 13:27 ` claude.simon
2006-10-20 15:38 ` Robert A Duff
2006-10-20 19:32 ` Gautier
2006-10-20 15:56 ` Jeffrey Creem
2006-10-20 16:30 ` Martin Krischik
2006-10-20 19:51 ` Gautier
2006-10-20 22:11 ` Jeffrey R. Carter
2006-10-20 23:52 ` Jeffrey Creem
2006-10-21 7:37 ` Gautier
2006-10-21 16:35 ` Jeffrey Creem
2006-10-21 17:04 ` Pascal Obry
2006-10-21 21:22 ` Jeffrey Creem
2006-10-22 3:03 ` Jeffrey Creem
2006-10-22 7:39 ` Jeffrey R. Carter
2006-10-22 11:48 ` tkrauss
2006-10-22 18:02 ` Georg Bauhaus
2006-10-22 18:24 ` Jeffrey Creem
2006-10-23 0:10 ` Georg Bauhaus
2006-10-22 20:20 ` Jeffrey R. Carter
2006-10-22 12:31 ` Gautier
2006-10-22 20:26 ` Jeffrey R. Carter
2006-10-22 21:22 ` Gautier
2006-10-22 18:01 ` tmoran
2006-10-22 20:54 ` Jeffrey R. Carter
2006-10-22 13:50 ` Alinabi
2006-10-22 15:41 ` Jeffrey Creem
2006-10-23 0:02 ` Alinabi [this message]
2006-10-23 5:28 ` Gautier
2006-10-23 16:32 ` Alinabi
2006-10-22 15:57 ` Jeffrey Creem
2006-10-22 19:32 ` Damien Carbonne
2006-10-22 20:00 ` Gautier
2006-10-22 20:51 ` Damien Carbonne
2006-10-23 2:15 ` Jeffrey Creem
2006-10-23 2:29 ` Jeffrey R. Carter
2006-10-23 1:31 ` Jeffrey Creem
2006-10-23 3:10 ` Jeffrey Creem
2006-10-23 7:31 ` Jeffrey R. Carter
2006-10-23 11:55 ` Jeffrey Creem
2006-10-23 19:52 ` Wiljan Derks
2006-10-23 20:25 ` Jeffrey R. Carter
2006-10-24 9:52 ` Dr. Adrian Wrigley
2006-10-24 11:50 ` Jeffrey Creem
2006-10-24 16:24 ` Jeffrey R. Carter
2006-10-25 3:50 ` Jeffrey Creem
2006-10-25 15:32 ` claude.simon
2006-10-24 19:21 ` Wiljan Derks
2006-10-23 12:33 ` Warner BRUNS
2006-10-23 12:40 ` Warner BRUNS
2006-10-23 13:52 ` Georg Bauhaus
2006-10-23 17:11 ` Warner BRUNS
2006-10-23 17:57 ` Dr. Adrian Wrigley
2006-10-23 15:02 ` Robert A Duff
2006-10-23 20:22 ` Jeffrey R. Carter
2006-10-21 18:28 ` tmoran
2006-10-23 6:28 ` Martin Krischik
2006-10-21 12:39 ` Dr. Adrian Wrigley
replies disabled
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox