From mboxrd@z Thu Jan 1 00:00:00 1970 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on polar.synack.me X-Spam-Level: X-Spam-Status: No, score=-0.9 required=5.0 tests=BAYES_00,FORGED_GMAIL_RCVD, FREEMAIL_FROM autolearn=no autolearn_force=no version=3.4.4 X-Google-Thread: 103376,7767a311e01e1cd X-Google-Attributes: gid103376,public X-Google-Language: ENGLISH,ASCII-7-bit Path: g2news2.google.com!postnews.google.com!b28g2000cwb.googlegroups.com!not-for-mail From: "Alinabi" Newsgroups: comp.lang.ada Subject: Re: GNAT compiler switches and optimization Date: 22 Oct 2006 17:02:13 -0700 Organization: http://groups.google.com Message-ID: <1161561733.272140.281830@b28g2000cwb.googlegroups.com> References: <1161341264.471057.252750@h48g2000cwc.googlegroups.com> <9Qb_g.111857$aJ.65708@attbi_s21> <434o04-7g7.ln1@newserver.thecreems.com> <4539ce34$1_2@news.bluewin.ch> <453A532F.2070709@obry.net> <9kfq04-sgm.ln1@newserver.thecreems.com> <1161525012.997046.264780@k70g2000cwa.googlegroups.com> NNTP-Posting-Host: 68.235.169.69 Mime-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" X-Trace: posting.google.com 1161561741 7184 127.0.0.1 (23 Oct 2006 00:02:21 GMT) X-Complaints-To: groups-abuse@google.com NNTP-Posting-Date: Mon, 23 Oct 2006 00:02:21 +0000 (UTC) In-Reply-To: User-Agent: G2/1.0 X-HTTP-UserAgent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.0.7) Gecko/20060921 Ubuntu/dapper-security Firefox/1.5.0.7,gzip(gfe),gzip(gfe) Complaints-To: groups-abuse@google.com Injection-Info: b28g2000cwb.googlegroups.com; posting-host=68.235.169.69; posting-account=gD74RA0AAABm9rsBG7oeOmJ-iO5c3KUQ Xref: g2news2.google.com comp.lang.ada:7146 Date: 2006-10-22T17:02:13-07:00 List-Id: Ok, it appears that the score is Fortran: 13 instructions -- Ada: 17 instructions in the inner loop. 
Again, this is compiled with gcc 4.0.3 with the following switches: -g -march=opteron -mtune=opteron -mfpmath=sse -fomit-frame-pointer -O2 -fdump-tree-optimized Fortran inner loop (13 instructions): ============= 3d4: 48 63 c1 movslq %ecx,%rax 3d7: ff c1 inc %ecx 3d9: 48 89 c2 mov %rax,%rdx 3dc: 49 0f af c2 imul %r10,%rax 3e0: 48 0f af d3 imul %rbx,%rdx 3e4: 4c 01 c8 add %r9,%rax 3e7: 48 01 f0 add %rsi,%rax 3ea: 48 8d 14 17 lea (%rdi,%rdx,1),%rdx 3ee: 44 39 c1 cmp %r8d,%ecx 3f1: f3 41 0f 10 04 83 movss (%r11,%rax,4),%xmm0 3f7: f3 41 0f 59 04 94 mulss (%r12,%rdx,4),%xmm0 3fd: f3 0f 58 c8 addss %xmm0,%xmm1 401: 75 d1 jne 3d4 Ada inner loop (17 instructions): ============== a90: ff c6 inc %esi a92: 48 63 d6 movslq %esi,%rdx a95: 48 89 d0 mov %rdx,%rax a98: 48 29 e8 sub %rbp,%rax a9b: 4c 01 d8 add %r11,%rax a9e: f3 41 0f 10 0c 84 movss (%r12,%rax,4),%xmm1 aa4: 4c 89 c0 mov %r8,%rax aa7: 4c 29 d2 sub %r10,%rdx aaa: 45 31 c9 xor %r9d,%r9d aad: 48 83 c0 04 add $0x4,%rax ab1: 49 0f 48 c1 cmovs %r9,%rax ab5: 48 0f af d0 imul %rax,%rdx ab9: f3 0f 10 04 17 movss (%rdi,%rdx,1),%xmm0 abe: f3 0f 59 c1 mulss %xmm1,%xmm0 ac2: f3 0f 58 d0 addss %xmm0,%xmm2 ac6: 39 de cmp %ebx,%esi ac8: 75 c6 jne a90 <_ada_tst_array+0x3a0> Now, since Jeffrey was right about me not being an assembly guru, here is the assembly code for all three nested loops, so that you can double-check it yourself. 
Fortran: ======== do I = 1,N 2f2: 8b 84 24 ac 01 00 00 mov 0x1ac(%rsp),%eax 2f9: f3 0f 11 44 24 28 movss %xmm0,0x28(%rsp) 2ff: 85 c0 test %eax,%eax 301: 0f 8e 28 01 00 00 jle 42f do J = 1,N sum = 0.0 do R = 1,N sum = sum + A(I,R)*B(R,J) 307: 48 8b 8c 24 28 01 00 mov 0x128(%rsp),%rcx 30e: 00 30f: 48 8b 94 24 18 01 00 mov 0x118(%rsp),%rdx 316: 00 317: 44 8d 40 01 lea 0x1(%rax),%r8d 31b: 4c 8b a4 24 10 01 00 mov 0x110(%rsp),%r12 322: 00 323: 48 8b 9c 24 40 01 00 mov 0x140(%rsp),%rbx 32a: 00 32b: 4c 8b 9c 24 60 01 00 mov 0x160(%rsp),%r11 332: 00 333: 4c 8b 94 24 78 01 00 mov 0x178(%rsp),%r10 33a: 00 33b: 48 89 4c 24 10 mov %rcx,0x10(%rsp) 340: 48 89 54 24 18 mov %rdx,0x18(%rsp) 345: 48 8b 8c 24 90 01 00 mov 0x190(%rsp),%rcx 34c: 00 end do C(I,J) = sum 34d: 48 8b 94 24 c0 00 00 mov 0xc0(%rsp),%rdx 354: 00 355: 4c 8b 8c 24 68 01 00 mov 0x168(%rsp),%r9 35c: 00 35d: 4c 8b bc 24 f0 00 00 mov 0xf0(%rsp),%r15 364: 00 365: 0f 57 d2 xorps %xmm2,%xmm2 368: c7 44 24 30 01 00 00 movl $0x1,0x30(%rsp) 36f: 00 370: 48 89 4c 24 20 mov %rcx,0x20(%rsp) 375: 48 89 54 24 48 mov %rdx,0x48(%rsp) 37a: 48 8b 8c 24 d8 00 00 mov 0xd8(%rsp),%rcx 381: 00 382: 48 8b 94 24 c8 00 00 mov 0xc8(%rsp),%rdx 389: 00 38a: 48 89 4c 24 40 mov %rcx,0x40(%rsp) 38f: 48 89 54 24 38 mov %rdx,0x38(%rsp) 394: 48 63 44 24 30 movslq 0x30(%rsp),%rax 399: 48 8b 54 24 10 mov 0x10(%rsp),%rdx 39e: 41 bd 01 00 00 00 mov $0x1,%r13d 3a4: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx 3a9: 48 0f af d0 imul %rax,%rdx 3ad: 48 0f af 44 24 40 imul 0x40(%rsp),%rax 3b3: 48 8d 3c 0a lea (%rdx,%rcx,1),%rdi 3b7: 48 8b 54 24 38 mov 0x38(%rsp),%rdx 3bc: 4c 8d 34 10 lea (%rax,%rdx,1),%r14 3c0: 48 8b 74 24 20 mov 0x20(%rsp),%rsi 3c5: 49 63 ed movslq %r13d,%rbp 3c8: b9 01 00 00 00 mov $0x1,%ecx 3cd: 0f 28 ca movaps %xmm2,%xmm1 3d0: 48 0f af f5 imul %rbp,%rsi 3d4: 48 63 c1 movslq %ecx,%rax 3d7: ff c1 inc %ecx 3d9: 48 89 c2 mov %rax,%rdx 3dc: 49 0f af c2 imul %r10,%rax 3e0: 48 0f af d3 imul %rbx,%rdx 3e4: 4c 01 c8 add %r9,%rax 3e7: 48 01 f0 add 
%rsi,%rax 3ea: 48 8d 14 17 lea (%rdi,%rdx,1),%rdx 3ee: 44 39 c1 cmp %r8d,%ecx 3f1: f3 41 0f 10 04 83 movss (%r11,%rax,4),%xmm0 3f7: f3 41 0f 59 04 94 mulss (%r12,%rdx,4),%xmm0 3fd: f3 0f 58 c8 addss %xmm0,%xmm1 401: 75 d1 jne 3d4 403: 4c 89 f8 mov %r15,%rax 406: 48 8b 54 24 48 mov 0x48(%rsp),%rdx 40b: 41 ff c5 inc %r13d 40e: 48 0f af c5 imul %rbp,%rax 412: 41 39 cd cmp %ecx,%r13d 415: 49 8d 04 06 lea (%r14,%rax,1),%rax 419: f3 0f 11 0c 82 movss %xmm1,(%rdx,%rax,4) 41e: 75 a0 jne 3c0 420: ff 44 24 30 incl 0x30(%rsp) 424: 44 39 6c 24 30 cmp %r13d,0x30(%rsp) 429: 0f 85 65 ff ff ff jne 394 end do end do Ada: ============= for I in A'range(1) loop 9a5: 41 8b 45 00 mov 0x0(%r13),%eax 9a9: 41 8b 55 04 mov 0x4(%r13),%edx 9ad: 39 d0 cmp %edx,%eax 9af: 0f 8f 8a 01 00 00 jg b3f <_ada_tst_array+0x44f> for J in A'range(2) loop Sum := 0.0; for R in A'range(2) loop Sum := Sum + A(I,R)*B(R,J); 9b5: 4c 8b bc 24 d0 00 00 mov 0xd0(%rsp),%r15 9bc: 00 end loop; C(I,J) := Sum; 9bd: 48 8b 8c 24 c0 00 00 mov 0xc0(%rsp),%rcx 9c4: 00 9c5: 4c 8b a4 24 e0 00 00 mov 0xe0(%rsp),%r12 9cc: 00 9cd: 89 44 24 68 mov %eax,0x68(%rsp) 9d1: 4c 89 7c 24 50 mov %r15,0x50(%rsp) 9d6: 48 89 8c 24 88 00 00 mov %rcx,0x88(%rsp) 9dd: 00 9de: 45 8b 75 08 mov 0x8(%r13),%r14d 9e2: 45 8b 6d 0c mov 0xc(%r13),%r13d 9e6: 44 89 6c 24 6c mov %r13d,0x6c(%rsp) 9eb: 4c 63 7c 24 6c movslq 0x6c(%rsp),%r15 9f0: 48 63 f8 movslq %eax,%rdi 9f3: 0f 57 e4 xorps %xmm4,%xmm4 9f6: 48 89 7c 24 20 mov %rdi,0x20(%rsp) 9fb: 49 63 ee movslq %r14d,%rbp 9fe: 89 54 24 2c mov %edx,0x2c(%rsp) a02: 44 89 eb mov %r13d,%ebx a05: 4c 89 7c 24 30 mov %r15,0x30(%rsp) a0a: 44 3b 74 24 6c cmp 0x6c(%rsp),%r14d a0f: 0f 8f 17 01 00 00 jg b2c <_ada_tst_array+0x43c> a15: 48 8b 44 24 30 mov 0x30(%rsp),%rax a1a: ba 00 00 00 00 mov $0x0,%edx a1f: 45 89 f5 mov %r14d,%r13d a22: 0f 28 dc movaps %xmm4,%xmm3 a25: 48 8b 4c 24 40 mov 0x40(%rsp),%rcx a2a: 48 29 e8 sub %rbp,%rax a2d: 48 8d 04 85 04 00 00 lea 0x4(,%rax,4),%rax a34: 00 a35: 48 85 c0 test %rax,%rax a38: 48 
0f 48 c2 cmovs %rdx,%rax a3c: 48 63 54 24 68 movslq 0x68(%rsp),%rdx a41: 48 c1 f8 02 sar $0x2,%rax a45: 48 89 54 24 70 mov %rdx,0x70(%rsp) a4a: 48 2b 54 24 20 sub 0x20(%rsp),%rdx a4f: 49 89 d3 mov %rdx,%r11 a52: 4c 0f af d8 imul %rax,%r11 a56: 8b 01 mov (%rcx),%eax a58: 4c 63 d0 movslq %eax,%r10 a5b: 8b 41 0c mov 0xc(%rcx),%eax a5e: 48 98 cltq a60: 8b 51 08 mov 0x8(%rcx),%edx a63: 48 63 d2 movslq %edx,%rdx a66: 48 29 d0 sub %rdx,%rax a69: 48 89 14 24 mov %rdx,(%rsp) a6d: 4c 8d 04 85 00 00 00 lea 0x0(,%rax,4),%r8 a74: 00 a75: 49 63 cd movslq %r13d,%rcx a78: 4c 8b 7c 24 50 mov 0x50(%rsp),%r15 a7d: 44 89 f6 mov %r14d,%esi a80: 48 89 c8 mov %rcx,%rax a83: 48 2b 04 24 sub (%rsp),%rax a87: 0f 28 d3 movaps %xmm3,%xmm2 a8a: 49 8d 3c 87 lea (%r15,%rax,4),%rdi a8e: eb 02 jmp a92 <_ada_tst_array+0x3a2> a90: ff c6 inc %esi a92: 48 63 d6 movslq %esi,%rdx a95: 48 89 d0 mov %rdx,%rax a98: 48 29 e8 sub %rbp,%rax a9b: 4c 01 d8 add %r11,%rax a9e: f3 41 0f 10 0c 84 movss (%r12,%rax,4),%xmm1 aa4: 4c 89 c0 mov %r8,%rax aa7: 4c 29 d2 sub %r10,%rdx aaa: 45 31 c9 xor %r9d,%r9d aad: 48 83 c0 04 add $0x4,%rax ab1: 49 0f 48 c1 cmovs %r9,%rax ab5: 48 0f af d0 imul %rax,%rdx ab9: f3 0f 10 04 17 movss (%rdi,%rdx,1),%xmm0 abe: f3 0f 59 c1 mulss %xmm1,%xmm0 ac2: f3 0f 58 d0 addss %xmm0,%xmm2 ac6: 39 de cmp %ebx,%esi ac8: 75 c6 jne a90 <_ada_tst_array+0x3a0> aca: 48 8b 54 24 48 mov 0x48(%rsp),%rdx acf: 8b 02 mov (%rdx),%eax ad1: 48 63 d0 movslq %eax,%rdx ad4: 48 8b 7c 24 48 mov 0x48(%rsp),%rdi ad9: 8b 47 0c mov 0xc(%rdi),%eax adc: 48 63 f8 movslq %eax,%rdi adf: 4c 8b 7c 24 48 mov 0x48(%rsp),%r15 ae4: 41 8b 47 08 mov 0x8(%r15),%eax ae8: 48 98 cltq aea: 4c 8b 7c 24 70 mov 0x70(%rsp),%r15 aef: 48 29 c7 sub %rax,%rdi af2: 48 29 c1 sub %rax,%rcx af5: 48 8d 04 bd 04 00 00 lea 0x4(,%rdi,4),%rax afc: 00 afd: 49 29 d7 sub %rdx,%r15 b00: 48 85 c0 test %rax,%rax b03: 4c 89 fa mov %r15,%rdx b06: 49 0f 48 c1 cmovs %r9,%rax b0a: 48 0f af d0 imul %rax,%rdx b0e: 48 8b 84 24 88 00 00 mov 0x88(%rsp),%rax b15: 00 
b16: 48 8d 0c 88 lea (%rax,%rcx,4),%rcx b1a: f3 0f 11 14 11 movss %xmm2,(%rcx,%rdx,1) b1f: 41 39 f5 cmp %esi,%r13d b22: 74 08 je b2c <_ada_tst_array+0x43c> b24: 41 ff c5 inc %r13d b27: e9 49 ff ff ff jmpq a75 <_ada_tst_array+0x385> b2c: 8b 54 24 2c mov 0x2c(%rsp),%edx b30: 39 54 24 68 cmp %edx,0x68(%rsp) b34: 74 09 je b3f <_ada_tst_array+0x44f> b36: ff 44 24 68 incl 0x68(%rsp) b3a: e9 cb fe ff ff jmpq a0a <_ada_tst_array+0x31a> end loop; end loop; Jeffrey Creem wrote: > Alinabi wrote: > > I ran your test programs compiled with gcc 4.0.3 and the following > > optimizations: > > COMMON_FLAGS=-g -march=opteron -mtune=opteron -mfpmath=sse > > -fomit-frame-pointer -O2 -fdump-tree-optimized > > and I cannot reproduce the large differences in performance everyone > > else talks about. Here are the times I get: > > > > N Ada Fortran > > ==================== > > 64 0.002029 0.000000 > > 128 0.016321 0.016000 > > 256 0.214143 0.204013 > > 512 3.125888 3.124195 > > 800 6.374982 5.864366 > > 1024 34.10479 35.22620 > > 2048 277.3071 283.2417 > > > > That is interesting. The question then becomes: has FORTRAN improved on > the way to 4.2.0, or has Ada regressed? > > Try doing a make dis_all which should produce annotated assembly output. > The Ada version can be a little daunting in the way we have set up the > files since the generic instantiations at the top fill the .S files > (whoops, looks like I named them .dis) with the generic instance. > > Even if you are not an assembly guru, if you start from the bottom of > the files you can usually pretty quickly find that inner loop and > compare the number of statements.