From mboxrd@z Thu Jan  1 00:00:00 1970
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on polar.synack.me
X-Spam-Level: 
X-Spam-Status: No, score=-0.9 required=5.0 tests=BAYES_00,FORGED_GMAIL_RCVD,
	FREEMAIL_FROM autolearn=no autolearn_force=no version=3.4.4
X-Google-Thread: 103376,7767a311e01e1cd
X-Google-Attributes: gid103376,public
X-Google-Language: ENGLISH,ASCII-7-bit
Path: 
 g2news2.google.com!postnews.google.com!b28g2000cwb.googlegroups.com!not-for-mail
From: "Alinabi" <alexander.the.average@gmail.com>
Newsgroups: comp.lang.ada
Subject: Re: GNAT compiler switches and optimization
Date: 22 Oct 2006 17:02:13 -0700
Organization: http://groups.google.com
Message-ID: <1161561733.272140.281830@b28g2000cwb.googlegroups.com>
References: <1161341264.471057.252750@h48g2000cwc.googlegroups.com>
   <9Qb_g.111857$aJ.65708@attbi_s21>
   <434o04-7g7.ln1@newserver.thecreems.com>
   <4539ce34$1_2@news.bluewin.ch>
   <nrup04-5hj.ln1@newserver.thecreems.com>
   <453A532F.2070709@obry.net>
   <9kfq04-sgm.ln1@newserver.thecreems.com>
   <sj3r04-rlv.ln1@newserver.thecreems.com>
   <1161525012.997046.264780@k70g2000cwa.googlegroups.com>
   <a1gs04-toe.ln1@newserver.thecreems.com>
NNTP-Posting-Host: 68.235.169.69
Mime-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
X-Trace: posting.google.com 1161561741 7184 127.0.0.1 (23 Oct 2006 00:02:21
 GMT)
X-Complaints-To: groups-abuse@google.com
NNTP-Posting-Date: Mon, 23 Oct 2006 00:02:21 +0000 (UTC)
In-Reply-To: <a1gs04-toe.ln1@newserver.thecreems.com>
User-Agent: G2/1.0
X-HTTP-UserAgent: Mozilla/5.0 (X11; U; Linux x86_64; en-US;
 rv:1.8.0.7) Gecko/20060921 Ubuntu/dapper-security
 Firefox/1.5.0.7,gzip(gfe),gzip(gfe)
Complaints-To: groups-abuse@google.com
Injection-Info: b28g2000cwb.googlegroups.com; posting-host=68.235.169.69;
   posting-account=gD74RA0AAABm9rsBG7oeOmJ-iO5c3KUQ
Xref: g2news2.google.com comp.lang.ada:7146
Date: 2006-10-22T17:02:13-07:00
List-Id: <comp.lang.ada>

Ok, it appears that the score is Fortran: 13 instructions -- Ada: 17
instructions in the inner loop.
Again, this is compiled with gcc 4.0.3 with the following switches:
-g -march=opteron -mtune=opteron -mfpmath=sse -fomit-frame-pointer -O2
-fdump-tree-optimized


Fortran inner loop (13 instructions):
=============
 3d4:   48 63 c1                movslq %ecx,%rax
 3d7:   ff c1                   inc    %ecx
 3d9:   48 89 c2                mov    %rax,%rdx
 3dc:   49 0f af c2             imul   %r10,%rax
 3e0:   48 0f af d3             imul   %rbx,%rdx
 3e4:   4c 01 c8                add    %r9,%rax
 3e7:   48 01 f0                add    %rsi,%rax
 3ea:   48 8d 14 17             lea    (%rdi,%rdx,1),%rdx
 3ee:   44 39 c1                cmp    %r8d,%ecx
 3f1:   f3 41 0f 10 04 83       movss  (%r11,%rax,4),%xmm0
 3f7:   f3 41 0f 59 04 94       mulss  (%r12,%rdx,4),%xmm0
 3fd:   f3 0f 58 c8             addss  %xmm0,%xmm1
 401:   75 d1                   jne    3d4 <MAIN__+0x3d4>

Ada inner loop (17 instructions):
==============
 a90:   ff c6                   inc    %esi
 a92:   48 63 d6                movslq %esi,%rdx
 a95:   48 89 d0                mov    %rdx,%rax
 a98:   48 29 e8                sub    %rbp,%rax
 a9b:   4c 01 d8                add    %r11,%rax
 a9e:   f3 41 0f 10 0c 84       movss  (%r12,%rax,4),%xmm1
 aa4:   4c 89 c0                mov    %r8,%rax
 aa7:   4c 29 d2                sub    %r10,%rdx
 aaa:   45 31 c9                xor    %r9d,%r9d
 aad:   48 83 c0 04             add    $0x4,%rax
 ab1:   49 0f 48 c1             cmovs  %r9,%rax
 ab5:   48 0f af d0             imul   %rax,%rdx
 ab9:   f3 0f 10 04 17          movss  (%rdi,%rdx,1),%xmm0
 abe:   f3 0f 59 c1             mulss  %xmm1,%xmm0
 ac2:   f3 0f 58 d0             addss  %xmm0,%xmm2
 ac6:   39 de                   cmp    %ebx,%esi
 ac8:   75 c6                   jne    a90 <_ada_tst_array+0x3a0>

Now, since Jeffrey was right about me not being an assembly guru, here
is the assembly code for all three nested loops, so that you can
double-check it yourselves.

Fortran:
========
   do I = 1,N
 2f2:   8b 84 24 ac 01 00 00    mov    0x1ac(%rsp),%eax
 2f9:   f3 0f 11 44 24 28       movss  %xmm0,0x28(%rsp)
 2ff:   85 c0                   test   %eax,%eax
 301:   0f 8e 28 01 00 00       jle    42f <MAIN__+0x42f>
      do J = 1,N
         sum = 0.0
         do R = 1,N
            sum = sum + A(I,R)*B(R,J)
 307:   48 8b 8c 24 28 01 00    mov    0x128(%rsp),%rcx
 30e:   00
 30f:   48 8b 94 24 18 01 00    mov    0x118(%rsp),%rdx
 316:   00
 317:   44 8d 40 01             lea    0x1(%rax),%r8d
 31b:   4c 8b a4 24 10 01 00    mov    0x110(%rsp),%r12
 322:   00
 323:   48 8b 9c 24 40 01 00    mov    0x140(%rsp),%rbx
 32a:   00
 32b:   4c 8b 9c 24 60 01 00    mov    0x160(%rsp),%r11
 332:   00
 333:   4c 8b 94 24 78 01 00    mov    0x178(%rsp),%r10
 33a:   00
 33b:   48 89 4c 24 10          mov    %rcx,0x10(%rsp)
 340:   48 89 54 24 18          mov    %rdx,0x18(%rsp)
 345:   48 8b 8c 24 90 01 00    mov    0x190(%rsp),%rcx
 34c:   00
         end do
         C(I,J) = sum
 34d:   48 8b 94 24 c0 00 00    mov    0xc0(%rsp),%rdx
 354:   00
 355:   4c 8b 8c 24 68 01 00    mov    0x168(%rsp),%r9
 35c:   00
 35d:   4c 8b bc 24 f0 00 00    mov    0xf0(%rsp),%r15
 364:   00
 365:   0f 57 d2                xorps  %xmm2,%xmm2
 368:   c7 44 24 30 01 00 00    movl   $0x1,0x30(%rsp)
 36f:   00
 370:   48 89 4c 24 20          mov    %rcx,0x20(%rsp)
 375:   48 89 54 24 48          mov    %rdx,0x48(%rsp)
 37a:   48 8b 8c 24 d8 00 00    mov    0xd8(%rsp),%rcx
 381:   00
 382:   48 8b 94 24 c8 00 00    mov    0xc8(%rsp),%rdx
 389:   00
 38a:   48 89 4c 24 40          mov    %rcx,0x40(%rsp)
 38f:   48 89 54 24 38          mov    %rdx,0x38(%rsp)
 394:   48 63 44 24 30          movslq 0x30(%rsp),%rax
 399:   48 8b 54 24 10          mov    0x10(%rsp),%rdx
 39e:   41 bd 01 00 00 00       mov    $0x1,%r13d
 3a4:   48 8b 4c 24 18          mov    0x18(%rsp),%rcx
 3a9:   48 0f af d0             imul   %rax,%rdx
 3ad:   48 0f af 44 24 40       imul   0x40(%rsp),%rax
 3b3:   48 8d 3c 0a             lea    (%rdx,%rcx,1),%rdi
 3b7:   48 8b 54 24 38          mov    0x38(%rsp),%rdx
 3bc:   4c 8d 34 10             lea    (%rax,%rdx,1),%r14
 3c0:   48 8b 74 24 20          mov    0x20(%rsp),%rsi
 3c5:   49 63 ed                movslq %r13d,%rbp
 3c8:   b9 01 00 00 00          mov    $0x1,%ecx
 3cd:   0f 28 ca                movaps %xmm2,%xmm1
 3d0:   48 0f af f5             imul   %rbp,%rsi
 3d4:   48 63 c1                movslq %ecx,%rax
 3d7:   ff c1                   inc    %ecx
 3d9:   48 89 c2                mov    %rax,%rdx
 3dc:   49 0f af c2             imul   %r10,%rax
 3e0:   48 0f af d3             imul   %rbx,%rdx
 3e4:   4c 01 c8                add    %r9,%rax
 3e7:   48 01 f0                add    %rsi,%rax
 3ea:   48 8d 14 17             lea    (%rdi,%rdx,1),%rdx
 3ee:   44 39 c1                cmp    %r8d,%ecx
 3f1:   f3 41 0f 10 04 83       movss  (%r11,%rax,4),%xmm0
 3f7:   f3 41 0f 59 04 94       mulss  (%r12,%rdx,4),%xmm0
 3fd:   f3 0f 58 c8             addss  %xmm0,%xmm1
 401:   75 d1                   jne    3d4 <MAIN__+0x3d4>
 403:   4c 89 f8                mov    %r15,%rax
 406:   48 8b 54 24 48          mov    0x48(%rsp),%rdx
 40b:   41 ff c5                inc    %r13d
 40e:   48 0f af c5             imul   %rbp,%rax
 412:   41 39 cd                cmp    %ecx,%r13d
 415:   49 8d 04 06             lea    (%r14,%rax,1),%rax
 419:   f3 0f 11 0c 82          movss  %xmm1,(%rdx,%rax,4)
 41e:   75 a0                   jne    3c0 <MAIN__+0x3c0>
 420:   ff 44 24 30             incl   0x30(%rsp)
 424:   44 39 6c 24 30          cmp    %r13d,0x30(%rsp)
 429:   0f 85 65 ff ff ff       jne    394 <MAIN__+0x394>
      end do
   end do


Ada:
=============
    for I in A'range(1) loop
 9a5:   41 8b 45 00             mov    0x0(%r13),%eax
 9a9:   41 8b 55 04             mov    0x4(%r13),%edx
 9ad:   39 d0                   cmp    %edx,%eax
 9af:   0f 8f 8a 01 00 00       jg     b3f <_ada_tst_array+0x44f>
      for J in A'range(2) loop
         Sum := 0.0;
         for R in A'range(2) loop
            Sum := Sum + A(I,R)*B(R,J);
 9b5:   4c 8b bc 24 d0 00 00    mov    0xd0(%rsp),%r15
 9bc:   00
         end loop;
         C(I,J) := Sum;
 9bd:   48 8b 8c 24 c0 00 00    mov    0xc0(%rsp),%rcx
 9c4:   00
 9c5:   4c 8b a4 24 e0 00 00    mov    0xe0(%rsp),%r12
 9cc:   00
 9cd:   89 44 24 68             mov    %eax,0x68(%rsp)
 9d1:   4c 89 7c 24 50          mov    %r15,0x50(%rsp)
 9d6:   48 89 8c 24 88 00 00    mov    %rcx,0x88(%rsp)
 9dd:   00
 9de:   45 8b 75 08             mov    0x8(%r13),%r14d
 9e2:   45 8b 6d 0c             mov    0xc(%r13),%r13d
 9e6:   44 89 6c 24 6c          mov    %r13d,0x6c(%rsp)
 9eb:   4c 63 7c 24 6c          movslq 0x6c(%rsp),%r15
 9f0:   48 63 f8                movslq %eax,%rdi
 9f3:   0f 57 e4                xorps  %xmm4,%xmm4
 9f6:   48 89 7c 24 20          mov    %rdi,0x20(%rsp)
 9fb:   49 63 ee                movslq %r14d,%rbp
 9fe:   89 54 24 2c             mov    %edx,0x2c(%rsp)
 a02:   44 89 eb                mov    %r13d,%ebx
 a05:   4c 89 7c 24 30          mov    %r15,0x30(%rsp)
 a0a:   44 3b 74 24 6c          cmp    0x6c(%rsp),%r14d
 a0f:   0f 8f 17 01 00 00       jg     b2c <_ada_tst_array+0x43c>
 a15:   48 8b 44 24 30          mov    0x30(%rsp),%rax
 a1a:   ba 00 00 00 00          mov    $0x0,%edx
 a1f:   45 89 f5                mov    %r14d,%r13d
 a22:   0f 28 dc                movaps %xmm4,%xmm3
 a25:   48 8b 4c 24 40          mov    0x40(%rsp),%rcx
 a2a:   48 29 e8                sub    %rbp,%rax
 a2d:   48 8d 04 85 04 00 00    lea    0x4(,%rax,4),%rax
 a34:   00
 a35:   48 85 c0                test   %rax,%rax
 a38:   48 0f 48 c2             cmovs  %rdx,%rax
 a3c:   48 63 54 24 68          movslq 0x68(%rsp),%rdx
 a41:   48 c1 f8 02             sar    $0x2,%rax
 a45:   48 89 54 24 70          mov    %rdx,0x70(%rsp)
 a4a:   48 2b 54 24 20          sub    0x20(%rsp),%rdx
 a4f:   49 89 d3                mov    %rdx,%r11
 a52:   4c 0f af d8             imul   %rax,%r11
 a56:   8b 01                   mov    (%rcx),%eax
 a58:   4c 63 d0                movslq %eax,%r10
 a5b:   8b 41 0c                mov    0xc(%rcx),%eax
 a5e:   48 98                   cltq
 a60:   8b 51 08                mov    0x8(%rcx),%edx
 a63:   48 63 d2                movslq %edx,%rdx
 a66:   48 29 d0                sub    %rdx,%rax
 a69:   48 89 14 24             mov    %rdx,(%rsp)
 a6d:   4c 8d 04 85 00 00 00    lea    0x0(,%rax,4),%r8
 a74:   00
 a75:   49 63 cd                movslq %r13d,%rcx
 a78:   4c 8b 7c 24 50          mov    0x50(%rsp),%r15
 a7d:   44 89 f6                mov    %r14d,%esi
 a80:   48 89 c8                mov    %rcx,%rax
 a83:   48 2b 04 24             sub    (%rsp),%rax
 a87:   0f 28 d3                movaps %xmm3,%xmm2
 a8a:   49 8d 3c 87             lea    (%r15,%rax,4),%rdi
 a8e:   eb 02                   jmp    a92 <_ada_tst_array+0x3a2>
 a90:   ff c6                   inc    %esi
 a92:   48 63 d6                movslq %esi,%rdx
 a95:   48 89 d0                mov    %rdx,%rax
 a98:   48 29 e8                sub    %rbp,%rax
 a9b:   4c 01 d8                add    %r11,%rax
 a9e:   f3 41 0f 10 0c 84       movss  (%r12,%rax,4),%xmm1
 aa4:   4c 89 c0                mov    %r8,%rax
 aa7:   4c 29 d2                sub    %r10,%rdx
 aaa:   45 31 c9                xor    %r9d,%r9d
 aad:   48 83 c0 04             add    $0x4,%rax
 ab1:   49 0f 48 c1             cmovs  %r9,%rax
 ab5:   48 0f af d0             imul   %rax,%rdx
 ab9:   f3 0f 10 04 17          movss  (%rdi,%rdx,1),%xmm0
 abe:   f3 0f 59 c1             mulss  %xmm1,%xmm0
 ac2:   f3 0f 58 d0             addss  %xmm0,%xmm2
 ac6:   39 de                   cmp    %ebx,%esi
 ac8:   75 c6                   jne    a90 <_ada_tst_array+0x3a0>
 aca:   48 8b 54 24 48          mov    0x48(%rsp),%rdx
 acf:   8b 02                   mov    (%rdx),%eax
 ad1:   48 63 d0                movslq %eax,%rdx
 ad4:   48 8b 7c 24 48          mov    0x48(%rsp),%rdi
 ad9:   8b 47 0c                mov    0xc(%rdi),%eax
 adc:   48 63 f8                movslq %eax,%rdi
 adf:   4c 8b 7c 24 48          mov    0x48(%rsp),%r15
 ae4:   41 8b 47 08             mov    0x8(%r15),%eax
 ae8:   48 98                   cltq
 aea:   4c 8b 7c 24 70          mov    0x70(%rsp),%r15
 aef:   48 29 c7                sub    %rax,%rdi
 af2:   48 29 c1                sub    %rax,%rcx
 af5:   48 8d 04 bd 04 00 00    lea    0x4(,%rdi,4),%rax
 afc:   00
 afd:   49 29 d7                sub    %rdx,%r15
 b00:   48 85 c0                test   %rax,%rax
 b03:   4c 89 fa                mov    %r15,%rdx
 b06:   49 0f 48 c1             cmovs  %r9,%rax
 b0a:   48 0f af d0             imul   %rax,%rdx
 b0e:   48 8b 84 24 88 00 00    mov    0x88(%rsp),%rax
 b15:   00
 b16:   48 8d 0c 88             lea    (%rax,%rcx,4),%rcx
 b1a:   f3 0f 11 14 11          movss  %xmm2,(%rcx,%rdx,1)
 b1f:   41 39 f5                cmp    %esi,%r13d
 b22:   74 08                   je     b2c <_ada_tst_array+0x43c>
 b24:   41 ff c5                inc    %r13d
 b27:   e9 49 ff ff ff          jmpq   a75 <_ada_tst_array+0x385>
 b2c:   8b 54 24 2c             mov    0x2c(%rsp),%edx
 b30:   39 54 24 68             cmp    %edx,0x68(%rsp)
 b34:   74 09                   je     b3f <_ada_tst_array+0x44f>
 b36:   ff 44 24 68             incl   0x68(%rsp)
 b3a:   e9 cb fe ff ff          jmpq   a0a <_ada_tst_array+0x31a>
      end loop;
   end loop;


Jeffrey Creem wrote:
> Alinabi wrote:
> > I ran your test programs compiled with gcc 4.0.3 and the following
> > optimizations:
> > COMMON_FLAGS=-g -march=opteron -mtune=opteron -mfpmath=sse
> > -fomit-frame-pointer -O2 -fdump-tree-optimized
> > and I cannot reproduce the large differences in performance everyone
> > else talks about. Here are the times I get:
> >
> > N        Ada            Fortran
> > ====================
> > 64      0.002029   0.000000
> > 128    0.016321   0.016000
> > 256    0.214143   0.204013
> > 512    3.125888   3.124195
> > 800    6.374982   5.864366
> > 1024  34.10479   35.22620
> > 2048  277.3071   283.2417
> >
>
> That is interesting. The question then becomes has FORTRAN improved on
> the way to 4.2.0 or has Ada regressed.
>
> Try doing a make dis_all which should produce annotated assembly output.
> The Ada version can be a little daunting in the way we have setup the
> files since the generic instantiations at the top full the .S files
> (woops, looked like I named them .dis) with the generic instatance.
>
> Even if you are not an assembly guru, if you start from the bottom of
> the files you can usually pretty quickly find that inner loop and
> compare the number of statements.