Porting of programs from VAX to AXP.
====================================

I)    To get optimal performance from the AXP architecture.
II)   Features/problems in the new compilers on AXP.
III)  Porting problems VAX -> AXP.

I)  To get optimal performance from the AXP architecture.
=========================================================

A)  proper alignment of data
B)  choosing algorithms to maximize benefit from cache
C)  choosing programming style to allow maximal pipelining

II)  Features/problems in the new compilers on AXP.
===================================================

A)  C
B)  FORTRAN
 C)  PASCAL  " III)  Porting problems VAX -> AXP." ==================================  ! A)  problems with system services  B)  problems with macro32  C)  architectural problems' D)  problems with undocumented features  E)  MACRO64 assembler  F)  VEST  B Illustration of the importance of proper data alignment and cache.B ==================================================================   Performance chart:  7                    VAX 4200                    DEC 3400 F                    (rated at 5 VUPS)           (rated at 110 Specmark)  G                    aligned    not aligned      aligned      not aligned B                    4 byte     4 byte           8 byte       8 byte  B small data           1095        365            26665         2300 (64KB)  B medium data          1040        350            10800         1200 (3.2MB)   B large data            440        240             7700         1160 (16MB)  N Numbers=number of REAL*8 initialized pr. millisecond CPU time (not real time).   Conclusions:F   - REAL*8 must be longword aligned on VAX but quadword aligned on AXPF   - performance increase with correct alignment is a factor 2-3 on VAX     but a factor 5-10 on AXPC   - the performance increase in keeping the data in cache on AXP is      a factor 2   TEST_ALIGN.FOR ==============         PROGRAM TEST_ALIGN       INTEGER*4 I,J,K        INTEGER*4 SIZ(3)       BYTE DUMMY(9)        REAL*8 X(2000001)        EQUIVALENCE (DUMMY,X) #       DATA SIZ/8000,400000,2000000/        DO 400 K=1,3         DO 200 J=1,100           DO 100 I=1,SIZ(K)+1              X(I)=I 100       CONTINUE           CALL FOOL(X) 200     CONTINUE         DO 300 I=1,9$           CALL TEST(DUMMY(I),SIZ(K)) 300     CONTINUE 400   CONTINUE	       END  C        SUBROUTINE TEST(X,SIZ)       INTEGER*4 SIZ        REAL*8 X(*)        INTEGER*4 I,J,T        INTEGER*4 PAS$CLOCK2       T=PAS$CLOCK2()       DO 200 J=1,100         DO 100 I=1,SIZ           
X(I)=I 100     CONTINUE         CALL FOOL(X) 200   CONTINUE8       WRITE(6,'(1X,F10.3)') (100.0*SIZ)/(PAS$CLOCK2()-T)       RETURN	       END  C        SUBROUTINE FOOL        RETURN	       END    Compilers and alignment. ========================   FORTRAN  -------    (FORTRAN 6.1 for AXP)   G The alignment within COMMON blocks and STRUCTUREs are determined by the M ALIGNMENT quailfier (default is /ALIGNMENT=(COMMONS=PACKED,RECORDS=NATURAL)). B The WARNING qualifier controls whether warnings are issued for nonK natural aligned variables in COMMON blocks (default is /WARNING=ALIGNMENT).   + Other variables are always natural aligned.    PASCAL ------   (PASCAL 5.0 for AXP)  F The alignment within RECORDs are determined by the ALIGNMENT qualifierF (default is /ALIGNMENT=ALPHA_AXP, which use natural alignment - on VAX; the default is /ALIGNMENT=VAX, which use packed alignment).   F Other variables are always natural aligned unless another alignment is> explicit defined in the code as an attribute for the variable.   C  -    (DEC C 1.3 for AXP)   C The alignment within STRUCTs are determined by the MEMBER_ALIGNMENT G qualifier (default is /MEMMBER_ALIGNMENT, which use natural alignment - C on VAX the default is /NOMEMBER_ALIGN, which use packed alignment). L The qualifier can be override temporarrily with the #PRAGMA MEMBER_ALIGNMENT* and #PRAGMA NOMEMBER_ALIGNMENT directives.  + Other variables are always natural aligned.    Points ------  H 1)  Natural alignment is necesarry for optimal performance (see previous
     example). G 2)  The AXP compilers "like" natural alignment, while the VAX compilers ?     "like" packed alignment, which makes identical records have 1     non-identical sizes on the two architectures. H 3)  Be carefull when calling f.ex. MACRO32 routines with natural aligned     records.   ALIGN_SIZE.C ============   #include <stdio.h>  # struct example {char a; double b;};    main() {     struct example test;     printf("%d\n",sizeof(test));  }   # Maximizing benefit from pipelining. # ===================================   D The AXP are a highly pipelined architecture meaning that it benefits< greatly from optimization techniques such as loop unrolling.   Example:  :                            VAX 4200               DEC 3400C                       (rated at 5 VUPS)     (rated at 110 Specmark) F                                         /opt=unrol:1   /opt=unroll:5  A FORTRAN standard loop (1)    219820         8360           5350   A FORTRAN unrolled loop (5)    163430         5340           4710   A assembler unrolled loop      157490         4450           4490     C numbers=millisecond CPU time used for 100000 calls to dot poduct of          1000 elements array    Conclusions:F   - the benefit of a 1->5 loop unrolling is a saving of 25% on VAX and     a saving of 35% on AXPD   - the AXP compiler is just as good to unroll as the programmer, so4     manual loop unrolling on AXP are a waste of timeC   - both on VAX and AXP are handwritten assembler code still faster 7     (not much 5% on VAX and 15% on AXP in this example)   % Compiler defaults for loop unrolling:   )                       VAX             AXP   * FORTRAN                no            0 = 5* C                      no            0 = 5( PASCAL                 no             no   LOOP_UNROLL.FOR  ===============          PROGRAM LOOP_UNROLL        INTEGER*4 N        PARAMETER (N=1003)       INTEGER*4 I,T        REAL*8 X(N),SUM        INTEGER*4 PAS$CLOCK2       REAL*8 
DOT1,DOT5,DOTASM        DO 100 I=1,N         X(I)=I 100   CONTINUE       T=PAS$CLOCK2()       DO 200 I=1,100000          SUM=DOT1(N,X)  200   CONTINUE+       WRITE(*,*) '1   :',SUM,PAS$CLOCK2()-T        T=PAS$CLOCK2()       DO 300 I=1,100000          SUM=DOT5(N,X)  300   CONTINUE+       WRITE(*,*) '5   :',SUM,PAS$CLOCK2()-T        T=PAS$CLOCK2()       DO 400 I=1,100000          SUM=DOTASM(N,X)  400   CONTINUE+       WRITE(*,*) 'ASM :',SUM,PAS$CLOCK2()-T 	       END  C        REAL*8 FUNCTION DOT1(N,X)        INTEGER*4 N        REAL*8 X(*)        INTEGER*4 I        DOT1=0       DO 100 I=1,N         DOT1=DOT1+X(I)*X(I)  100   CONTINUE       RETURN	       END  C        REAL*8 FUNCTION DOT5(N,X)        INTEGER*4 N        REAL*8 X(*)        INTEGER*4 I        DOT5=0       DO 100 I=1,MOD(N,5)          DOT5=DOT5+X(I)*X(I)  100   CONTINUE       DO 200 I=MOD(N,5)+1,N,5          DOT5=DOT5+X(I)*X(I)+       +            X(I+1)*X(I+1)+       +            X(I+2)*X(I+2)+       +            X(I+3)*X(I+3)+      +            X(I+4)*X(I+4)  200   CONTINUE       RETURN	       END   
DOTASM.MAR
==========

        .title  dotasm
        .psect  $CODE quad,pic,con,lcl,shr,exe,nowrt
        .entry  dotasm,^m<iv,r2,r3,r4,r5,r6,r7>
        movl    @4(ap),r2               ; r2=N
        movl    8(ap),r3                ; r3=X
        clrd    r0                      ; r0=DOTASM
        divl3   #5,r2,r4
        mull3   #5,r4,r4
        subl3   r4,r2,r4                ; r4=MOD(N,5)
        mull2   #8,r4
        addl2   r3,r4                   ; r4=X[MOD(N,5)+1]
        mull3   #8,r2,r5
        addl2   r3,r5                   ; r5=X[N+1]
        cmpl    r3,r4
        bgeq    200$
100$:   muld3   (r3),(r3),r6
        addd2   r6,r0
        addl2   #8,r3
        cmpl    r3,r4
        blss    100$
200$:   cmpl    r3,r5
        bgeq    400$
300$:   muld3   (r3),(r3),r6
        addd2   r6,r0
        muld3   8(r3),8(r3),r6
        addd2   r6,r0
        muld3   16(r3),16(r3),r6
        addd2   r6,r0
        muld3   24(r3),24(r3),r6
        addd2   r6,r0
        muld3   32(r3),32(r3),r6
        addd2   r6,r0
        addl2   #40,r3
        cmpl    r3,r5
        blss    300$
400$:   ret
        .end

 DOTASM.M64
 ==========  5         $routine DOTASM,kind=stack,saved_regs=<r2,r3>          $linkage_section c8:     .long   8          $code_section /         ldl     r22,(r16)               ; r22=N /         mov     r17,r23                 ; r23=X          .base   r27,$ls -         $call   OTS$REM_I,args=<(r16)/l,c8/l>          mull    r0,8,r2 :         addl    r2,r23,r3               ; r3=X[MOD(N,5)+1]         mull    r22,8,r22 4         addl    r22,r23,r24             ; r24=X[N+1]3         fmov    f31,f0                  ; f0=DOTASM          subl    r3,r23,r1          ble     r1,200$  100$:   ldg     f10,(r23)          mulg    f10,f10,f22          addg    f22,f0,f0          addl    r23,8,r23          subl    r3,r23,r1          bgt     r1,100$  200$:   subl    r24,r23,r1         ble     r1,400$  300$:   ldg     f10,(r23)          ldg     f11,8(r23)         ldg     f12,16(r23)          ldg     f13,24(r23)          ldg     f14,32(r23)          addl    r23,40,r23         mulg    f10,f10,f22          mulg    f11,f11,f23          mulg    f12,f12,f24          mulg    f13,f13,f25          mulg    f14,f14,f26          subl    r24,r23,r1         addg    f22,f23,f27          addg    f24,f25,f28          addg    f26,f0,f0          addg    f27,f28,f29          addg    f29,f0,f0          bgt     r1,300$  400$:   $return          $end_routine DOTASM          .end   Porting from VAX C to DEC C. ============================  4 DEC C is fully ANSI compatible, which VAX C is not !    There are a lot of differences !  M 1)  Much stricter syntax-check even with the default /STANDARD=RELAXED_ANSI89 3     (and guess what happend with /STANDARD=ANSI89).   J     You will have to make a lot of small modifications (f.ex. 
type casts).  F     And if we want to see all the warnings about sloppy code, then the8     qualifier /WARNING=ENABLE:ALL will surely find them.  H     Most of theese problems can be worked around with the /STANDARD=VAXC     qualifier.  G 2)  The C RTL are now automaticly searched by LINK, so no VAXC.OPT file H     anymore. (the routines are prefixed DECC$, if you need them in other     languages)  8 3)  The preprocessor are now a real ANSI C preprocessor.   VAX C: #module ident string " DEC C: #pragma module ident string  , VAX C: #define dmpi(i) printf("i" "=%d\n",i)+ DEC C: #define dmpi(i) printf(#i "=%d\n",i)     VAX C: #define init(i) x_/**/i=i DEC C: #define init(i) x_##i=i  H 4)  The builtins has changes significantly (obviously, since many of theJ     builtins gives directly access to the instruction set). Among the manyJ     small goodies are __ALLOCA, which have been missed in VAX C for years.  K Conclusion: a proper port can take some time, but the /STANDARD=VAXC kludge & can be used for a fast temporary port.    Porting from FORTRAN 5.x to 6.x.  ================================  ! There are some small differences.   L 1)  Unless the /NOSEPERATE qualifier is used, then all subroutines/functionsF     in one source-file is compiled to one object-module, which enables#     global optimization. [AXP only]   F 2)  A new qualifier /RECURSIVE allows recursive programming in FORTRAN3     (it uses stack-allocation for local variables).   B     I strongly recommend to always use /WARNING=UNINITIALIZED withB     /RECURSIVE, since uninitialized stack-variables are not zero !  H 3)  Capability of reading/writing non-native binary-formats (big endian,)     IBM floatings,IEEE floatings on VAX).   F 4)  A new intrinsic function IARGCOUNT returns the number of argumentsC     a subroutine/function is actually called with. This superseedes E     all kinds of small MACRO routine son VAX that do not work on AXP. 
    [V6.1]

5)  A new qualifier /WARNING=UNUSED helps you to clean up your code.

6)  If you install FORTRAN 6.x on VMS VAX 5.x/6.0, then you get new FORRTL
    and MTHRTL, so EXE-files linked here cannot be run on other VAXes
    running the same VMS version but without FORTRAN 6.x installed. And
    the images cannot be VESTed on AXP either. The release notes describe
    how to work around this problem (link with the old "standard" libraries).

Conclusion: Easy porting, since most changes are new features.

Porting from PASCAL 4.x to 5.x.
===============================

I have not encountered any problems (but I have not ported very much
code either, so do not jump to conclusions).

The PASCAL compiler's optimization on AXP is much better than on VAX
(compared to the other compilers).

Differences in the LINKer.
==========================

There are a few differences with the linker too:

1)  The C RTL is linked in by default (one options file less).

2)  The transfer vectors are created differently.

VAX MAR-file:
        .MACRO  TVGEN ZZZZ
        .TRANSFER ZZZZ
        .MASK   ZZZZ
        JMP     ZZZZ+2
        .ENDM   TVGEN

        TVGEN   name1
        TVGEN   name2
        TVGEN   name3

AXP OPT-file:
SYMBOL_VECTOR=(name1=PROCEDURE,name2=PROCEDURE,name3=PROCEDURE)

3)  The very useful but undocumented/unsupported option UNSUPPORTED=1
    is missing.

There are other differences too. Study the LINKER manual carefully, if
you have complex LINK setups.

Problems with system services.
==============================

In general all system services are present and behave identically on VAX and
AXP. Most of the differences are due to the differences in versions of
VMS on VAX and AXP, which will eventually disappear when VMS VAX and
VMS AXP merge.

There are a few system services that refer to pagecount and start/end
addresses. The pagecounts are not a problem, because they refer to
pagelets of 512 bytes, not physical pages of 8192 bytes.

But the start/end addresses can be a problem as input arguments.
The addresses are rounded down and up to the nearest page boundary, and
on AXP that is a physical page boundary of 8192 bytes!

INADR.C
=======

#include <stdio.h>
#include <prtdef.h>

#define N 100

int mem[N];

int sys$setprt();

main()
{
   int stat;
   int inadr[2] = {&mem[0],((int)&mem[N])-1};
   int retadr[2];
   stat = sys$setprt(inadr,retadr,0,PRT$C_UR,0);
   printf("stat   = %8x\ninadr  = %8x %8x\nretadr = %8x %8x\n",
          stat,
          inadr[0],inadr[1],
          retadr[0],retadr[1]);
}

Problems with macro32.
======================

The purpose of the MACRO32 compiler is easy porting of VMS and special
privileged user applications. As a consequence it is very efficient and
low-level (and supports writing device drivers etc.). It is also not as
good as it could have been for porting ordinary user applications.

There are many good porting hints in "Migrating to an OpenVMS AXP System:
Porting VAX MACRO Code".

Declaration of JSB-routines.
----------------------------

JSB routines must be declared in the MACRO32 compiler.

VAX MACRO32:
        JSB     name
        .
        .
        .
name:   .
        .
        .
        RSB

AXP MACRO32:
        JSB     name
        .
        .
        .
name:   .JSB_ENTRY INPUT=<Rx,Ry,Rz>,OUTPUT=<Ra,Rb,Rc>
        .
        .
        .
        RSB

If input and output are not declared, then the compiler assumes that there
are no register dependencies and saves/restores everything, so the routine
does not work properly. And no warnings!

Poor floating point support.
----------------------------

The MACRO32 compiler does not support floating point register operations.
It only supports floating point operations on memory locations. This
gives poor performance (it gets even worse because the floating point
instructions are being translated into JSB calls). And be aware: the
compiler does not give any warnings about the register operations -
the results are simply wrong.

Ignore directive.
-----------------

The .LINK directive is ignored.
The compiler does not give a warning, but the directive has no effect.

Warnings.
---------

Quadword operations that the compiler cannot be sure are naturally aligned
result in warnings.

Architectural features/problems.
================================

Integers.
---------

AXP is a real 64 bit architecture, so it has 64 bit integers.

FORTRAN:  INTEGER*8 gives a 64 bit integer, and the qualifier
          /INTEGER_SIZE=64 makes it the default for INTEGER

C:        you need to use the type __int64 (or if you include ints.h, then
          you can use int64) - this is not exactly emphasized in the online
          help

PASCAL:   INTEGER64 is the datatype for 64 bit integers

Conclusions:
  - new integer type with greater range

Floating point.
---------------

name     type/size/precision        VAX                   AXP

 F       VAX 4 byte single          full support          full support
 D       VAX 8 byte double          full support          only load/store
                                                          all calculations as G
 G       VAX 8 byte double          full support          full support
 H       VAX 16 byte quadruple      hardware old VAX      no support
                                    emulation new VAX
 S       IEEE 4 byte single         FORTRAN 6.x can       full support
                                    read/write
 T       IEEE 8 byte double         FORTRAN 6.x can       full support
                                    read/write
 X       IEEE 16 byte quadruple     ?                     emulation in
                                                          FORTRAN 6.2 ?

Compiler switches:

           FORTRAN/PASCAL/C

F          default on AXP
S          /FLOAT=IEEE

D          /FLOAT=D
G          default on AXP
T          /FLOAT=IEEE

Conclusions:
  - new standard datatype (IEEE, used by Intel and various RISC/UNIX platforms)
  - H-floating is missing
  - G-floating has beaten D-floating

Procedure calling standard.
---------------------------

AXP is not using the VAX procedure calling standard. In an attempt to
speed up procedure calls the first 6 arguments are no longer
passed in a separate argument list, but in 6 hardware registers: R16-R21
for integer and F16-F21 for floating point. The VAX procedure stack frame
is not used anymore either.

Consequences:

1)  It is not easy to write a routine NARG, which in the call-sequence
    X -> Y -> NARG can tell Y how many arguments X called Y with.

    FORTRAN now has the IARGCOUNT function to get this information.
    C has the standard stdarg/vararg. PASCAL also has facilities to
    do this. MACRO32 code can still refer directly to (AP).

2)  It is no longer possible to write a routine that inserts an
    error-handler for the routine calling this routine. The
    LIB$ESTABLISH/LIB$REVERT no longer exist as routines. The
    FORTRAN and C compilers on AXP recognize them and generate inline
    code. PASCAL and MACRO32 programs cannot use them. MACRO32 code
    can still refer directly to (FP). Both C and PASCAL have their own
    error-handling routines, which still work.

3)  Fake returns do not work any longer either. (A calls B calls C returns to A)

There are some good descriptions in "OpenVMS Calling Standard".

For those who want to start playing with the VMS AXP stack, the
routines LIB$GET_CURRENT_INVO_CONTEXT and LIB$GET_PREVIOUS_INVO_CONTEXT
are the way to go (they are documented in the manual mentioned above).

Problems with undocumented features.
====================================

Program linked directly with VMS.
---------------------------------

VAX: $ LINK anything+SYS$SYSTEM:SYS.STB/SELECTIVE_SEARCH
     (or .link "sys$system:sys.stb"/selective_search in MACRO32 code)
AXP: $ LINK/SYSEXE anything
     (or SYS$LOADABLE_IMAGES:SYS$BASE_IMAGE.EXE/SHARE in options-file)

(DCLDEF.STB is still there)

Size of SYS0 fields.
--------------------

Certain fields in SYS0 space have changed size.
Example: SYS$GW_IJOBCNT -> SYS$GL_IJOBCNT.

SYS0.C on VAX
-------------

#include <stdio.h>

globalref sys$gw_ijobcnt;

main()
{
   printf("Number of interactive jobs=%d\n",sys$gw_ijobcnt);
}

 SYS0.C on AXP 
 -------------e   #include <stdio.h>   globalref sys$gl_ijobcnt;d   main() {_<    printf("Number of interactive jobs=%d\n",sys$gl_ijobcnt); }r  * Read/write in other process address-space.* ------------------------------------------  : The well-known hack for reading/writing in another process= address-space via a double/single special kernel mode AST aret> extremely difficult in VMS AXP. It is said to be possible, but; DEC has supplied to new routines (undocumented/unsupported) 0 EXE$READ_PROCESS and EXE$WRITE_PROCESS to do it.   Format of EXE-files. --------------------  I The format of EXE-files has changed. Any program reading and interpretingoF image header and GST/DST must be rewritten. See the difference between= the $IHADEF/$IHDDEF and $EIHADEF/$EIHDDEF macros for details.4   SYS$IMGACT.e -----------o  K I have not yet been able to get the undocumented/unsupported system servicee' SYS$IMGACT to work properly on VMS AXP.    MACRO64 assembler. ==================  J It is actually rather simple to use. It i mostly like MACRO32, but one has0 to get used to the load/store way of addressing.  I I doubt that it will ever be very musch used. It is not bundled with VMS.aA MACRO32 is still used for priviliged code. DEC has announced thato3 writing device-drivers in C will be supported soon.A i VEST.e =====   G The purpose of VEST is easy porting of user applications with no direct  dependence of VMS.  3 It is based on both translation and interpretation. 
  ( I have moved two applicatiosn with VEST:4   - a 75 block executable with a 500 block shareable   - a 500 block executable  ( Both programs worked fine after VESTing.   Efficiency chart:   7                    VAX 4200                    DEC 3400$F                    (rated at 5 VUPS)           (rated at 110 Specmark)  B                                                VESTed       native  @ integer                  13240                 2260          430 calculations  @ floating point           36510                 2470          420 calculations  @ character                25420                 8500         2250 moves   G Numbers=milliseconds of CPU usage on specified operation on small data.R   Conclusions:*   - VEST gived an overhead of a factor 3-5G   - the gain of the VAX -> AXP change depends on the type of operations A     (integer: factor 25, floating point: factor 75,character: 10)(  H [DEC's tests on the Specmark programs says a factor 2-6 in VEST overheadE and the hardware upgrade scaled to same number of VUPS gives a factor  12-56] 5 VEST.FOR ========         PROGRAM VEST       CALL TESTINT       CALL TESTFLOAT       CALL TESTCHAR,	       END  C5       SUBROUTINE TESTINT%       INTEGER*4 I,J,T,A(4000),B(4000)        INTEGER*4 PAS$CLOCK2       DO 100 I=1,4000+         A(I)=I 100   CONTINUE       T=PAS$CLOCK2()       DO 300 J=1,1000          DO 200 I=4,3997=H           B(I)=(A(I-3)+A(I-2)+2*A(I-1)+8*A(I)+2*A(I+1)+A(I+2)+A(I+3))/16 200     CONTINUE         CALL FOOL(B) 300   CONTINUE       WRITE(*,*) PAS$CLOCK2()-T        RETURN	       ENDo C        SUBROUTINE TESTFLOAT       INTEGER*4 I,J,T0       REAL*8 A(4000),B(4000)       INTEGER*4 PAS$CLOCK2       DO 100 I=1,4000r         A(I)=I 100   CONTINUE       T=PAS$CLOCK2()       DO 300 J=1,1000          DO 200 I=4,39973           B(I)=0.0625D0*A(I-3)+N      +         0.0625D0*A(I-2)+       +         0.125D0*A(I-1)+      +         0.5D0*A(I)+      +         
0.125D0*A(I+1)+      +         0.0625D0*A(I+2)+,      +         0.0625D0*A(I+3) 200     CONTINUE         CALL FOOL(B) 300   CONTINUE       WRITE(*,*) PAS$CLOCK2()-T        RETURN	       END$ C  d       SUBROUTINE TESTCHARd       INTEGER*4 I,J,Tu       CHARACTER*4000 A,B       INTEGER*4 PAS$CLOCK2       DO 100 I=1,40006         A(I:I)=CHAR(MOD(I,256))  100   CONTINUE       T=PAS$CLOCK2()       DO 300 J=1,1000d         DO 200 I=1,3901             B(I:I+49)=A(I+50:I+99) 200     CONTINUE         CALL FOOL(B) 300   CONTINUE       WRITE(*,*) PAS$CLOCK2()-TO       RETURN	       END  C        SUBROUTINE FOOLs       RETURN	       END 