Appendix C. Useful Scripts and Code

This appendix contains the following example programs, shell scripts, and awk scripts that are used to create some of the examples in this book:

Program adi2

The program adi2 in Example C-1 is used as an example in several chapters.

Example C-1. Program adi2.f

program fake_adi
c     Test driver: fills a 3-D array with values from rand(), then,
c     for maxsteps steps, applies xsweep, ysweep and zsweep to every
c     line of the array along the first, second and third dimension
c     respectively.  Prints the value returned by dtime after the
c     sweep steps and the sum of all array elements.
      implicit none
c     ldx,ldy,ldz are the declared dimensions of data; nx,ny,nz are
c     the extents that are initialized, swept and summed.
      integer ldx, ldy, ldz, nx, ny, nz, maxsteps
      parameter (ldx = 128, ldy = 128, ldz = 128)
      parameter (nx  = 128, ny  = 128, nz  = 128)
      parameter (maxsteps = 2)
      real*8 data(ldx,ldy,ldz)
      integer i, j, k, istep
c     rand and dtime are not defined in this listing.
c     NOTE(review): presumably supplied by the Fortran run-time
c     library - confirm for the target compiler.
      external rand, dtime
      real*4 dtime, t, t2(2)
      real*8 rand, checksum
c
c     Initialize the swept part of the array, first index fastest.
      do k = 1, nz
         do j = 1, ny
            do i = 1, nx
               data(i,j,k) = rand()
            enddo
         enddo
      enddo
c
c     This result is overwritten by the second dtime call before it
c     is used.  NOTE(review): presumably the call only starts the
c     timed interval - confirm against the dtime documentation.
      t = dtime(t2)
c
      do istep = 1, maxsteps
c
c     Each sweep call receives the first element of one line of the
c     array plus the stride between successive elements of that
c     line: 1 for x, ldx for y and ldx*ldy for z.
c     NOTE(review): the "c*$* assert concurrent call" lines look
c     like compiler directives applying to the loop nest that
c     follows each one - confirm in the compiler documentation.
c*$* assert concurrent call
         do k = 1, nz
            do j = 1, ny
               call xsweep(data(1,j,k),1,nx)
            enddo
         enddo
c
c*$* assert concurrent call
         do k = 1, nz
            do i = 1, nx
               call ysweep(data(i,1,k),ldx,ny)
            enddo
         enddo
c
c*$* assert concurrent call
         do j = 1, ny
            do i = 1, nx
               call zsweep(data(i,j,1),ldx*ldy,nz)
            enddo
         enddo
c
      enddo
c
c     Report the value returned by the second dtime call.
      t = dtime(t2)
      write(6,1) t
 1    format(1x,'Time:     ',f6.3,' seconds')
c     Sum every swept element so that runs can be compared.
      checksum = 0.0d0
      do k = 1, nz
         do j = 1, ny
            do i = 1, nx
               checksum = checksum + data(i,j,k)
            enddo
         enddo
      enddo
c
      write(6,2) checksum
 2    format(1x,'Checksum: ',1pe17.10)
c
      end
c---------------------------------------------------------------------
      subroutine xsweep(v,is,n)
c     Forward/backward recurrence along one strided line of data.
c
c     v   (in/out) storage holding the line; element i of the line
c         is v(1+is*(i-1)).
c     is  (in) stride, in array elements, between successive
c         elements of the line.
c     n   (in) number of elements in the line.
c
c     The forward pass adds half of the already-updated predecessor
c     to elements 2..n; the backward pass then subtracts half of the
c     already-updated successor from elements n-1..1.
      implicit none
c     is and n are typed before they appear in the bound of v; with
c     implicit typing switched off a bound may only use variables
c     that have already been declared.
      integer is, n
      real*8 v(1+is*(n-1))
      integer i
      real*8 half
      parameter (half = 0.5d0)
c
c     forward pass
      do i = 2, n
         v(1+is*(i-1)) = v(1+is*(i-1)) + half*v(1+is*(i-2))
      enddo
c
c     backward pass
      do i = n-1, 1, -1
         v(1+is*(i-1)) = v(1+is*(i-1)) - half*v(1+is*i)
      enddo
c
      return
      end
c---------------------------------------------------------------------
      subroutine ysweep(v,is,n)
c     Forward/backward recurrence along one strided line of data.
c
c     v   (in/out) storage holding the line; element i of the line
c         is v(1+is*(i-1)).
c     is  (in) stride, in array elements, between successive
c         elements of the line.
c     n   (in) number of elements in the line.
c
c     The forward pass adds half of the already-updated predecessor
c     to elements 2..n; the backward pass then subtracts half of the
c     already-updated successor from elements n-1..1.
      implicit none
c     is and n are typed before they appear in the bound of v; with
c     implicit typing switched off a bound may only use variables
c     that have already been declared.
      integer is, n
      real*8 v(1+is*(n-1))
      integer i
      real*8 half
      parameter (half = 0.5d0)
c
c     forward pass
      do i = 2, n
         v(1+is*(i-1)) = v(1+is*(i-1)) + half*v(1+is*(i-2))
      enddo
c
c     backward pass
      do i = n-1, 1, -1
         v(1+is*(i-1)) = v(1+is*(i-1)) - half*v(1+is*i)
      enddo
c
      return
      end
c---------------------------------------------------------------------
      subroutine zsweep(v,is,n)
c     Forward/backward recurrence along one strided line of data.
c
c     v   (in/out) storage holding the line; element i of the line
c         is v(1+is*(i-1)).
c     is  (in) stride, in array elements, between successive
c         elements of the line.
c     n   (in) number of elements in the line.
c
c     The forward pass adds half of the already-updated predecessor
c     to elements 2..n; the backward pass then subtracts half of the
c     already-updated successor from elements n-1..1.
      implicit none
c     is and n are typed before they appear in the bound of v; with
c     implicit typing switched off a bound may only use variables
c     that have already been declared.
      integer is, n
      real*8 v(1+is*(n-1))
      integer i
      real*8 half
      parameter (half = 0.5d0)
c
c     forward pass
      do i = 2, n
         v(1+is*(i-1)) = v(1+is*(i-1)) + half*v(1+is*(i-2))
      enddo
c
c     backward pass
      do i = n-1, 1, -1
         v(1+is*(i-1)) = v(1+is*(i-1)) - half*v(1+is*i)
      enddo
c
      return
      end

Program adi5.f is identical to Example C-1 except for the line shown in bold type in Example C-2: the parameter statement that sets ldx and ldy to 129 instead of 128.

Example C-2. Program adi5.f

program fake_adi
      implicit none
      integer ldx, ldy, ldz, nx, ny, nz, maxsteps
      parameter (ldx = 129, ldy = 129, ldz = 128)
      parameter (nx  = 128, ny  = 128, nz  = 128)
      parameter (maxsteps = 2)
      real*8 data(ldx,ldy,ldz)

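Program adi53.f is identical to Example C-1 except for the lines shown in bold in Example C-3: the parameter statement that sets ldx and ldy to 129, the z-sweep loop, which now copies data through the array temp before and after calling zsweep, and the added subroutine copy.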

Example C-3. Program adi53.f

program fake_adi
c
      implicit none
c
      integer ldx, ldy, ldz, nx, ny, nz, maxsteps
      parameter (ldx = 129, ldy = 129, ldz = 128)
      parameter (nx  = 128, ny  = 128, nz  = 128)
      parameter (maxsteps = 2)
...
      do j = 1, ny
            call copy(data(1,j,1),ldx*ldy,temp,nx,nx,nz)
            do i = 1, nx
               call zsweep(temp(i,1),nx,nz)
            enddo
            call copy(temp,nx,data(1,j,1),ldx*ldy,nx,nz)
         enddo
...
subroutine copy(from,lf,to,lt,nr,nc)
c     Copy the leading nr-by-nc part of from into to.
c
c     from (in)  source array, declared from(lf,nc).
c     lf   (in)  first dimension of from.
c     to   (out) destination array, declared to(lt,nc); only
c          to(1:nr,1:nc) is written.
c     lt   (in)  first dimension of to.
c     nr   (in)  number of rows copied from each column.
c     nc   (in)  number of columns copied.
      implicit none
c     The integer arguments are typed before they appear in the
c     array bounds; with implicit typing switched off a bound may
c     only use variables that have already been declared.
      integer lf, lt, nr, nc
      real*8 from(lf,nc), to(lt,nc)
      integer i, j
c     first index innermost, matching the column-major layout
      do j = 1, nc
         do i = 1, nr
            to(i,j) = from(i,j)
         enddo
      enddo
      return
      end


Basic Makefile

This Makefile is a template for a Makefile suitable for any moderately complex program composed of Fortran and C source files. It isolates compiler options into groups for easy editing and experimentation.

Example C-4. Basic Makefile

#! /usr/sbin/smake
# --------------------------------------------------------------------
# Basic Makefile for a program composed of Fortran and C modules
#
# Targets:
#   make               link $(EXEC) from $(OBJS) (default target)
#   make execname      same, naming the executable explicitly
#   make clean         remove the executable and the object files
#   make source.swp    run the swplist script on source.f/.F/.c
# Recipe lines must begin with a TAB character.
# --------------------------------------------------------------------
# The following variables specify the compiler and linker options,
# assembling them by groups for use in later commands. You may
# need to edit these lines several times while tuning.
#
#  -- flags related to ISA, ABI, and model (ipxx) go to $ARCH
#     -- set -n32 or -64. -TARG, -TENV could go here too.
ABI     = -n32
#     -- probably -mips4
ISA     = -mips4 -r10000
#     -- ip27 for Origin2000/Onyx2
PROC    = ip27
ARCH    = $(ABI) $(ISA)
#  -- flags related to optimization level go to $OPT
#     -- set level, e.g. -O0 -g3, -O3, -Ofast=$(PROC)
OLEVEL  = -O2
#     -- set -OPT: option group
OOPT    = -OPT:alias=restrict
#     -- set -IPA: option group
OIPA    =
#     -- set -LNO: option group
OLNO    =
OPT     = $(OLEVEL) $(OOPT) $(OIPA) $(OLNO)
# -- flags related to numeric precision, by compiler
FOPTS   = -OPT:IEEE_arithmetic=3:roundoff=2
COPTS   = -OPT:IEEE_arithmetic=3:roundoff=2
# Assemble the f77 and cc flags into single variables
FFLAGS  = $(ARCH) $(OPT) $(FOPTS)
CFLAGS  = $(ARCH) $(OPT) $(COPTS)
# Link-time flags must include ABI, ISA, and opt flags
LDFLAGS = $(ARCH) $(OPT)
# --------------------------------------------------------------------
# The following variables specify the program components.
# You typically edit these lines only once, to specify the modules.
#
#   -- Specify the name of the executable program:
EXEC    = execname
#   -- list all Fortran object files, e.g. FOBJS = f1.o f2.o
FOBJS   =
#   -- list all C object files, e.g. COBJS = c1.o c2.o c3.o
COBJS   =
#   -- List all linked libs
LIBS    = -lfastm -lm
# The program comprises the following object files:
OBJS    = $(FOBJS) $(COBJS)
# --------------------------------------------------------------------
# The following variables locate tools based on an environment
# variable (or command-line argument) $TOOLROOT.
FC      = $(TOOLROOT)/usr/bin/f77
CC      = $(TOOLROOT)/usr/bin/cc
LD      = $(FC)
F77     = $(FC)
# Locate a script that processes the .S output files
SWP     = swplist
# Shorthand for "rm" for use in "make clean"
RM      = /bin/rm -f
# --------------------------------------------------------------------
# Nothing below this point should need editing.
# --------------------------------------------------------------------
# The following target implements "make execname" by linking
# all object files. It is deliberately the first target in the file
# so that a plain "make" builds the program instead of running
# "make clean".
$(EXEC):        $(OBJS)
	$(LD) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# --------------------------------------------------------------------
# The following target implements "make clean"
clean:
	$(RM) $(EXEC) $(OBJS)
# --------------------------------------------------------------------
# The following targets tell how to compile objects from sources.
# Variable $DEFINES is set on the make command line, if at all.
.SUFFIXES: .o .F .c .f .swp
.F.o:
	$(FC)  -c $(FFLAGS) $(DEFINES) $<
.f.o:
	$(FC)  -c $(FFLAGS) $(DEFINES) $<
.c.o:
	$(CC)  -c $(CFLAGS) $(DEFINES) $<
# --------------------------------------------------------------------
# The following targets implement "make sourcename.swp" to inspect
# the SWP code generation (requires swplist script)
.F.swp:
	$(SWP) -c $(FFLAGS) $(DEFINES) -WK,-cmp=$*.m $<
.f.swp:
	$(SWP) -c $(FFLAGS) $(DEFINES) -WK,-cmp=$*.m $<
.c.swp:
	$(SWP) -c $(CFLAGS) $(DEFINES) $<


Software Pipeline Script swplist

This complex csh script compiles one or more C or Fortran source files with the -S option, which produces only an assembler listing, not an object file. Then it processes each of the listing files, extracting just the software pipeline “report cards,” and merges these back into the original source files. The merged files, showing pipeline statistics above the loops to which they apply, are written with .swp extensions.

Note that the source line number the compiler assigns to a generated loop is only approximate because the higher levels of optimization transform the code. As a result, a report card in the .swp file sometimes precedes the loop to which it applies, although the report card sections appear in the correct sequence.

Example C-5. Shell Script swplist

#!/bin/csh -f
# swplist: compile the named source files with -S, pull the lines
# tagged #<swps> / #<swpf> out of each resulting .s listing, and merge
# them into a line-numbered copy of the source written to <name>.swp.
#
# Usage: swplist [compiler flags] files...
#
# With no arguments, print the usage text and stop.
if ( $#argv == 0 ) then
    echo ""
    echo "Usage: $0 [compiler flags] files..."
    echo "    This version of the script uses the Environment variable"
    echo "    TOOLROOT if set."
    echo "    All tools are called as "\$"TOOLROOT/usr/bin/<tool>."
    exit
endif
# Directory for temporary files: /usr/tmp unless TMPDIR is set and
# names something that exists.
set t = /usr/tmp
if (${?TMPDIR}) then
   if (-e ${TMPDIR}) then
      set t = ${TMPDIR}
   endif
endif
# Default TOOLROOT to / so that ${TOOLROOT}/usr/bin/f77 below still
# names a path.
if ( ! $?TOOLROOT ) then
    setenv TOOLROOT /
endif
echo 'TOOLROOT is "'$TOOLROOT'"'

# The two awk programs are written to per-process ($$) temp files.
set nawk_file1 = $t/$$.SWP.NAWK_1
set nawk_file2 = $t/$$.SWP.NAWK_2
# First awk program extracts SWP descriptive lines and saves
# in temp files, one per loop. Output is a list of loop-files.
# The here-document delimiter is unquoted, so csh substitutes $t
# while writing the file; \$0 is escaped so that awk receives $0.
# Each output line is "<loop line number> <temp file name>", taken
# from the first line of a tagged group that matches /oop line/.
# NOTE(review): both awk programs read FILENAME inside BEGIN, which
# not every awk sets at that point - confirm for the nawk in use.
cat << NAWK_FILE1_END > $nawk_file1
BEGIN {
        Loop = 0;
        GotLine = 0;
        LoopID = 0;
        TmpFileRoot = sprintf("$t/%s_SWP",FILENAME)
}
/#<swps>/ || /#<swpf>/ {
        if (Loop == 0) {
            Loop = 1; 
            LoopID++; 
            TmpFile = TmpFileRoot"."LoopID;
        }
        print > TmpFile;
}
/oop line/ {
        if (Loop == 1) {
                if (GotLine == 0) {
                        GotLine = 1;
                        split(\$0, Line);
                        i=0;
                        while (Line[i] != "line") {i++}
                        LoopLine = Line[++i];
                        print LoopLine " " TmpFile 
                }
        }
}
!/#<swps>/ && !/#<swpf>/ {
        if (Loop == 1) {
                Loop = 0; 
                GotLine = 0;
                close(TmpFile)
        }
}
END {
    if (Loop == 1) close(TmpFile)
}
NAWK_FILE1_END
# Second awk program merges the pieces.  Its input is <name>.pr; it
# strips the last three characters of FILENAME to get <name>, reads
# the "line file" pairs from <name>.sort, copies input lines to
# <name>.swp until the current line number reaches each pair's line,
# then appends that pair's temp file and removes it.  Remaining input
# lines are appended by the main rule, and END removes <name>.sort.
cat << NAWK_FILE2_END > $nawk_file2
BEGIN {
    CurrentLine = 1
    TmpFileRoot = sprintf("$t/%s_SWP",FILENAME)
    Name = substr(FILENAME, 1, length(FILENAME)-3)
    SortInp = Name".sort"
    OutFile = Name".swp"; system("rm -f "OutFile);
    while ( (getline pair < SortInp) != 0 ) {
        split(pair,rec);
        NextLine = rec[1];
        NextInpFile = rec[2];
        while ( CurrentLine < NextLine ) {
            getline;
            print >> OutFile;
            ++CurrentLine;
        }
        system("cat " NextInpFile " >> " OutFile);
        system("rm " NextInpFile);
    }
}
{
    print >> OutFile;
}
END {
    system("rm " SortInp);
}
NAWK_FILE2_END
# compile all modules with -S given flags and modules specified
# (every argument, flags and files alike, goes to the f77 driver)
${TOOLROOT}/usr/bin/f77 -S $*
# for each module named on command line, process the output
set narg = $#argv
@ i = 1
while ($i <= $narg)
#       Only arguments with extension f, F or c are treated as sources.
        if (($argv[$i]:e == f) || ($argv[$i]:e == F) || ($argv[$i]:e == c)) then
#           This guards against interpreting flags such as -WK,-inff=file.f
#           as files to compile.
if (-e $argv[$i]) then
#               s is the argument without its extension.  $s.pr is a
#               copy of the source made with pr -t -n10; $s.sort is
#               the numerically sorted output of the first awk program
#               run over $s.s.  The second awk program reads $s.pr.
                set s = $argv[$i]:r
                pr -t -n10 $argv[$i] > $s.pr
                nawk -f $nawk_file1 $s.s | sort -n > $s.sort
                nawk -f $nawk_file2 $s.pr
                /bin/rm $s.pr
            endif
        endif
        @ i = $i + 1
end
# Remove the generated awk programs.
/bin/rm $nawk_file1
/bin/rm $nawk_file2


Shell Script ssruno

This script simplifies running a SpeedShop experiment by letting you name the output directory and output file on the command line.

Example C-6. SpeedShop Experiment Script ssruno

#!/bin/csh
# script to ssrun a program with designated output dir/filename.
#
# usage: ssruno [-d output_dir] [-o output_file] [-ssrun_opts] prog_and_args
#   -d dir    exported as _SPEEDSHOP_OUTPUT_DIRECTORY and used for the
#             final listing (default ".")
#   -o file   exported as _SPEEDSHOP_OUTPUT_FILENAME and used for the
#             final listing
#   -other    any other option is collected and passed to ssrun; when
#             none is given, -usertime is used
# The first argument that is not an option is taken as the program;
# it and everything after it are passed to ssrun unchanged.
#
# if no arguments, document usage
if (0 == $#argv) then
   echo "$0 [-d output_dir] [-o output_file] [-ssrun_opts] prog_and_args"
   exit -1
endif
# initialize operands
set ssopts = ""
set otdir = "."
set otfile = ""
set proggy = ""
# collect -d, -o, and -ssrun options. Upon encountering name
# of program, break out of the loop, leaving $argv == prog_and_args
while ($#argv > 0)
   switch ($1)
   case "-o":
#     # -o needs an operand; stop instead of shifting past the end
      if ($#argv < 2) then
         echo "$0 : -o requires an output file name"
         exit -1
      endif
      setenv _SPEEDSHOP_OUTPUT_FILENAME $2
      set otfile = $2
      shift
      breaksw
   case "-d":
#     # -d needs an operand; stop instead of shifting past the end
      if ($#argv < 2) then
         echo "$0 : -d requires an output directory"
         exit -1
      endif
      setenv _SPEEDSHOP_OUTPUT_DIRECTORY $2
      set otdir = $2
      shift
      breaksw
   case "-*":
      set ssopts = ($ssopts $1)
      breaksw
   default:
#     # get only tail, allowing ssrun /foo/bar/a.out
#     # ($1:t, not ${1}:t - a modifier written after the closing brace
#     # is not applied and leaves a literal ":t" on the name)
      set proggy = $1:t
      break
   endsw
   shift
end
# have to have seen a program
if ("X$proggy" == "X") then
   echo you must name a subject program
   exit -2
endif
# default the experiment type
if ("X$ssopts" == "X") then
   set ssopts = -usertime
endif
# run the experiment
echo ssrun $ssopts $argv....
ssrun $ssopts $argv
echo ...... ssrun ends.
# display all the output files with names starting $proggy
if ("X$otfile" == "X") then
#  # outfile not given, file is name.exptype.xpid
   ls -l $otdir/$proggy.*.?[0-9][0-9][0-9]*
else
#  # outfile given, file is name.xpid
   ls -l $otdir/$otfile.*
endif


Awk Script for Perfex Output

This script demonstrates one way to reduce and analyze the output of a perfex profile.

Example C-7. Awk Script to Analyze Output of perfex -a

# Reads output of perfex -a [-y]. Prints selected, reordered counters
# interpolating calculated ratios and percents.  Perfex runs of short
# programs often have zero values for some counts - allow for these.
#
# Input:  lines that begin with a one- or two-digit counter number
#         followed by a blank; the last field is taken as the count.
# Output: the saved counter lines in report order, each followed by
#         the ratios derived from it, or "incomplete input" when no
#         counter numbered 31 or higher was seen.
# The assumed clock rate defaults to 200 MHz; override it with
#         awk -v mhz=250 -f thisfile
BEGIN {
  maxline = 0 # track highest counter number seen
  if (!mhz) mhz = 200 # assumed MHZ, adjust as needed (or use -v mhz=N)
}
$0 ~ /^[ 123][0-9] / { # perfex data line
  lines[$1] = $0                 # save the whole line
  counter[$1] = $NF              # save reported value
  if (maxline < $1) maxline = $1 # note high line# seen
}
END { # at end, print report
  if (maxline >= 31)
  {
    # counter 0 is divided by the clock rate to get elapsed seconds
    print lines[0]
    seconds = counter[0]/(mhz*1000000)
    print "   " seconds " seconds elapsed at " mhz "MHZ"
    print lines[17]
    if (counter[17])
    {
      print "   " counter[0]/counter[17] " cycles/graduated instruction"
      # a zero cycle count gives zero seconds: skip the rate, don't divide
      if (seconds)
      {
        print "   " (counter[17]/seconds)/1000000 " MIPS at " mhz "MHZ"
      }
      print lines[18]
      print lines[19]
      if (counter[18]*counter[19])
      {
        print "   " (counter[17]-counter[18])/counter[18] " instructions/load"
        print "   " (counter[17]-counter[19])/counter[19] " instructions/store"
        print "   " counter[18]/counter[19] " loads/store"
      }
      print lines[21]
      if (counter[21])
      {
        print "   " int((counter[21]/counter[17])*100) "% fp instructions"
      }
    }
    print lines[6]
    print lines[24]
    if (counter[6]*counter[24])
    {
      print "   " int((counter[24]/counter[6])*100) "% branches mispredicted"
    }
    print lines[23]
    if (counter[17]*counter[23])
    {
      print "   " counter[17]/counter[23] " instructions/TLB miss"
    }
    print lines[9]
    if (counter[17]*counter[9])
    {
      print "   " counter[17]/counter[9] " instructions/i-L1 miss"
    }
    print lines[10]
    print lines[11]
    if (counter[17]*counter[10])
    {
      print "   " counter[17]/counter[10] " instructions/i-L2 miss"
    }
    print lines[25]
    if (counter[17]*counter[25])
    {
      print "   " counter[17]/counter[25] " instructions/d-L1 miss"
    }
    print lines[26]
    print lines[27]
    # guard on the divisor actually used (counter 26, not counter 25)
    if (counter[17]*counter[26])
    {
      print "   " counter[17]/counter[26] " instructions/d-L2 miss"
    }
    # each L2 miss is counted as 128 bytes fetched from memory
    smiss = counter[10]+counter[26]
    print "   " smiss " total L2 misses, " 128*smiss " bytes from memory"
    if (seconds)
    {
      print "       " int(((128*smiss)/seconds)/1024) \
  " KB/sec memory bandwidth use at " mhz "MHZ"
    }
    print lines[22]
    print lines[7]
    print lines[30]
    print lines[31]
  }
  else print "incomplete input"
}


Awk Script for Amdahl's Law Estimation

The script in Example C-8 can be run with the command awk -f amdahl.awk. Each line of input must be a list of numbers that represent execution times for one program using different numbers of CPUs. The nth number must be the execution time using n CPUs, T(n). Use 0 for an unknown time; however, at least the first and last numbers must be nonzero.

The script displays the calculated parallel fraction of the code, p, and the speedup and expected run time for various numbers of CPUs. Enter another line of times, or terminate the program with Ctrl+C.

Example C-8. Awk Script to Extrapolate Amdahl's Law from Measured Times

# amdahl.awk: an input line is a series of execution times
# T(1), T(2),...T(N) for a program run with 1, 2, ... N CPUs.
# Use 0 for an unknown time.  T(1) and T(N) must be nonzero.
# For example, after test with 1, 2, and 4 CPUs, you could enter
#           240 190 0 75
# to show those times, with 0 for the unknown time T(3).
#
# For each input line the script prints the calculated parallel
# fraction p, the given times with their speedups, and extrapolated
# speedups and times for larger CPU counts.  Blank lines are ignored;
# a line with fewer than two times, or with a zero first or last
# time, is rejected with a message.

{
   # nothing to do for an empty line
   if (NF == 0) next
   # both formulas divide by T(1) and T(N), and the scan for the last
   # nonzero time below stops at T(1) at the latest only when T(1)
   # is nonzero
   if (NF < 2 || ($1+0) == 0 || ($NF+0) == 0)
   {
      printf("Enter at least two times; T(1) and T(N) must be nonzero\n\n")
      next
   }
   # save times T(n) in array t[]
   for (j=1;j<=NF;++j) t[j] = $j
   # calculate p, parallel fraction of code
   if (2==NF)
   {  # use simple formula for p given only T1, T2
      s2 = t[1]/t[2]
      p = 2*(s2-1)/s2
   }
   else
   {  # use general formula on the last 2 nonzero inputs
      for (m=NF-1; t[m]==0; --m) ;
      sm = t[1]/t[m]
      sn = t[1]/t[NF]
      invm = 1/m
      invn = 1/NF
      p = (sm - sn)/( sm*(1-invm) - sn*(1-invn) )
   }
   if (p<1)
   {
      printf("#CPUs   SpeedUp(n)    T(n)     p=%6.3g\n",p)
      npat = "%5d   %6.3g    %8.3g\n"
      # print the actual times as given and their speedups
      printf(npat,1,1.0,t[1])
      for (j=2;j<=NF;++j)
      {
         if (t[j]) printf(npat,j,t[1]/t[j],t[j])
      }
      # extrapolate using amdahl's law based on calculated p
      # first, for CPUs one at a time to 7
      for (j=NF+1;j<8;++j)
      {
         sj = 1/((p/j)+(1-p))
         printf(npat,j,sj,t[1]/sj)
      }
      # then 8, 16, 32, 64 and 128
      for (j=8;j<=128;j=j+j)
      {
        sj = 1/((p/j)+(1-p))
        if (j>NF) printf(npat,j,sj,t[1]/sj)
      }
   }
   else
   {
      printf("p=%6.3g, hyperlinear speedup\n",p)
      printf("Enter a list of times for more than %d CPUs\n\n",NF)
   }
}


Page Address Routine va2pa()

This routine allows a program to pass the address of any variable, and recover the physical memory address of the page containing the variable. It can be used to investigate memory distribution effectiveness.

You can translate a virtual address to a node number with the following macro, which calls va2pa().

#define VADR2NODE(A) ((int) (va2pa(A) >> 32))

You can retrieve the CPU number instead of a node number using this macro:

#define VADR2CPU(A) ((int) (va2pa(A) >> 16))

Example C-9. Routine va2pa() Returns the Physical Page of a Virtual Address

#include <stdio.h>
#include <stdlib.h>     /* exit() */
#include <unistd.h>     /* getpagesize() */
#include <sys/types.h>
#include <sys/syssgi.h>
/*
 * va2pa: return the physical address that corresponds to the virtual
 * address va.
 *
 * The value stored by syssgi(SGI_PHYSP, ...) is treated as a page frame
 * number: it is shifted left by log2(page size) and combined with the
 * offset of va within its page.  The page size and its base-2 logarithm
 * are computed on the first call and kept in static variables.
 *
 * If the syssgi() call fails, a message is written with perror() and
 * the process exits with status 1.
 */
__uint64_t
va2pa( void *va)
{
   __uint64_t   pa;
   __uint32_t   pfn;
   static int   lpgsz, pgsz = -1;
   if (pgsz < 0) { /* first time: log2(pagesize) */
      int itmp;
      pgsz = itmp = getpagesize();
      for (lpgsz=0; itmp>1; itmp>>=1, lpgsz++);
   }
   if (syssgi(SGI_PHYSP,va,&pfn) != 0) {
      perror("Virtual to physical mapping failed");
      exit(1);
   }
   /* frame number in the high bits, offset within the page in the low bits */
   pa = (((__uint64_t) pfn << lpgsz) | ((__uint64_t) va & (pgsz-1)));
   return (pa);
}


CPU Clock Rate Routine cpuclock()

This routine gets the clock rate in megahertz of the first CPU listed in the hardware inventory, and returns it as an integer. You can use this number to convert an elapsed time into a count of CPU cycles.

Example C-10. Routine cpuclock() Gets the Clock Speed from the Hardware Inventory

/* =============================================================
|| Return CPU clock rate in megahertz, by the rather
|| byzantine method of scanning the hardware inventory.
||
|| The inventory is scanned for the first record whose class is
|| INV_PROCESSOR and whose type is INV_CPUBOARD; the inv_controller
|| field of that record is returned.  DFLT_MHZ is returned when the
|| inventory cannot be opened or holds no such record.
*/
#include <invent.h>
#define DFLT_MHZ 195 /* return if any error */
int cpuclock(void)
{
        inventory_t *p_inv;
        int mhz = DFLT_MHZ;
        if (setinvent()) return DFLT_MHZ;
        while ((p_inv = getinvent()) != 0)
        {
                if (  (p_inv->inv_class == INV_PROCESSOR)
                        &&(p_inv->inv_type == INV_CPUBOARD) )
                {
                        /* copy the value out now, so the record is
                        || not read after endinvent() has been called
                        */
                        mhz = p_inv->inv_controller;
                        break;
                }
        }
        endinvent();
        return mhz;
}