This appendix contains the following example programs, shell scripts, and awk scripts that are used to create some of the examples in this book:
“Program adi2” is an example Fortran program used to demonstrate problems in cache and TLB use.
“Basic Makefile” is the skeleton of a makefile that handles compiler options in different categories.
“Software Pipeline Script swplist” is a shell script that compiles a module to create an assembly listing and extracts the software pipeline report cards.
“Shell Script ssruno” is a simple script to make SpeedShop experiments more convenient to run.
“Awk Script for Perfex Output” demonstrates how the output of perfex can be post-processed for analysis.
“Awk Script for Amdahl's Law Estimation” reads execution times, derives the parallel fraction of the program, and extrapolates the execution time for larger numbers of CPUs based on Amdahl's law.
“Page Address Routine va2pa()” is a C function to return the physical address of a virtual memory variable.
The program adi2 in Example C-1 is used as an example in several chapters.
c---------------------------------------------------------------------
c     fake_adi: fills a 3-D array with values from rand(), then for
c     maxsteps iterations sweeps it along x, y, and z in turn.  The
c     sweeps differ only in the stride between successive elements
c     (1, ldx, and ldx*ldy).  Prints the time reported by dtime for
c     the sweep phase and a checksum of the final array.
c---------------------------------------------------------------------
      program fake_adi
      implicit none
      integer ldx, ldy, ldz, nx, ny, nz, maxsteps
c     ldx/ldy/ldz are the declared (leading) dimensions;
c     nx/ny/nz are the extents actually swept.
      parameter (ldx = 128, ldy = 128, ldz = 128)
      parameter (nx = 128, ny = 128, nz = 128)
      parameter (maxsteps = 2)
      real*8 data(ldx,ldy,ldz)
      integer i, j, k, istep
      external rand, dtime
      real*4 dtime, t, t2(2)
      real*8 rand, checksum
c
c     initialize the array
      do k = 1, nz
      do j = 1, ny
      do i = 1, nx
         data(i,j,k) = rand()
      enddo
      enddo
      enddo
c
c     first dtime call; the second call below yields the sweep time
      t = dtime(t2)
c
      do istep = 1, maxsteps
c
c     x sweep: one call per (j,k) pencil, stride 1
c*$* assert concurrent call
      do k = 1, nz
      do j = 1, ny
         call xsweep(data(1,j,k),1,nx)
      enddo
      enddo
c
c     y sweep: one call per (i,k) pencil, stride ldx
c*$* assert concurrent call
      do k = 1, nz
      do i = 1, nx
         call ysweep(data(i,1,k),ldx,ny)
      enddo
      enddo
c
c     z sweep: one call per (i,j) pencil, stride ldx*ldy
c*$* assert concurrent call
      do j = 1, ny
      do i = 1, nx
         call zsweep(data(i,j,1),ldx*ldy,nz)
      enddo
      enddo
c
      enddo
c
      t = dtime(t2)
      write(6,1) t
 1    format(1x,'Time: ',f6.3,' seconds')
      checksum = 0.0d0
      do k = 1, nz
      do j = 1, ny
      do i = 1, nx
         checksum = checksum + data(i,j,k)
      enddo
      enddo
      enddo
c
      write(6,2) checksum
 2    format(1x,'Checksum: ',1pe17.10)
c
      end
c---------------------------------------------------------------------
c     xsweep: forward then backward recurrence over the n elements of
c     v that lie is words apart.  The dummy arguments is and n are
c     typed before they appear in the bound of v, as a specification
c     expression requires under implicit none.
c---------------------------------------------------------------------
      subroutine xsweep(v,is,n)
      implicit none
      integer is, n
      real*8 v(1+is*(n-1))
      integer i
      real*8 half
      parameter (half = 0.5d0)
c
c     forward pass: each element gains half of its predecessor
      do i = 2, n
         v(1+is*(i-1)) = v(1+is*(i-1)) + half*v(1+is*(i-2))
      enddo
c
c     backward pass: each element loses half of its successor
      do i = n-1, 1, -1
         v(1+is*(i-1)) = v(1+is*(i-1)) - half*v(1+is*i)
      enddo
c
      return
      end
c---------------------------------------------------------------------
c     ysweep: same recurrence as xsweep, kept as a separate routine.
c---------------------------------------------------------------------
      subroutine ysweep(v,is,n)
      implicit none
      integer is, n
      real*8 v(1+is*(n-1))
      integer i
      real*8 half
      parameter (half = 0.5d0)
c
      do i = 2, n
         v(1+is*(i-1)) = v(1+is*(i-1)) + half*v(1+is*(i-2))
      enddo
c
      do i = n-1, 1, -1
         v(1+is*(i-1)) = v(1+is*(i-1)) - half*v(1+is*i)
      enddo
c
      return
      end
c---------------------------------------------------------------------
c     zsweep: same recurrence as xsweep, kept as a separate routine.
c---------------------------------------------------------------------
      subroutine zsweep(v,is,n)
      implicit none
      integer is, n
      real*8 v(1+is*(n-1))
      integer i
      real*8 half
      parameter (half = 0.5d0)
c
      do i = 2, n
         v(1+is*(i-1)) = v(1+is*(i-1)) + half*v(1+is*(i-2))
      enddo
c
      do i = n-1, 1, -1
         v(1+is*(i-1)) = v(1+is*(i-1)) - half*v(1+is*i)
      enddo
c
      return
      end
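Program adi5.f is identical to Example C-1 except for the one line shown in Example C-2 that differs: the first parameter statement, which pads the leading array dimensions ldx and ldy from 128 to 129.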
program fake_adi implicit none integer ldx, ldy, ldz, nx, ny, nz, maxsteps parameter (ldx = 129, ldy = 129, ldz = 128) parameter (nx = 128, ny = 128, nz = 128) parameter (maxsteps = 2) real*8 data(ldx,ldy,ldz) |
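Program adi53.f is identical to Example C-1 except for the lines shown in Example C-3 that differ: the padded ldx and ldy parameter statement, the z-sweep loop (which now copies each plane into a temporary array before sweeping and copies it back afterward), and the new subroutine copy.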
program fake_adi c implicit none c integer ldx, ldy, ldz, nx, ny, nz, maxsteps parameter (ldx = 129, ldy = 129, ldz = 128) parameter (nx = 128, ny = 128, nz = 128) parameter (maxsteps = 2) ... do j = 1, ny call copy(data(1,j,1),ldx*ldy,temp,nx,nx,nz) do i = 1, nx call zsweep(temp(i,1),nx,nz) enddo call copy(temp,nx,data(1,j,1),ldx*ldy,nx,nz) enddo ... subroutine copy(from,lf,to,lt,nr,nc) implicit none real*8 from(lf,nc), to(lt,nc) integer lf, lt, nr, nc integer i, j do j = 1, nc do i = 1, nr to(i,j) = from(i,j) enddo enddo return end |
This Makefile is a template for a Makefile suitable for any moderately complex program composed of Fortran and C source files. It isolates compiler options into groups for easy editing and experimentation.
#! /usr/sbin/smake
# --------------------------------------------------------------------
# Basic Makefile for a program composed of Fortran and C modules
# --------------------------------------------------------------------
# The following variables specify the compiler and linker options,
# assembling them by groups for use in later commands. You may
# need to edit these lines several times while tuning.
#
# -- flags related to ISA, ABI, and model (ipxx) go to $ARCH
# -- set -n32 or -64. -TARG, -TENV could go here too.
ABI = -n32
# -- probably -mips4
ISA = -mips4 -r10000
# -- ip27 for Origin2000/Onyx2
PROC = ip27
ARCH = $(ABI) $(ISA)
# -- flags related to optimization level go to $OPT
# -- set level, e.g. -O0 -g3, -O3, -Ofast=$(PROC)
OLEVEL = -O2
# -- set -OPT: option group
OOPT = -OPT:alias=restrict
# -- set -IPA: option group
OIPA =
# -- set -LNO: option group
OLNO =
OPT = $(OLEVEL) $(OOPT) $(OIPA) $(OLNO)
# -- flags related to numeric precision, by compiler
FOPTS = -OPT:IEEE_arithmetic=3:roundoff=2
COPTS = -OPT:IEEE_arithmetic=3:roundoff=2
# Assemble the f77 and cc flags into single variables
FFLAGS = $(ARCH) $(OPT) $(FOPTS)
CFLAGS = $(ARCH) $(OPT) $(COPTS)
# Link-time flags must include ABI, ISA, and opt flags
LDFLAGS = $(ARCH) $(OPT)
# --------------------------------------------------------------------
# The following variables specify the program components.
# You typically edit these lines only once, to specify the modules.
#
# -- Specify the name of the executable program:
EXEC = execname
# -- list all Fortran object files, e.g. FOBJS = f1.o f2.o
FOBJS =
# -- list all C object files, e.g. COBJS = c1.o c2.o c3.o
COBJS =
# -- List all linked libs
LIBS = -lfastm -lm
# The program comprises the following object files:
OBJS = $(FOBJS) $(COBJS)
# --------------------------------------------------------------------
# The following variables locate tools based on an environment
# variable (or command-line argument) $TOOLROOT.
FC = $(TOOLROOT)/usr/bin/f77
CC = $(TOOLROOT)/usr/bin/cc
LD = $(FC)
F77 = $(FC)
# Locate a script that processes the .S output files
SWP = swplist
# Shorthand for "rm" for use in "make clean"
RM = /bin/rm -f
# --------------------------------------------------------------------
# Nothing below this point should need editing.
# --------------------------------------------------------------------
# The following target implements "make execname" by linking
# all object files. It is deliberately the FIRST target in the file,
# so that a plain "make" builds the program; if "clean" came first,
# a plain "make" would delete the program instead.
$(EXEC): $(OBJS)
	$(LD) -o $@ $(LDFLAGS) $(OBJS) $(LIBS)
# --------------------------------------------------------------------
# The following target implements "make clean"
clean:
	$(RM) $(EXEC) $(OBJS)
# --------------------------------------------------------------------
# The following targets tell how to compile objects from sources.
# Variable $DEFINES is set on the make command line, if at all.
.SUFFIXES: .o .F .c .f .swp
.F.o:
	$(FC) -c $(FFLAGS) $(DEFINES) $<
.f.o:
	$(FC) -c $(FFLAGS) $(DEFINES) $<
.c.o:
	$(CC) -c $(CFLAGS) $(DEFINES) $<
# --------------------------------------------------------------------
# The following targets implement "make sourcename.swp" to inspect
# the SWP code generation (requires swplist script)
.F.swp:
	$(SWP) -c $(FFLAGS) $(DEFINES) -WK,-cmp=$*.m $<
.f.swp:
	$(SWP) -c $(FFLAGS) $(DEFINES) -WK,-cmp=$*.m $<
.c.swp:
	$(SWP) -c $(CFLAGS) $(DEFINES) $<
This complex csh script compiles one or more C or Fortran source files with the -S option, which produces only an assembler listing, not an object file. Then it processes each of the listing files, extracting just the software pipeline “report cards,” and merges these back into the original source files. The merged files, showing pipeline statistics above the loops to which they apply, are written with .swp extensions.
Note that the source line number the compiler assigns to a generated loop is only approximate because the higher levels of optimization transform the code. As a result, a report card in the .swp file sometimes precedes the loop to which it applies, although the report card sections appear in the correct sequence.
Example C-5. Shell Script swplist
#!/bin/csh -f
# swplist: compile the named sources with -S, pull the software
# pipeline report lines (#<swps> / #<swpf>) out of each assembler
# listing, and merge them into a line-numbered copy of the source,
# written as <name>.swp.
#
# Usage: swplist [compiler flags] files...
if ( $#argv == 0 ) then
    echo ""
    echo "Usage: $0 [compiler flags] files..."
    echo " This version of the script uses the Environment variable"
    echo " TOOLROOT if set."
    echo " All tools are called as "\$"TOOLROOT/usr/bin/<tool>."
    exit
endif

# Temp directory: $TMPDIR when it is set and exists, else /usr/tmp
set t = /usr/tmp
if (${?TMPDIR}) then
    if (-e ${TMPDIR}) then
        set t = ${TMPDIR}
    endif
endif
if ( ! $?TOOLROOT ) then
    setenv TOOLROOT /
endif
echo 'TOOLROOT is "'$TOOLROOT'"'
set nawk_file1 = $t/$$.SWP.NAWK_1
set nawk_file2 = $t/$$.SWP.NAWK_2

# First awk program extracts SWP descriptive lines and saves
# in temp files, one per loop. Output is a list of loop-files.
# The here-documents are unquoted on purpose: csh expands $t in them,
# and \$0 reaches awk as $0.  The input file name is taken from
# ARGV[1] because FILENAME is not reliably set inside BEGIN.
cat << NAWK_FILE1_END > $nawk_file1
BEGIN {
    Loop = 0; GotLine = 0; LoopID = 0;
    TmpFileRoot = sprintf("$t/%s_SWP",ARGV[1])
}
/#<swps>/ || /#<swpf>/ {
    if (Loop == 0) {
        Loop = 1;
        LoopID++;
        TmpFile = TmpFileRoot"."LoopID;
    }
    print > TmpFile;
}
/oop line/ {
    if (Loop == 1) {
        if (GotLine == 0) {
            GotLine = 1;
            split(\$0, Line);
            i=0;
            while (Line[i] != "line") {i++}
            LoopLine = Line[++i];
            print LoopLine " " TmpFile
        }
    }
}
!/#<swps>/ && !/#<swpf>/ {
    if (Loop == 1) {
        Loop = 0; GotLine = 0;
        close(TmpFile)
    }
}
END {
    if (Loop == 1) close(TmpFile)
}
NAWK_FILE1_END

# Second awk program merges the per-loop files into the numbered
# source listing, each one ahead of the source line it names.
# getline results are tested with "> 0" so that a read error (-1)
# or end of input ends the loop instead of spinning forever, and
# OutFile is closed before "cat" appends to it so that awk's buffered
# lines land in the file first.
cat << NAWK_FILE2_END > $nawk_file2
BEGIN {
    CurrentLine = 1
    TmpFileRoot = sprintf("$t/%s_SWP",ARGV[1])
    Name = substr(ARGV[1], 1, length(ARGV[1])-3)
    SortInp = Name".sort"
    OutFile = Name".swp";
    system("rm -f "OutFile);
    while ( (getline pair < SortInp) > 0 ) {
        split(pair,rec);
        NextLine = rec[1];
        NextInpFile = rec[2];
        while ( CurrentLine < NextLine ) {
            if ( (getline) <= 0 ) break;
            print >> OutFile;
            ++CurrentLine;
        }
        close(OutFile);
        system("cat " NextInpFile " >> " OutFile);
        system("rm " NextInpFile);
    }
}
{ print >> OutFile; }
END { system("rm " SortInp); }
NAWK_FILE2_END

# compile all modules with -S given flags and modules specified
${TOOLROOT}/usr/bin/f77 -S $*

# for each module named on command line, process the output
set narg = $#argv
@ i = 1
while ($i <= $narg)
    # csh requires "then" on the same line as its "if"
    if (($argv[$i]:e == f) || ($argv[$i]:e == F) || ($argv[$i]:e == c)) then
        # This guards against interpreting flags such as -WK,-inff=file.f
        # as files to compile.
        if (-e $argv[$i]) then
            set s = $argv[$i]:r
            pr -t -n10 $argv[$i] > $s.pr
            nawk -f $nawk_file1 $s.s | sort -n > $s.sort
            nawk -f $nawk_file2 $s.pr
            /bin/rm $s.pr
        endif
    endif
    @ i = $i + 1
end
/bin/rm $nawk_file1
/bin/rm $nawk_file2
This script simplifies running a SpeedShop experiment.
Example C-6. SpeedShop Experiment Script ssruno
#!/bin/csh
# script to ssrun a program with designated output dir/filename.
#
# Usage: ssruno [-d output_dir] [-o output_file] [-ssrun_opts] prog_and_args
#   -d dir    exported as _SPEEDSHOP_OUTPUT_DIRECTORY
#   -o file   exported as _SPEEDSHOP_OUTPUT_FILENAME
#   -other    any other dash option is passed through to ssrun
#             (default when none is given: -usertime)

# if no arguments, document usage
if (0 == $#argv) then
    echo "$0 [-d output_dir] [-o output_file] [-ssrun_opts] prog_and_args"
    exit -1
endif

# initialize operands
set ssopts = ""
set otdir = "."
set otfile = ""
set proggy = ""

# collect -d, -o, and -ssrun options. Upon encountering name
# of program, break out of the loop, leaving $argv == prog_and_args
while ($#argv > 0)
    switch ($1)
    case "-o":
        # -o needs a value; without this check $2 would be undefined
        if ($#argv < 2) then
            echo "-o requires an output file name"
            exit -1
        endif
        setenv _SPEEDSHOP_OUTPUT_FILENAME $2
        set otfile = $2
        shift
        breaksw
    case "-d":
        # -d needs a value; without this check $2 would be undefined
        if ($#argv < 2) then
            echo "-d requires an output directory name"
            exit -1
        endif
        setenv _SPEEDSHOP_OUTPUT_DIRECTORY $2
        set otdir = $2
        shift
        breaksw
    case "-*":
        set ssopts = ($ssopts $1)
        breaksw
    default:
        # get only tail, allowing ssrun /foo/bar/a.out
        # (the :t modifier must directly follow the variable name;
        # after a closing brace it would be literal text)
        set proggy = $1:t
        break
    endsw
    shift
end

# have to have seen a program
if ("X$proggy" == "X") then
    echo you must name a subject program
    exit -2
endif

# default the experiment type
if ("X$ssopts" == "X") then
    set ssopts = -usertime
endif

# run the experiment
echo ssrun $ssopts $argv....
ssrun $ssopts $argv
echo ...... ssrun ends.

# display all the output files with names starting $proggy
if ("X$otfile" == "X") then
    # outfile not given, file is name.exptype.xpid
    ls -l $otdir/$proggy.*.?[0-9][0-9][0-9]*
else
    # outfile given, file is name.xpid
    ls -l $otdir/$otfile.*
endif
This script demonstrates one way to reduce and analyze the output of a perfex profile.
Example C-7. Awk Script to Analyze Output of perfex -a
# Reads output of perfex -a [-y]. Prints selected, reordered counters
# interpolating calculated ratios and percents. Perfex runs of short
# programs often have zero values for some counts - allow for these:
# every ratio is printed only when its divisor is nonzero.
BEGIN {
    maxline = 0                 # track highest counter value seen
    mhz = 200                   # assumed MHZ, adjust as needed
}
$0 ~ /^[ 123][0-9] / {          # perfex data line
    lines[$1] = $0              # save the whole line
    counter[$1] = $NF           # save reported value
    if (maxline < $1) maxline = $1  # note high line# seen
}
END {                           # at end, print report
    if (maxline >= 31) {
        print lines[0]
        seconds = counter[0]/(mhz*1000000)
        print " " seconds " seconds elapsed at " mhz "MHZ"
        print lines[17]
        if (counter[17]) {
            print " " counter[0]/counter[17] " cycles/graduated instruction"
            # seconds is 0 when the cycle counter is 0; skip the rate then
            if (seconds > 0) {
                print " " (counter[17]/seconds)/1000000 " MIPS at " mhz "MHZ"
            }
            print lines[18]
            print lines[19]
            if (counter[18]*counter[19]) {
                print " " (counter[17]-counter[18])/counter[18] " instructions/load"
                print " " (counter[17]-counter[19])/counter[19] " instructions/store"
                print " " counter[18]/counter[19] " loads/store"
            }
            print lines[21]
            if (counter[21]) {
                print " " int((counter[21]/counter[17])*100) "% fp instructions"
            }
        }
        print lines[6]
        print lines[24]
        if (counter[6]*counter[24]) {
            print " " int((counter[24]/counter[6])*100) "% branches mispredicted"
        }
        print lines[23]
        if (counter[17]*counter[23]) {
            print " " counter[17]/counter[23] " instructions/TLB miss"
        }
        print lines[9]
        if (counter[17]*counter[9]) {
            print " " counter[17]/counter[9] " instructions/i-L1 miss"
        }
        print lines[10]
        print lines[11]
        if (counter[17]*counter[10]) {
            print " " counter[17]/counter[10] " instructions/i-L2 miss"
        }
        print lines[25]
        if (counter[17]*counter[25]) {
            print " " counter[17]/counter[25] " instructions/d-L1 miss"
        }
        print lines[26]
        print lines[27]
        # guard on counter[26], the value actually used as the divisor
        if (counter[17]*counter[26]) {
            print " " counter[17]/counter[26] " instructions/d-L2 miss"
        }
        smiss = counter[10]+counter[26]
        print " " smiss " total L2 misses, " 128*smiss " bytes from memory"
        if (seconds > 0) {
            print " " int(((128*smiss)/seconds)/1024) \
                  " KB/sec memory bandwidth use at " mhz "MHZ"
        }
        print lines[22]
        print lines[7]
        print lines[30]
        print lines[31]
    } else
        print "incomplete input"
}
The script in Example C-8 can be run with the command awk -f amdahl.awk. Each line of input must be a list of numbers that represent execution times for one program using different numbers of CPUs. The nth number must be the execution time using n CPUs, T(n). Use 0 for an unknown time; however, at least the first and last numbers must be nonzero.
The script displays the calculated parallel fraction of the code, p, and the speedup and expected run time for various numbers of CPUs. Enter another line of times, or terminate the program with Ctrl+C.
Example C-8. Awk Script to Extrapolate Amdahl's Law from Measured Times
# amdahl.awk: an input line is a series of execution times
# T(1), T(2),...T(N) for a program run with 1, 2, ... N CPUs.
# Use 0 for an unknown time. T(1) and T(N) must be nonzero.
# For example, after test with 1, 2, and 4 CPUs, you could enter
#    240 190 0 75
# to show those times, with 0 for the unknown time T(3).
#
# For each line the script prints the parallel fraction p, the
# measured speedups, and extrapolated speedups and run times.
# Blank lines are ignored; lines that cannot yield an estimate are
# rejected with a message instead of looping or dividing by zero.
NF == 0 { next }
{
    # need at least T(1) and T(2), and nonzero first and last times
    if (NF < 2 || ($1+0) <= 0 || ($NF+0) <= 0) {
        printf("Enter at least two times; T(1) and T(N) must be nonzero\n\n")
        next
    }
    # save times T(n) in array t[]
    for (j=1;j<=NF;++j) t[j] = $j
    # calculate p, parallel fraction of code
    if (2==NF) {
        # use simple formula for p given only T1, T2
        s2 = t[1]/t[2]
        p = 2*(s2-1)/s2
    } else {
        # use general formula on the last 2 nonzero inputs;
        # m stops at 1 at the latest, since T(1) is known to be nonzero
        for (m=NF-1; m>1 && t[m]==0; --m) ;
        sm = t[1]/t[m]
        sn = t[1]/t[NF]
        invm = 1/m
        invn = 1/NF
        denom = sm*(1-invm) - sn*(1-invn)
        if (denom == 0) {
            printf("cannot derive p from these times\n\n")
            next
        }
        p = (sm - sn)/denom
    }
    if (p<1) {
        printf("#CPUs SpeedUp(n) T(n) p=%6.3g\n",p)
        npat = "%5d %6.3g %8.3g\n"
        # print the actual times as given and their speedups
        printf(npat,1,1.0,t[1])
        for (j=2;j<=NF;++j) {
            if (t[j]) printf(npat,j,t[1]/t[j],t[j])
        }
        # extrapolate using amdahl's law based on calculated p
        # first, for CPUs one at a time to 7
        for (j=NF+1;j<8;++j) {
            sj = 1/((p/j)+(1-p))
            printf(npat,j,sj,t[1]/sj)
        }
        # then 8, 16, 32, 64 and 128
        for (j=8;j<=128;j=j+j) {
            sj = 1/((p/j)+(1-p))
            if (j>NF) printf(npat,j,sj,t[1]/sj)
        }
    } else {
        printf("p=%6.3g, hyperlinear speedup\n",p)
        printf("Enter a list of times for more than %d CPUs\n\n",NF)
    }
}
This routine allows a program to pass the address of any variable, and recover the physical memory address of the page containing the variable. It can be used to investigate memory distribution effectiveness.
You can translate a virtual address to a node number with the following macro, which calls va2pa().
/* Node number for virtual address A: the value returned by va2pa(A), shifted right 32 bits and narrowed to int. */
#define VADR2NODE(A) ((int) (va2pa(A) >> 32)) |
You can retrieve the CPU number instead of a node number using this macro:
/* CPU number for virtual address A (per the accompanying text): the value returned by va2pa(A), shifted right 16 bits and narrowed to int. */
#define VADR2CPU(A) ((int) (va2pa(A) >> 16)) |
Example C-9. Routine va2pa() Returns the Physical Page of a Virtual Address
#include <stdio.h> #include <sys/types.h> #include <sys/syssgi.h> __uint64_t va2pa( void *va) { __uint64_t pa; __uint32_t pfn; int status; static int lpgsz, pgsz = -1; if (pgsz < 0) { /* first time: log2(pagesize) */ int itmp; pgsz = itmp = getpagesize(); for (lpgsz=0; itmp>1; itmp>>=1, lpgsz++); } if ((status = syssgi(SGI_PHYSP,va,&pfn)) != 0) { perror("Virtual to physical mapping failed"); exit(1); } pa = (((__uint64_t) pfn << lpgsz) | ((__uint64_t) va & (pgsz-1))); return (pa); } |
This routine gets the clock rate in megahertz of the first CPU listed in the hardware inventory, and returns it as an integer. You can use this number to convert an elapsed time into a count of CPU cycles.
Example C-10. Routine cpuclock() Gets the Clock Speed from the Hardware Inventory
/* =============================================================
|| Return CPU clock rate in megahertz, by the rather
|| byzantine method of scanning the hardware inventory
||
|| Scans the inventory for the first record whose class is
|| INV_PROCESSOR and whose type is INV_CPUBOARD, and returns that
|| record's inv_controller field.  Returns DFLT_MHZ when the
|| inventory cannot be opened or holds no such record.
*/
#include <invent.h>
#define DFLT_MHZ 195 /* return if any error */
int cpuclock(void)
{
    inventory_t *p_inv;
    int mhz = DFLT_MHZ;

    if (setinvent())
        return DFLT_MHZ;
    for (p_inv = getinvent(); (p_inv); p_inv = getinvent()) {
        if ( (p_inv->inv_class == INV_PROCESSOR)
           &&(p_inv->inv_type == INV_CPUBOARD) ) {
            /* copy the value out now: the record must not be
               referenced once endinvent() has released the inventory */
            mhz = (int) p_inv->inv_controller;
            break;
        }
    }
    endinvent();
    return mhz;
}