/* ****************************************************************************
*
* =======
* AUTHOR:
* =======
*
* Michael Holst, Postdoc Email: holst@ama.caltech.edu
* Applied Math 217-50 Phone: (818) 395-4549
* Caltech, Pasadena, CA 91125 FAX: (818) 683-3549
*
* ========
* PURPOSE:
* ========
*
* A simple benchmark code to compute raw floating point performance of
* common computational kernels occurring in numerical simulation codes.
*
* Our main interest is in solving partial differential equations, so in
* this benchmark we stress vector-vector and SPARSE matrix-vector
* operations, as opposed to DENSE matrix-vector operations often used for
* quoting benchmarks by others. Sparse matrix (perhaps with structures
* such as diagonal bands, but still quite sparse) operations are much more
* common than dense matrix operations in any application involving the
* numerical solution of ordinary or partial differential equations, and
* therefore the floating point performance on sparse matrix operations is
* a much more important indicator of simulation speed than dense matrix
* performance. However, the latter is often quoted as the floating point
* speed of the architecture, using LINPACK-type dense matrix benchmarks,
* which is unfortunate since dense matrix operations allow for much higher
* levels of cache reuse, and artificially pump up the floating point
* performance in the sense that one will never see such performance for
* a typical simulation code. This benchmark code is intended to time some
* of the kernels that really occur in simulation codes.
*
* ====
* USE:
* ====
*
* The only parameter that may need to be set for different machines is the
* parameter "irepeat", which determines how many times the loops are
* executed for the timing. The larger the number, the longer the test
* takes, but the more accurate the timings are. For slower machines you
* will need to set this to be a smaller number (such as irepeat=2 for an
* Intel 80386) unless you are willing to wait forever. For faster
* machines, if you don't set this number large enough (such as at least
* irepeat=100 for an Intel P5-90) then you will get inaccurate timings,
* and in fact may get floating point exceptions when the code tries to
* divide the number of operations by the time taken (time=0 if the
* machine is fast and irepeat is small, or if your timing routine does
* not provide high resolution.)
*
* ================
* SAMPLE MAKEFILE:
* ================
*
* #########################################################################
* # purpose: makefile for benchmark code.
* # author: michael holst
* #########################################################################
* ARCH = -DLINUX
* TOPLEVC = /usr
* CC = gcc
* CCFLAGS = -c
* COPT = -O2
* CDEBUG =
* CLIBPATHS = -L$(TOPLEVC)/lib
* CLIBRARIES =
* CINCLUDES = -I$(TOPLEVC)/include
* CLIBS = $(CLIBPATHS) $(CLIBRARIES)
* CFLAGS = $(CINCLUDES) $(COPT) $(CDEBUG) $(ARCH)
* bench : bench.o
* $(CC) -o bench bench.o $(CLIBS)
* .c.o :
* $(CC) $(CCFLAGS) $(CFLAGS) $*.c
* clean :
* rm -f *.o bench
*
* ==================
* CONDITIONS OF USE:
* ==================
*
* Total unrestricted use of the code is granted, provided that the user
* be kind enough to forward interesting benchmark results on interesting
* machines to the author. In addition, if you must make modifications in
* order to run the code on a particular machine, the author would also
* like to know exactly what was required so that the code can continue to
* become more general. Any modifications should (hopefully) be isolated
* to the last routine in the file, "tsecnd()", and can be "#ifdef"-ed such
* as was needed for SOLARIS and AIX. At the bottom of this comment block
* is a list of benchmark results on various architectures, which will be
* updated as results on new machines are obtained.
*
* ===========================================
* BENCHMARK RESULTS ON VARIOUS ARCHITECTURES:
* ===========================================
*
* All performance numbers for "gcc -O2" unless otherwise indicated.
* Kernels timed are the following. Timings on multiprocessor machines
* are for one node only.
*
* ----------------------------------------------------------------------------
* Benchmarking your machine with...
* ----------------------------------------------------------------------------
* Vector Copy D[i] = A[i]
* Vector Add D[i] = A[i] + B[i]
* Vector Multiply D[i] = A[i] * B[i]
* Vector Divide D[i] = A[i] / B[i]
* Vector Add-Multiply D[i] = A[i] + B[i] * C[i]
* Vector Add-Divide D[i] = A[i] + B[i] / C[i]
* Matrix-vector product (5-diagonal sparse matrix)
* ----------------------------------------------------------------------------
*
* ------------ -------- -------- -------- -------- -------- -------- --------
* Architecture Copy Add Multiply Divide Add-Mult Add-Divi SpMatvec
* ------------ -------- -------- -------- -------- -------- -------- --------
* Intel 80386 3.20e-01 1.83e-01 1.28e-01 4.27e-01 5.12e-01 3.66e-01 9.85e-01
* Sun SPARC-2 1.85e+00 1.15e+00 1.10e+00 6.58e-01 1.52e+00 1.10e+00 3.50e+00
* Sun SPARC-5 4.17e+00 2.94e+00 2.70e+00 1.47e+00 4.08e+00 2.47e+00 8.93e+00
* Sun SPARC-10 4.00e+00 3.12e+00 2.70e+00 2.08e+00 6.67e+00 4.55e+00 9.17e+00
* Sun SPARC-20 4.17e+00 4.17e+00 4.17e+00 2.27e+00 5.41e+00 3.85e+00 9.90e+00
* Intel Paragon 4.00e+00 4.00e+00 3.57e+00 2.51e-01 6.06e+00 4.45e-01 9.52e+00
* Intel P5-90 2.43e+00 2.46e+00 2.72e+00 1.45e+00 3.88e+00 2.60e+00 9.57e+00
* IBM RS6000 3.70e+00 6.25e+00 6.25e+00 2.33e+00 9.52e+00 4.88e+00 1.47e+01
* RS6000 (xlc) 1.25e+01 1.25e+01 7.69e+00 2.50e+00 1.25e+01 4.35e+00 3.23e+01
* Your Machine ... ... ... ... ... ... ...
* ------------ -------- -------- -------- -------- -------- -------- --------
*
* ==============
* LAST MODIFIED:
* ==============
*
* 6-18-94
*
* ************************************************************************* */
#define irepeat 1000
/* ****************************************************************************
* Shouldn't have to change the code below this point, other than possibly
* the details of the timing routine "tsecnd()" which appears at the very
* end of the code, before the benchmark database.
* ************************************************************************* */
/* NOTE(review): the original angle-bracketed header names were stripped
 * during text extraction, leaving five bare "#include" lines. The headers
 * below are the ones this benchmark plausibly needs (printf I/O, exit,
 * floating-point math, and the clock()/gettimeofday() facilities used by
 * the tsecnd()/tstart()/tstop() timing routines) -- TODO confirm against
 * the original distribution of this file. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
extern double tsecnd();
extern void tstart();
extern void tstop ();
#define II2(i,j) ( ( (j-J0) * (I1-I0+1) ) + (i-I0) )
#define nhalf 100
#define n (nhalf * nhalf)
#define ndim ( (nhalf+1) * (nhalf+1) )
#define nloops 100
main() {
/* vectors for vector-vector and matrix-vector operations */
double a0[ndim], a1[ndim], a2[ndim], a3[ndim], a4[ndim], a5[ndim];
double a6[ndim], a7[ndim], a8[ndim], a9[ndim];
double b0[ndim], b1[ndim], b2[ndim], b3[ndim], b4[ndim], b5[ndim];
double b6[ndim], b7[ndim], b8[ndim], b9[ndim];
double x[ndim], y[ndim];
double oC[ndim], oN[ndim], oE[ndim], oNW[ndim], oNE[ndim];
/* keep track of timings and operation counts */
double rtime[nloops],rscale[nloops];
/* various other business */
double bf, oh, bf_g, oh_g, ttime, scale, s, mflop;
long int icount, index, i, j, I0, I1, J0, J1, ij, im1j, ip1j, ijm1, ijp1;
long int my_loop;
char* loop_name[78];
/* global timing */
tstart(&bf_g, &oh_g);
/* Set loop names */
/* '1234567890123456789012345678901234567890' */
loop_name[0] = " Vector Copy D[i] = A[i] ";
loop_name[1] = " Vector Add D[i] = A[i] + B[i] ";
loop_name[2] = " Vector Multiply D[i] = A[i] * B[i] ";
loop_name[3] = " Vector Divide D[i] = A[i] / B[i] ";
loop_name[4] = " Vector Add-Multiply D[i] = A[i] + B[i] * C[i] ";
loop_name[5] = " Vector Add-Divide D[i] = A[i] + B[i] / C[i] ";
loop_name[6] = " Matrix-vector product (5-diagonal sparse matrix) ";
/* some i/o */
printf(" * --------------------------------------");
printf("--------------------------------------\n");
printf(" * Benchmarking your machine with...\n");
printf(" * --------------------------------------");
printf("--------------------------------------\n");
for (j=0; j<7; j++) printf(" * %s\n",loop_name[j]);
printf(" * --------------------------------------");
printf("--------------------------------------\n");
printf(" * \n");
/* Zero time vector */
for (i=0; i