/* ****************************************************************************
 *
 * =======
 * AUTHOR:
 * =======
 *
 *    Michael Holst, Postdoc           Email:  holst@ama.caltech.edu
 *    Applied Math 217-50              Phone:  (818) 395-4549
 *    Caltech, Pasadena, CA 91125      FAX:    (818) 683-3549
 *
 * ========
 * PURPOSE:
 * ========
 *
 *    A simple benchmark code to compute raw floating point performance of
 *    common computational kernels occuring in numerical simulation codes.
 *
 *    Our main interest is in solving partial differential equations, so in
 *    this benchmark we stress vector-vector and SPARSE matrix-vector
 *    operations, as opposed to DENSE matrix-vector operations often used for
 *    quoting benchmarks by others.  Sparse matrix (perhaps with structures
 *    such as diagonal bands, but still quite sparse) operations are much more
 *    common than dense matrix operations in any application involving the
 *    numerical solution of ordinary or partial differential equations, and
 *    therefore the floating point performance on sparse matrix operations is
 *    a much more important indicator of simulation speed than dense matrix
 *    performance.  However, the latter is often quoted as the floating point
 *    speed of the architecture, using LINPACK-type dense matrix benchmarks,
 *    which is unfortunate since dense matrix operations allow for much higher
 *    levels of cache reuse, and artifically pump up the floating point
 *    performance in the sense that one will never see such performance for
 *    a typical simulation code.  This benchmark code is intended to time some
 *    of the kernels that really occur in simulation codes.
 *
 * ====
 * USE:
 * ====
 *
 *    The only parameter that may need to be set for different machines is the
 *    parameter "irepeat", which determines how many times the loops are
 *    executed for the timing.  The larger the number, the longer the test
 *    takes, but the more accurate the timings are.  For slower machines you
 *    will need to set this to be a smaller number (such as irepeat=2 for an
 *    Intel 80386) unless you are willing to wait forever.  For faster
 *    machines, if you don't set this number large enough (such as at least
 *    irepeat=100 for an Intel P5-90) then you will get inaccurate timings,
 *    and in fact may get floating point exceptions when the code tries to
 *    divide the  number of operations by the time taken (time=0 if fast the
 *    machine is fast and irepeat small, or if you timing routine does not
 *    provide high resolution.)
 *
 * ================
 * SAMPLE MAKEFILE:
 * ================
 *
 *    #########################################################################
 *    # purpose: makefile for benchmark code.
 *    # author:  michael holst
 *    #########################################################################
 *    ARCH       = -DLINUX
 *    TOPLEVC    = /usr
 *    CC         = gcc
 *    CCFLAGS    = -c
 *    COPT       = -O2
 *    CDEBUG     =
 *    CLIBPATHS  = -L$(TOPLEVC)/lib
 *    CLIBRARIES =
 *    CINCLUDES  = -I$(TOPLEVC)/include
 *    CLIBS      = $(CLIBPATHS) $(CLIBRARIES)
 *    CFLAGS     = $(CINCLUDES) $(COPT) $(CDEBUG) $(ARCH)
 *    bench : bench.o 
 *        $(CC) -o bench bench.o $(CLIBS)
 *    .c.o :
 *        $(CC) $(CCFLAGS) $(CFLAGS) $*.c
 *    clean :
 *        rm -f *.o bench
 * 
 * ==================
 * CONDITIONS OF USE:
 * ==================
 *
 *    Total unrestricted use of the code is granted, provided that the user
 *    be kind enough to forward interesting benchmark results on interesting
 *    machines to the author.  In addition, if you must make modifications in
 *    order to run the code on a particular machine, the author would also
 *    like to know exactly what was required so that the code can continue to
 *    become more general.  Any modifications should (hopefully) be isolated
 *    to the last routine in the file, "tsecnd()", and can be "#ifdef"-ed such
 *    as was need for SOLARIS and AIX.  At the bottom of this comment block
 *    is a list of benchmark results on various architectures, which will be 
 *    updated as results on new machines are obtained.
 *
 * ===========================================
 * BENCHMARK RESULTS ON VARIOUS ARCHITECTURES:
 * ===========================================
 * 
 *    All performance numbers for "gcc -O2" unless otherwise indicated.
 *    Kernels timed are the following.  Timings on multiprocessor machines
 *    are for one node only.
 *
 * ----------------------------------------------------------------------------
 * Benchmarking your machine with...
 * ----------------------------------------------------------------------------
 *    Vector Copy            D[i] = A[i]                 
 *    Vector Add             D[i] = A[i] + B[i]          
 *    Vector Multiply        D[i] = A[i] * B[i]          
 *    Vector Divide          D[i] = A[i] / B[i]          
 *    Vector Add-Multiply    D[i] = A[i] + B[i] * C[i]   
 *    Vector Add-Divide      D[i] = A[i] + B[i] / C[i]   
 *    Matrix-vector product  (5-diagonal sparse matrix)  
 * ----------------------------------------------------------------------------
 * 
 * ------------  -------- -------- -------- -------- -------- -------- --------
 * Architecture  Copy     Add      Multiply Divide   Add-Mult Add-Divi SpMatvec
 * ------------  -------- -------- -------- -------- -------- -------- --------
 * Intel 80386   3.20e-01 1.83e-01 1.28e-01 4.27e-01 5.12e-01 3.66e-01 9.85e-01
 * Sun SPARC-2   1.85e+00 1.15e+00 1.10e+00 6.58e-01 1.52e+00 1.10e+00 3.50e+00
 * Sun SPARC-5   4.17e+00 2.94e+00 2.70e+00 1.47e+00 4.08e+00 2.47e+00 8.93e+00
 * Sun SPARC-10  4.00e+00 3.12e+00 2.70e+00 2.08e+00 6.67e+00 4.55e+00 9.17e+00
 * Sun SPARC-20  4.17e+00 4.17e+00 4.17e+00 2.27e+00 5.41e+00 3.85e+00 9.90e+00 
 * Intel Paragon 4.00e+00 4.00e+00 3.57e+00 2.51e-01 6.06e+00 4.45e-01 9.52e+00
 * Intel P5-90   2.43e+00 2.46e+00 2.72e+00 1.45e+00 3.88e+00 2.60e+00 9.57e+00
 * IBM RS6000    3.70e+00 6.25e+00 6.25e+00 2.33e+00 9.52e+00 4.88e+00 1.47e+01
 * RS6000 (xlc)  1.25e+01 1.25e+01 7.69e+00 2.50e+00 1.25e+01 4.35e+00 3.23e+01
 * Your Machine    ...      ...      ...      ...      ...      ...      ...
 * ------------  -------- -------- -------- -------- -------- -------- --------
 *
 * ==============
 * LAST MODIFIED:
 * ==============
 *
 *    6-18-94
 *
 * ************************************************************************* */

#define irepeat	1000

/* ****************************************************************************
 * Shouldn't have to change the code below this point, other than possibly
 * the details of the timing routine "tsecnd()" which appears at the very
 * end of the code, before the benchmark database.
 * ************************************************************************* */

#include <math.h>
#include <stdio.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/resource.h>

extern double tsecnd();
extern void tstart();
extern void tstop ();

#define II2(i,j)   ( ( (j-J0) * (I1-I0+1) ) + (i-I0) )

#define nhalf	100
#define n 		(nhalf * nhalf)
#define ndim	( (nhalf+1) * (nhalf+1) )
#define nloops	100

main() {

	/* vectors for vector-vector and matrix-vector operations */
	double a0[ndim], a1[ndim], a2[ndim], a3[ndim], a4[ndim], a5[ndim];
	double a6[ndim], a7[ndim], a8[ndim], a9[ndim];
	double b0[ndim], b1[ndim], b2[ndim], b3[ndim], b4[ndim], b5[ndim];
	double b6[ndim], b7[ndim], b8[ndim], b9[ndim];
	double x[ndim], y[ndim];
	double oC[ndim], oN[ndim], oE[ndim], oNW[ndim], oNE[ndim];

	/* keep track of timings and operation counts */
	double rtime[nloops],rscale[nloops];

	/* various other business */
	double bf, oh, bf_g, oh_g, ttime, scale, s, mflop;
	long int icount, index, i, j, I0, I1, J0, J1, ij, im1j, ip1j, ijm1, ijp1;
	long int my_loop;
	char* loop_name[78];

	/* global timing */
    tstart(&bf_g, &oh_g);

	/* Set loop names */
	/* '1234567890123456789012345678901234567890' */
	loop_name[0] = "   Vector Copy            D[i] = A[i]                 ";
	loop_name[1] = "   Vector Add             D[i] = A[i] + B[i]          ";
	loop_name[2] = "   Vector Multiply        D[i] = A[i] * B[i]          ";
	loop_name[3] = "   Vector Divide          D[i] = A[i] / B[i]          ";
	loop_name[4] = "   Vector Add-Multiply    D[i] = A[i] + B[i] * C[i]   ";
	loop_name[5] = "   Vector Add-Divide      D[i] = A[i] + B[i] / C[i]   ";
	loop_name[6] = "   Matrix-vector product  (5-diagonal sparse matrix)  ";

	/* some i/o */
	printf(" * --------------------------------------");
	printf("--------------------------------------\n");
	printf(" * Benchmarking your machine with...\n");
	printf(" * --------------------------------------");
	printf("--------------------------------------\n");
	for (j=0; j<7; j++) printf(" * %s\n",loop_name[j]);
	printf(" * --------------------------------------");
	printf("--------------------------------------\n");
	printf(" * \n");

	/* Zero time vector */
	for (i=0; i<nloops; i++) {
		rtime[i] = 0.0;
	}

	/*
	 * Initialize arrays -- the values are unimportant, just need to
	 * fake even the best optimizing compiler into thinking that we
	 * are doing something useful, and that each and every vector is
	 * unique and important.
	 */
	s = 1.0 / 3.0;
	for (i=0; i<n; i++) {
		s = s + 0.0001;
		a0[i] = s;
		a1[i] = s * 2;
		a2[i] = 2.0 / 3.0;
		a3[i] = a1[i] + a2[i];
		a4[i] = 1.0 / s;
		a5[i] = a2[i] * a1[i];
		a6[i] = a1[i] + a4[i];
		a7[i] = a1[i] * a4[i];
		a8[i] = a3[i] + a6[i];
		a9[i] = a3[i] * a6[i];
		b0[i] = s;
		b1[i] = s * 3;
		b2[i] = 2.0 / 5.0;
		b3[i] = b1[i] - b2[i];
		b4[i] = 1.5 / s;
		b5[i] = b2[i] * b1[i];
		b6[i] = b2[i] - b4[i];
		b7[i] = b2[i] * b4[i];
		b8[i] = b5[i] - b6[i];
		b9[i] = b5[i] * b6[i];
		x[i] = s;
		y[i] = 2.0 / 7.0;
		oN[i] = x[i] - y[i];
		oE[i] = 1.0 / s;
		oC[i] = oE[i] * oN[i];
		oNW[i] = oC[i] * oN[i];
		oNE[i] = oE[i] * oC[i];
	}

	/* Loop back here for repeat */
	icount = 0;
	for (icount=0; icount<irepeat; icount++) {

		/* Copy:  e(i) = a(i) */
		my_loop = 0;
		tstart(&bf, &oh);
		for (i=0; i<n; i++) a5[i] = a1[i];
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0;

		/* Add:  e(i) = a(i) + b(i) */
		my_loop = 1;
		tstart(&bf, &oh);
		for (i=0; i<n; i++) a5[i] = a1[i] + a2[i];
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0;

		/* Multiply:  e(i) = a(i) * b(i) */
		my_loop = 2;
		tstart(&bf, &oh);
		for (i=0; i<n; i++) a5[i] = a1[i] * a2[i];
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0;

		/* Divide:  e(i) = a(i) / b(i) */
		my_loop = 3;
		tstart(&bf, &oh);
		for (i=0; i<n; i++) a5[i] = a1[i] / a2[i];
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0;

		/* Add-Multiply:  e(i) = a(i) + b(i) * d(i) */
		my_loop = 4;
		tstart(&bf, &oh);
		for (i=0; i<n; i++) a5[i] = a1[i] + a2[i] * a3[i];
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0 / 2.0;

		/* Add-Divide:  e(i) = a(i) + b(i) / d(i) */
		my_loop = 5;
		tstart(&bf, &oh);
		for (i=0; i<n; i++) a5[i] = a1[i] + a2[i] / a3[i];
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0 / 2.0;

		/* Matvec: */
		my_loop = 6;
		tstart(&bf, &oh);
		J0 = 0;
		I0 = 0;
		I1 = nhalf+1;
		J1 = nhalf+1;
		for (j=J0+1; j<=J1-1; j++) {
			for (i=I0+1; i<=I1-1; i++) {
				ij   = II2(i,j);
				im1j = II2(i-1,j);
				ip1j = II2(i+1,j);
				ijm1 = II2(i,j-1);
				ijp1 = II2(i,j+1);

                y[ij] =
                    - oN[ij]   * x[ijp1]
                    - oN[ijm1] * x[ijm1]
                    - oE[ij]   * x[ip1j]
                    - oE[im1j] * x[im1j]
                    + oC[ij]   * x[ij];
            }
		}
		tstop(&bf, &oh, &ttime);
		rtime[my_loop] = rtime[my_loop] + ttime;
		rscale[my_loop] = 1.0 / 10.0;
	}

	/* the i/o */
	printf(" * ------------  -------- -------- -------- -------- -------- ");
		printf("-------- --------\n");
	printf(" * Architecture  Copy     Add      Multiply Divide   Add-Mult ");
		printf("Add-Divi SpMatvec\n");
	printf(" * ------------  -------- -------- -------- -------- -------- ");
		printf("-------- --------\n");
	printf(" * Your Machine  ");
	scale = 1.0 / (double)(n * irepeat);
	for (j=0; j<7; j++) {
		rtime[j] = scale * rtime[j];
		mflop = 1.0 / (rtime[j]*rscale[j]*1.0e6);
		printf("%8.2e ", mflop);
	}
	printf("\n");
	printf(" * --------------------------------------");
	printf("--------------------------------------\n");
 
	/* finish up */
	tstop(&bf_g, &oh_g, &ttime);
	printf("Total time = %g seconds.\n",ttime);
}

/* ****************************************************************************
 * purpose: starts the timer.
 * author:  michael holst
 * ************************************************************************* */
void tstart(before, overhd)
	double* before;
	double* overhd;
{
	/* compute overhead and mark timer */
	double garbge = tsecnd();
	double t0     = tsecnd();
	double t1     = tsecnd();
	*overhd = t1 - t0;
	*before = tsecnd();
}

/* ****************************************************************************
 * purpose: stops the timer.
 * author:  michael holst
 * ************************************************************************* */
void tstop (before, overhd, cputme)
	double* before;
	double* overhd;
	double* cputme;
{
	/* stop timer, compute elapsed time, add to time counter */
	double after  = tsecnd();
	*cputme = (after - *before) - *overhd;
}

/* ****************************************************************************
 * purpose: returns the time in seconds used by the process.
 *          the system call is "getrusage", which returns a struct
 *          containing various pieces of resource usage information.
 * machine: most UNIX machines.
 * author:  michael holst
 * ************************************************************************* */
#ifdef _AIX
extern int getrusage(int, struct rusage*);
#endif

double tsecnd()
{
#ifdef _SOLARIS_ /* Solaris doesn't properly support getrusage. */
	return 0.0;
#else
        static struct rusage temporary;
        long foo, foo1;

        getrusage(RUSAGE_SELF,&temporary)    ; /* get clock */
        foo     = temporary.ru_utime.tv_sec  ; /* seconds */
        foo1    = temporary.ru_utime.tv_usec ; /* uSecs */
        return ((double)foo + (double)foo1*0.000001) ; /* milliseconds */
#endif
}