Ported stream benchmark.
This commit is contained in:
parent
022d9065d3
commit
ce05c7b079
|
@ -0,0 +1,3 @@
|
|||
NAME = stream
|
||||
SRCS = $(shell find -L ./src/ -name "*.c" -o -name "*.cpp")
|
||||
include $(AM_HOME)/Makefile.app
|
|
@ -0,0 +1,110 @@
|
|||
===============================================
|
||||
|
||||
STREAM is the de facto industry standard benchmark
|
||||
for measuring sustained memory bandwidth.
|
||||
|
||||
Documentation for STREAM is on the web at:
|
||||
http://www.cs.virginia.edu/stream/ref.html
|
||||
|
||||
===============================================
|
||||
NEWS
|
||||
===============================================
|
||||
UPDATE: October 28 2014:
|
||||
|
||||
"stream_mpi.c" released in the Versions directory.
|
||||
|
||||
Based on Version 5.10 of stream.c, stream_mpi.c
|
||||
brings the following new features:
|
||||
* MPI implementation that *distributes* the arrays
|
||||
across all MPI ranks. (The older Fortran version
|
||||
of STREAM in MPI *replicates* the arrays across
|
||||
all MPI ranks.)
|
||||
* Data is allocated using "posix_memalign"
|
||||
rather than using static arrays. Different
|
||||
compiler flags may be needed for both portability
|
||||
and optimization.
|
||||
See the READ.ME file in the Versions directory
|
||||
for more details.
|
||||
* Error checking and timing done by all ranks and
|
||||
gathered by rank 0 for processing and output.
|
||||
* Timing code uses barriers to ensure correct
|
||||
operation even when multiple MPI ranks run on
|
||||
shared memory systems.
|
||||
|
||||
NOTE: MPI is not a preferred implementation for
|
||||
STREAM, which is intended to measure memory
|
||||
bandwidth in shared-memory systems. In stream_mpi,
|
||||
the MPI calls are only used to properly synchronize
|
||||
the timers (using MPI_Barrier) and to gather
|
||||
timing and error data, so the performance should
|
||||
scale linearly with the size of the cluster.
|
||||
But it may be useful, and was an interesting
|
||||
exercise to develop and debug.
|
||||
|
||||
===============================================
|
||||
UPDATE: January 17 2013:
|
||||
|
||||
Version 5.10 of stream.c is finally available!
|
||||
|
||||
There are no changes to what is being measured, but
|
||||
a number of long-awaited improvements have been made:
|
||||
|
||||
* Updated validation code does not suffer from
|
||||
accumulated roundoff error for large arrays.
|
||||
* Defining the preprocessor variable "VERBOSE"
|
||||
when compiling will (1) cause the code to print the
|
||||
measured average relative absolute error (rather than
|
||||
simply printing "Solution Validates"), and (2) print
|
||||
the first 10 array entries with relative error exceeding
|
||||
the error tolerance.
|
||||
* Array index variables have been upgraded from
|
||||
"int" to "ssize_t" to allow arrays with more
|
||||
than 2 billion elements on 64-bit systems.
|
||||
* Substantial improvements to the comments in
|
||||
the source on how to configure/compile/run the
|
||||
benchmark.
|
||||
* The preprocessor variable controlling the array
|
||||
size has been changed from "N" to "STREAM_ARRAY_SIZE".
|
||||
* A new preprocessor variable "STREAM_TYPE" can be
|
||||
used to override the data type from the default
|
||||
"double" to "float".
|
||||
This mechanism could also be used to change to
|
||||
non-floating-point types, but several "printf"
|
||||
statements would need to have their formats changed
|
||||
to accommodate the modified data type.
|
||||
* Some small changes in output, including printing
|
||||
array sizes in GiB as well as MiB.
|
||||
* Change to the default output format to print fewer
|
||||
decimals for the bandwidth and more decimals for
|
||||
the min/max/avg execution times.
|
||||
|
||||
|
||||
===============================================
|
||||
UPDATE: February 19 2009:
|
||||
|
||||
The most recent "official" versions have been renamed
|
||||
"stream.f" and "stream.c" -- all other versions have
|
||||
been moved to the "Versions" subdirectory and should be
|
||||
considered obsolete.
|
||||
|
||||
The "official" timer (was "second_wall.c") has been
|
||||
renamed "mysecond.c". This is embedded in the C version
|
||||
("stream.c"), but still needs to be externally linked to
|
||||
the FORTRAN version ("stream.f"). The new version defines
|
||||
entry points both with and without trailing underscores,
|
||||
so it *should* link automagically with any Fortran compiler.
|
||||
|
||||
===============================================
|
||||
|
||||
STREAM is a project of "Dr. Bandwidth":
|
||||
John D. McCalpin, Ph.D.
|
||||
john@mccalpin.com
|
||||
|
||||
===============================================
|
||||
|
||||
The STREAM web and ftp sites are currently hosted at
|
||||
the Department of Computer Science at the University of
|
||||
Virginia under the generous sponsorship of Professor Bill
|
||||
Wulf and Professor Alan Batson.
|
||||
|
||||
===============================================
|
|
@ -0,0 +1,113 @@
|
|||
#ifndef __BENCHMARK_H__
|
||||
#define __BENCHMARK_H__
|
||||
|
||||
#include <am.h>
|
||||
#include <klib.h>
|
||||
#include <klib-macros.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MB * 1024 * 1024
|
||||
#define KB * 1024
|
||||
|
||||
#define REF_CPU "i7-7700K @ 4.20GHz"
|
||||
#define REF_SCORE 100000
|
||||
|
||||
#define REPEAT 1
|
||||
|
||||
// size | heap | time | checksum
|
||||
#define QSORT_S { 100, 1 KB, 0, 0x08467105}
|
||||
#define QSORT_M { 30000, 128 KB, 0, 0xa3e99fe4}
|
||||
#define QSORT_L { 100000, 640 KB, 5114, 0xed8cff89}
|
||||
#define QUEEN_S { 8, 0 KB, 0, 0x0000005c}
|
||||
#define QUEEN_M { 11, 0 KB, 0, 0x00000a78}
|
||||
#define QUEEN_L { 12, 0 KB, 4707, 0x00003778}
|
||||
#define BF_S { 2, 32 KB, 0, 0xa6f0079e}
|
||||
#define BF_M { 25, 32 KB, 0, 0xa88f8a65}
|
||||
#define BF_L { 180, 32 KB, 23673, 0x9221e2b3}
|
||||
#define FIB_S { 2, 1 KB, 0, 0x7cfeddf0}
|
||||
#define FIB_M { 23, 16 KB, 0, 0x94ad8800}
|
||||
#define FIB_L { 91, 256 KB, 28318, 0xebdc5f80}
|
||||
#define SIEVE_S { 100, 1 KB, 0, 0x00000019}
|
||||
#define SIEVE_M { 200000, 32 KB, 0, 0x00004640}
|
||||
#define SIEVE_L {10000000, 2 MB, 39361, 0x000a2403}
|
||||
#define PZ15_S { 0, 1 KB, 0, 0x00000006}
|
||||
#define PZ15_M { 1, 256 KB, 0, 0x0000b0df}
|
||||
#define PZ15_L { 2, 2 MB, 4486, 0x00068b8c}
|
||||
#define DINIC_S { 10, 8 KB, 0, 0x0000019c}
|
||||
#define DINIC_M { 80, 512 KB, 0, 0x00004f99}
|
||||
#define DINIC_L { 128, 1 MB, 10882, 0x0000c248}
|
||||
#define LZIP_S { 128, 128 KB, 0, 0xe05fc832}
|
||||
#define LZIP_M { 50000, 1 MB, 0, 0xdc93e90c}
|
||||
#define LZIP_L { 1048576, 4 MB, 7593, 0x8d62c81f}
|
||||
#define SSORT_S { 100, 4 KB, 0, 0x4c555e09}
|
||||
#define SSORT_M { 10000, 512 KB, 0, 0x0db7909b}
|
||||
#define SSORT_L { 100000, 4 MB, 4504, 0x4f0ab431}
|
||||
#define MD5_S { 100, 1 KB, 0, 0xf902f28f}
|
||||
#define MD5_M { 200000, 256 KB, 0, 0xd4f9bc6d}
|
||||
#define MD5_L {10000000, 16 MB, 17239, 0x27286a42}
|
||||
|
||||
#define BENCHMARK_LIST(def) \
|
||||
def(qsort, "qsort", QSORT_S, QSORT_M, QSORT_L, "Quick sort") \
|
||||
def(queen, "queen", QUEEN_S, QUEEN_M, QUEEN_L, "Queen placement") \
|
||||
def( bf, "bf", BF_S, BF_M, BF_L, "Brainf**k interpreter") \
|
||||
def( fib, "fib", FIB_S, FIB_M, FIB_L, "Fibonacci number") \
|
||||
def(sieve, "sieve", SIEVE_S, SIEVE_M, SIEVE_L, "Eratosthenes sieve") \
|
||||
def( 15pz, "15pz", PZ15_S, PZ15_M, PZ15_L, "A* 15-puzzle search") \
|
||||
def(dinic, "dinic", DINIC_S, DINIC_M, DINIC_L, "Dinic's maxflow algorithm") \
|
||||
def( lzip, "lzip", LZIP_S, LZIP_M, LZIP_L, "Lzip compression") \
|
||||
def(ssort, "ssort", SSORT_S, SSORT_M, SSORT_L, "Suffix sort") \
|
||||
def( md5, "md5", MD5_S, MD5_M, MD5_L, "MD5 digest") \
|
||||
|
||||
// Each benchmark will run REPEAT times
|
||||
|
||||
/*
 * Declares the three entry points of one benchmark:
 *   bench_<id>_prepare(), bench_<id>_run() and bench_<id>_validate().
 * Only the first argument is used; the remaining ones exist so the macro
 * can be handed to BENCHMARK_LIST, which passes six arguments per entry.
 */
#define DECL(id, short_name, small, medium, large, blurb) \
  void bench_##id##_prepare(); \
  void bench_##id##_run(); \
  int bench_##id##_validate();
|
||||
|
||||
BENCHMARK_LIST(DECL)
|
||||
|
||||
/*
 * One problem configuration of a benchmark.
 * Field order matches the columns of the *_S / *_M / *_L initializer
 * macros above (size | heap | time | checksum).
 */
typedef struct Setting {
  int size;            /* "size" column */
  unsigned long mlim;  /* "heap" column (presumably a memory limit in bytes -- confirm against the user of this field) */
  unsigned long ref;   /* "time" column */
  uint32_t checksum;   /* "checksum" column */
} Setting;
|
||||
|
||||
typedef struct Benchmark {
|
||||
void (*prepare)();
|
||||
void (*run)();
|
||||
int (*validate)();
|
||||
const char *name, *desc;
|
||||
Setting settings[3];
|
||||
} Benchmark;
|
||||
|
||||
extern Benchmark *current;
|
||||
extern Setting *setting;
|
||||
|
||||
/*
 * Outcome record handed to prepare() and done().
 */
typedef struct Result {
  int pass;            /* pass/fail flag */
  unsigned long tsc;
  unsigned long msec;  /* presumably elapsed milliseconds -- confirm against done() */
} Result;
|
||||
|
||||
void prepare(Result *res);
|
||||
void done(Result *res);
|
||||
|
||||
// memory allocation
|
||||
void* bench_alloc(size_t size);
|
||||
void bench_free(void *ptr);
|
||||
|
||||
// random number generator
|
||||
void bench_srand(uint32_t seed);
|
||||
uint32_t bench_rand(); // return a random number between 0..32767
|
||||
|
||||
// checksum
|
||||
uint32_t checksum(void *start, void *end);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -0,0 +1,552 @@
|
|||
/*-----------------------------------------------------------------------*/
|
||||
/* Program: STREAM */
|
||||
/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
|
||||
/* Original code developed by John D. McCalpin */
|
||||
/* Programmers: John D. McCalpin */
|
||||
/* Joe R. Zagar */
|
||||
/* */
|
||||
/* This program measures memory transfer rates in MB/s for simple */
|
||||
/* computational kernels coded in C. */
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* Copyright 1991-2013: John D. McCalpin */
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* License: */
|
||||
/* 1. You are free to use this program and/or to redistribute */
|
||||
/* this program. */
|
||||
/* 2. You are free to modify this program for your own use, */
|
||||
/* including commercial use, subject to the publication */
|
||||
/* restrictions in item 3. */
|
||||
/* 3. You are free to publish results obtained from running this */
|
||||
/* program, or from works that you derive from this program, */
|
||||
/* with the following limitations: */
|
||||
/* 3a. In order to be referred to as "STREAM benchmark results", */
|
||||
/* published results must be in conformance to the STREAM */
|
||||
/* Run Rules, (briefly reviewed below) published at */
|
||||
/* http://www.cs.virginia.edu/stream/ref.html */
|
||||
/* and incorporated herein by reference. */
|
||||
/* As the copyright holder, John McCalpin retains the */
|
||||
/* right to determine conformity with the Run Rules. */
|
||||
/* 3b. Results based on modified source code or on runs not in */
|
||||
/* accordance with the STREAM Run Rules must be clearly */
|
||||
/* labelled whenever they are published. Examples of */
|
||||
/* proper labelling include: */
|
||||
/* "tuned STREAM benchmark results" */
|
||||
/* "based on a variant of the STREAM benchmark code" */
|
||||
/* Other comparable, clear, and reasonable labelling is */
|
||||
/* acceptable. */
|
||||
/* 3c. Submission of results to the STREAM benchmark web site */
|
||||
/* is encouraged, but not required. */
|
||||
/* 4. Use of this program or creation of derived works based on this */
|
||||
/* program constitutes acceptance of these licensing restrictions. */
|
||||
/* 5. Absolutely no warranty is expressed or implied. */
|
||||
/*-----------------------------------------------------------------------*/
|
||||
# include <klib.h>
|
||||
|
||||
/*-----------------------------------------------------------------------
|
||||
* INSTRUCTIONS:
|
||||
*
|
||||
* 1) STREAM requires different amounts of memory to run on different
|
||||
* systems, depending on both the system cache size(s) and the
|
||||
* granularity of the system timer.
|
||||
* You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
|
||||
* to meet *both* of the following criteria:
|
||||
* (a) Each array must be at least 4 times the size of the
|
||||
* available cache memory. I don't worry about the difference
|
||||
* between 10^6 and 2^20, so in practice the minimum array size
|
||||
* is about 3.8 times the cache size.
|
||||
* Example 1: One Xeon E3 with 8 MB L3 cache
|
||||
* STREAM_ARRAY_SIZE should be >= 4 million, giving
|
||||
* an array size of 30.5 MB and a total memory requirement
|
||||
* of 91.5 MB.
|
||||
* Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
|
||||
* STREAM_ARRAY_SIZE should be >= 20 million, giving
|
||||
* an array size of 153 MB and a total memory requirement
|
||||
* of 458 MB.
|
||||
* (b) The size should be large enough so that the 'timing calibration'
|
||||
* output by the program is at least 20 clock-ticks.
|
||||
* Example: most versions of Windows have a 10 millisecond timer
|
||||
* granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds.
|
||||
* If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
|
||||
* This means that each array must be at least 1 GB, or 128M elements.
|
||||
*
|
||||
* Version 5.10 increases the default array size from 2 million
|
||||
* elements to 10 million elements in response to the increasing
|
||||
* size of L3 caches. The new default size is large enough for caches
|
||||
* up to 20 MB.
|
||||
* Version 5.10 changes the loop index variables from "register int"
|
||||
* to "ssize_t", which allows array indices >2^32 (4 billion)
|
||||
* on properly configured 64-bit systems. Additional compiler options
|
||||
* (such as "-mcmodel=medium") may be required for large memory runs.
|
||||
*
|
||||
* Array size can be set at compile time without modifying the source
|
||||
* code for the (many) compilers that support preprocessor definitions
|
||||
* on the compile line. E.g.,
|
||||
* gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
|
||||
* will override the default size of 10M with a new size of 100M elements
|
||||
* per array.
|
||||
*/
|
||||
#ifndef STREAM_ARRAY_SIZE
|
||||
# define STREAM_ARRAY_SIZE 2097152
|
||||
#endif
|
||||
|
||||
/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result
|
||||
* for any iteration after the first, therefore the minimum value
|
||||
* for NTIMES is 2.
|
||||
* There are no rules on maximum allowable values for NTIMES, but
|
||||
* values larger than the default are unlikely to noticeably
|
||||
* increase the reported performance.
|
||||
* NTIMES can also be set on the compile line without changing the source
|
||||
* code using, for example, "-DNTIMES=7".
|
||||
*/
|
||||
/*
 * NTIMES must be at least 2 because the first iteration is discarded.
 * A too-small value supplied on the compile line is dropped first (#undef),
 * so the default below never redefines an existing macro with a different
 * value.
 */
#if defined(NTIMES) && NTIMES<=1
#   undef NTIMES
#endif
#ifndef NTIMES
#   define NTIMES	10
#endif
|
||||
|
||||
/* Users are allowed to modify the "OFFSET" variable, which *may* change the
|
||||
* relative alignment of the arrays (though compilers may change the
|
||||
* effective offset by making the arrays non-contiguous on some systems).
|
||||
* Use of non-zero values for OFFSET can be especially helpful if the
|
||||
* STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
|
||||
* OFFSET can also be set on the compile line without changing the source
|
||||
* code using, for example, "-DOFFSET=56".
|
||||
*/
|
||||
#ifndef OFFSET
|
||||
# define OFFSET 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* 3) Compile the code with optimization. Many compilers generate
|
||||
* unreasonably bad code before the optimizer tightens things up.
|
||||
* If the results are unreasonably good, on the other hand, the
|
||||
* optimizer might be too smart for me!
|
||||
*
|
||||
* For a simple single-core version, try compiling with:
|
||||
* cc -O stream.c -o stream
|
||||
* This is known to work on many, many systems....
|
||||
*
|
||||
* To use multiple cores, you need to tell the compiler to obey the OpenMP
|
||||
* directives in the code. This varies by compiler, but a common example is
|
||||
* gcc -O -fopenmp stream.c -o stream_omp
|
||||
* The environment variable OMP_NUM_THREADS allows runtime control of the
|
||||
* number of threads/cores used when the resulting "stream_omp" program
|
||||
* is executed.
|
||||
*
|
||||
* To run with single-precision variables and arithmetic, simply add
|
||||
* -DSTREAM_TYPE=float
|
||||
* to the compile line.
|
||||
* Note that this changes the minimum array sizes required --- see (1) above.
|
||||
*
|
||||
* The preprocessor directive "TUNED" does not do much -- it simply causes the
|
||||
* code to call separate functions to execute each kernel. Trivial versions
|
||||
* of these functions are provided, but they are *not* tuned -- they just
|
||||
* provide predefined interfaces to be replaced with tuned code.
|
||||
*
|
||||
*
|
||||
* 4) Optional: Mail the results to mccalpin@cs.virginia.edu
|
||||
* Be sure to include info that will help me understand:
|
||||
* a) the computer hardware configuration (e.g., processor model, memory type)
|
||||
* b) the compiler name/version and compilation flags
|
||||
* c) any run-time information (such as OMP_NUM_THREADS)
|
||||
* d) all of the output from the test case.
|
||||
*
|
||||
* Thanks!
|
||||
*
|
||||
*-----------------------------------------------------------------------*/
|
||||
|
||||
# define HLINE "-------------------------------------------------------------\n"
|
||||
|
||||
# ifndef MIN
|
||||
# define MIN(x,y) ((x)<(y)?(x):(y))
|
||||
# endif
|
||||
# ifndef MAX
|
||||
# define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
# endif
|
||||
|
||||
#ifndef STREAM_TYPE
|
||||
#define STREAM_TYPE double
|
||||
#endif
|
||||
|
||||
static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET],
|
||||
b[STREAM_ARRAY_SIZE+OFFSET],
|
||||
c[STREAM_ARRAY_SIZE+OFFSET];
|
||||
|
||||
#define FLT_MAX 1E+37
|
||||
static double avgtime[4] = {0}, maxtime[4] = {0},
|
||||
mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
|
||||
|
||||
static char *label[4] = {"Copy: ", "Scale: ",
|
||||
"Add: ", "Triad: "};
|
||||
|
||||
static double bytes[4] = {
|
||||
2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
|
||||
2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
|
||||
3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
|
||||
3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
|
||||
};
|
||||
|
||||
extern double mysecond();
|
||||
extern void checkSTREAMresults();
|
||||
#ifdef TUNED
|
||||
extern void tuned_STREAM_Copy();
|
||||
extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
|
||||
extern void tuned_STREAM_Add();
|
||||
extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
|
||||
#endif
|
||||
#ifdef _OPENMP
|
||||
extern int omp_get_num_threads();
|
||||
#endif
|
||||
int
|
||||
main()
|
||||
{
|
||||
int quantum, checktick();
|
||||
int BytesPerWord;
|
||||
int k;
|
||||
long j;
|
||||
STREAM_TYPE scalar;
|
||||
double t, times[4][NTIMES];
|
||||
|
||||
/* --- SETUP --- determine precision and check timing --- */
|
||||
|
||||
printf(HLINE);
|
||||
printf("STREAM version $Revision: 5.10 $\n");
|
||||
printf(HLINE);
|
||||
BytesPerWord = sizeof(STREAM_TYPE);
|
||||
printf("This system uses %d bytes per array element.\n",
|
||||
BytesPerWord);
|
||||
|
||||
printf(HLINE);
|
||||
#ifdef N
|
||||
printf("***** WARNING: ******\n");
|
||||
printf(" It appears that you set the preprocessor variable N when compiling this code.\n");
|
||||
printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
|
||||
printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
|
||||
printf("***** WARNING: ******\n");
|
||||
#endif
|
||||
|
||||
printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
|
||||
printf("Memory per array = %.1f MiB (= %.1f GiB).\n",
|
||||
BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
|
||||
BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
|
||||
printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
|
||||
(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
|
||||
(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
|
||||
printf("Each kernel will be executed %d times.\n", NTIMES);
|
||||
printf(" The *best* time for each kernel (excluding the first iteration)\n");
|
||||
printf(" will be used to compute the reported bandwidth.\n");
|
||||
|
||||
/* Get initial value for system clock. */
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||
a[j] = 1.0;
|
||||
b[j] = 2.0;
|
||||
c[j] = 0.0;
|
||||
}
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
if ( (quantum = checktick()) >= 1)
|
||||
printf("Your clock granularity/precision appears to be "
|
||||
"%d microseconds.\n", quantum);
|
||||
else {
|
||||
printf("Your clock granularity appears to be "
|
||||
"less than one microsecond.\n");
|
||||
quantum = 1;
|
||||
}
|
||||
|
||||
t = mysecond();
|
||||
for (j = 0; j < STREAM_ARRAY_SIZE; j++)
|
||||
a[j] = 2.0E0 * a[j];
|
||||
t = 1.0E6 * (mysecond() - t);
|
||||
|
||||
printf("Each test below will take on the order"
|
||||
" of %d microseconds.\n", (int) t );
|
||||
printf(" (= %d clock ticks)\n", (int) (t/quantum) );
|
||||
printf("Increase the size of the arrays if this shows that\n");
|
||||
printf("you are not getting at least 20 clock ticks per test.\n");
|
||||
|
||||
printf(HLINE);
|
||||
|
||||
printf("WARNING -- The above is only a rough guideline.\n");
|
||||
printf("For best results, please be sure you know the\n");
|
||||
printf("precision of your system timer.\n");
|
||||
printf(HLINE);
|
||||
|
||||
/* --- MAIN LOOP --- repeat test cases NTIMES times --- */
|
||||
|
||||
scalar = 3.0;
|
||||
for (k=0; k<NTIMES; k++)
|
||||
{
|
||||
times[0][k] = mysecond();
|
||||
#ifdef TUNED
|
||||
tuned_STREAM_Copy();
|
||||
#else
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||
c[j] = a[j];
|
||||
#endif
|
||||
times[0][k] = mysecond() - times[0][k];
|
||||
|
||||
times[1][k] = mysecond();
|
||||
#ifdef TUNED
|
||||
tuned_STREAM_Scale(scalar);
|
||||
#else
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||
b[j] = scalar*c[j];
|
||||
#endif
|
||||
times[1][k] = mysecond() - times[1][k];
|
||||
|
||||
times[2][k] = mysecond();
|
||||
#ifdef TUNED
|
||||
tuned_STREAM_Add();
|
||||
#else
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||
c[j] = a[j]+b[j];
|
||||
#endif
|
||||
times[2][k] = mysecond() - times[2][k];
|
||||
|
||||
times[3][k] = mysecond();
|
||||
#ifdef TUNED
|
||||
tuned_STREAM_Triad(scalar);
|
||||
#else
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||
a[j] = b[j]+scalar*c[j];
|
||||
#endif
|
||||
times[3][k] = mysecond() - times[3][k];
|
||||
}
|
||||
|
||||
/* --- SUMMARY --- */
|
||||
|
||||
for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
|
||||
{
|
||||
for (j=0; j<4; j++)
|
||||
{
|
||||
avgtime[j] = avgtime[j] + times[j][k];
|
||||
mintime[j] = MIN(mintime[j], times[j][k]);
|
||||
maxtime[j] = MAX(maxtime[j], times[j][k]);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Function Best Rate MB/s Avg time Min time Max time\n");
|
||||
for (j=0; j<4; j++) {
|
||||
avgtime[j] = avgtime[j]/(double)(NTIMES-1);
|
||||
|
||||
printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j],
|
||||
1.0E-06 * bytes[j]/mintime[j],
|
||||
avgtime[j],
|
||||
mintime[j],
|
||||
maxtime[j]);
|
||||
}
|
||||
printf(HLINE);
|
||||
|
||||
/* --- Check Results --- */
|
||||
checkSTREAMresults();
|
||||
printf(HLINE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
# define M 20
|
||||
|
||||
int
|
||||
checktick()
|
||||
{
|
||||
int i, minDelta, Delta;
|
||||
double t1, t2, timesfound[M];
|
||||
|
||||
/* Collect a sequence of M unique time values from the system. */
|
||||
|
||||
for (i = 0; i < M; i++) {
|
||||
t1 = mysecond();
|
||||
while( ((t2=mysecond()) - t1) < 1.0E-6 )
|
||||
;
|
||||
timesfound[i] = t1 = t2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the minimum difference between these M values.
|
||||
* This result will be our estimate (in microseconds) for the
|
||||
* clock granularity.
|
||||
*/
|
||||
|
||||
minDelta = 1000000;
|
||||
for (i = 1; i < M; i++) {
|
||||
Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
|
||||
minDelta = MIN(minDelta, MAX(Delta,0));
|
||||
}
|
||||
|
||||
return(minDelta);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Wall-clock timer used by the benchmark. The original implementation
   called gettimeofday(); this port returns the value of uptime() instead. */
|
||||
|
||||
/*
 * Timestamp source for all measurements in this file.
 *
 * Upstream STREAM called gettimeofday(&tp, &tzp) and returned
 * (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6; this port returns the
 * value of uptime() converted to double.
 *
 * NOTE(review): callers treat differences of this value as seconds (they
 * multiply by 1.0E6 to obtain microseconds) -- confirm the unit uptime()
 * reports and rescale here if it is not seconds.
 */
double mysecond()
{
    double now = (double)uptime();

    return now;
}
|
||||
|
||||
#ifndef abs
|
||||
#define abs(a) ((a) >= 0 ? (a) : -(a))
|
||||
#endif
|
||||
void checkSTREAMresults ()
|
||||
{
|
||||
STREAM_TYPE aj,bj,cj,scalar;
|
||||
STREAM_TYPE aSumErr,bSumErr,cSumErr;
|
||||
STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
|
||||
double epsilon;
|
||||
long j;
|
||||
int k,ierr,err;
|
||||
|
||||
/* reproduce initialization */
|
||||
aj = 1.0;
|
||||
bj = 2.0;
|
||||
cj = 0.0;
|
||||
/* a[] is modified during timing check */
|
||||
aj = 2.0E0 * aj;
|
||||
/* now execute timing loop */
|
||||
scalar = 3.0;
|
||||
for (k=0; k<NTIMES; k++)
|
||||
{
|
||||
cj = aj;
|
||||
bj = scalar*cj;
|
||||
cj = aj+bj;
|
||||
aj = bj+scalar*cj;
|
||||
}
|
||||
|
||||
/* accumulate deltas between observed and expected results */
|
||||
aSumErr = 0.0;
|
||||
bSumErr = 0.0;
|
||||
cSumErr = 0.0;
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||
aSumErr += abs(a[j] - aj);
|
||||
bSumErr += abs(b[j] - bj);
|
||||
cSumErr += abs(c[j] - cj);
|
||||
// if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // MCCALPIN
|
||||
}
|
||||
aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
|
||||
bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
|
||||
cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
|
||||
|
||||
if (sizeof(STREAM_TYPE) == 4) {
|
||||
epsilon = 1.e-6;
|
||||
}
|
||||
else if (sizeof(STREAM_TYPE) == 8) {
|
||||
epsilon = 1.e-13;
|
||||
}
|
||||
else {
|
||||
printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
|
||||
epsilon = 1.e-6;
|
||||
}
|
||||
|
||||
err = 0;
|
||||
if (abs(aAvgErr/aj) > epsilon) {
|
||||
err++;
|
||||
printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
|
||||
printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
|
||||
ierr = 0;
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||
if (abs(a[j]/aj-1.0) > epsilon) {
|
||||
ierr++;
|
||||
#ifdef VERBOSE
|
||||
if (ierr < 10) {
|
||||
printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
|
||||
j,aj,a[j],abs((aj-a[j])/aAvgErr));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
printf(" For array a[], %d errors were found.\n",ierr);
|
||||
}
|
||||
if (abs(bAvgErr/bj) > epsilon) {
|
||||
err++;
|
||||
printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
|
||||
printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
|
||||
printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
|
||||
ierr = 0;
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||
if (abs(b[j]/bj-1.0) > epsilon) {
|
||||
ierr++;
|
||||
#ifdef VERBOSE
|
||||
if (ierr < 10) {
|
||||
printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
|
||||
j,bj,b[j],abs((bj-b[j])/bAvgErr));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
printf(" For array b[], %d errors were found.\n",ierr);
|
||||
}
|
||||
if (abs(cAvgErr/cj) > epsilon) {
|
||||
err++;
|
||||
printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
|
||||
printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
|
||||
printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
|
||||
ierr = 0;
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++) {
|
||||
if (abs(c[j]/cj-1.0) > epsilon) {
|
||||
ierr++;
|
||||
#ifdef VERBOSE
|
||||
if (ierr < 10) {
|
||||
printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
|
||||
j,cj,c[j],abs((cj-c[j])/cAvgErr));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
printf(" For array c[], %d errors were found.\n",ierr);
|
||||
}
|
||||
if (err == 0) {
|
||||
printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
|
||||
}
|
||||
#ifdef VERBOSE
|
||||
printf ("Results Validation Verbose Results: \n");
|
||||
printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
|
||||
printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
|
||||
printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef TUNED
|
||||
/* stubs for "tuned" versions of the kernels */
|
||||
void tuned_STREAM_Copy()
|
||||
{
|
||||
ssize_t j;
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||
c[j] = a[j];
|
||||
}
|
||||
|
||||
/*
 * Placeholder for a tuned Scale kernel: b[i] = scalar * c[i] for the first
 * STREAM_ARRAY_SIZE elements.  Only compiled when TUNED is defined.
 * The index is a long, like the kernel loops in main(); ssize_t is a POSIX
 * type and this file includes no header that is known to declare it.
 */
void tuned_STREAM_Scale(STREAM_TYPE scalar)
{
    long i;

    for (i = 0; i < STREAM_ARRAY_SIZE; i++) {
        b[i] = scalar*c[i];
    }
}
|
||||
|
||||
void tuned_STREAM_Add()
|
||||
{
|
||||
ssize_t j;
|
||||
for (j=0; j<STREAM_ARRAY_SIZE; j++)
|
||||
c[j] = a[j]+b[j];
|
||||
}
|
||||
|
||||
/*
 * Placeholder for a tuned Triad kernel: a[i] = b[i] + scalar * c[i] for the
 * first STREAM_ARRAY_SIZE elements.  Only compiled when TUNED is defined.
 * The index is a long, like the kernel loops in main(); ssize_t is a POSIX
 * type and this file includes no header that is known to declare it.
 */
void tuned_STREAM_Triad(STREAM_TYPE scalar)
{
    long i;

    for (i = 0; i < STREAM_ARRAY_SIZE; i++) {
        a[i] = b[i]+scalar*c[i];
    }
}
|
||||
/* end of stubs for the "tuned" versions of the kernels */
|
||||
#endif
|
Loading…
Reference in New Issue