From 95f9efb7d92a7ed741f2ce20d4c949fe89363a8b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:40:46 +0100 Subject: [PATCH] Remove old version --- Makefile | 27 --- common.cpp | 115 ---------- common.h | 112 ---------- cuda-stream.cu | 397 ---------------------------------- ocl-stream-kernels.cl | 70 ------ ocl-stream.cpp | 488 ------------------------------------------ 6 files changed, 1209 deletions(-) delete mode 100644 Makefile delete mode 100644 common.cpp delete mode 100644 common.h delete mode 100644 cuda-stream.cu delete mode 100644 ocl-stream-kernels.cl delete mode 100644 ocl-stream.cpp diff --git a/Makefile b/Makefile deleted file mode 100644 index 4fb5f7a4..00000000 --- a/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -LDLIBS = -l OpenCL -CXXFLAGS = -std=c++11 -O3 - -PLATFORM = $(shell uname -s) -ifeq ($(PLATFORM), Darwin) - LDLIBS = -framework OpenCL -endif - -all: gpu-stream-ocl gpu-stream-cuda - -gpu-stream-ocl: ocl-stream.cpp common.o Makefile - $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) - -common.o: common.cpp common.h Makefile - -gpu-stream-cuda: cuda-stream.cu common.o Makefile -ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) - nvcc $(CXXFLAGS) common.o $< -o $@ -else - $(error "Cannot find nvcc, please install CUDA toolkit") -endif - -.PHONY: clean - -clean: - rm -f gpu-stream-ocl gpu-stream-cuda *.o - diff --git a/common.cpp b/common.cpp deleted file mode 100644 index 781d70e8..00000000 --- a/common.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - -#include "common.h" - -// Default array size 50 * 2^20 (50*8 Mebibytes double precision) -// Use binary powers of two so divides 1024 -unsigned int ARRAY_SIZE = 52428800; - -unsigned int NTIMES = 10; - -bool useFloat = false; - -unsigned int deviceIndex = 0; - -int parseUInt(const char *str, unsigned int *output) -{ - char *next; - *output = strtoul(str, &next, 10); - return !strlen(next); -} - -void parseArguments(int argc, char *argv[]) -{ - for (int i = 1; i < argc; i++) - { - if (!strcmp(argv[i], "--list")) - { - listDevices(); - exit(0); - } - else if (!strcmp(argv[i], "--device")) - { - if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) - { - std::cout << "Invalid device index" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s")) - { - if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE)) - { - std::cout << "Invalid array size" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n")) - { - if (++i >= argc || !parseUInt(argv[i], &NTIMES)) - { - std::cout << "Invalid number of times" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--float")) - { - useFloat = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; - } - else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) - { - std::cout << std::endl; - std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl; - std::cout << "Options:" << std::endl; - std::cout << " -h --help Print the message" << std::endl; - std::cout << " --list List available devices" << std::endl; - std::cout << " --device INDEX Select device at INDEX" << std::endl; - std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; - std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; - std::cout << " --float Use floats (rather than doubles)" << std::endl; - std::cout << std::endl; - exit(0); - } - else - { - std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" - << std::endl; - exit(1); - } - } -} diff --git a/common.h b/common.h deleted file mode 100644 index a4dd8864..00000000 --- a/common.h +++ /dev/null @@ -1,112 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - -#include -#include -#include -#include -#include -#include - -#define VERSION_STRING "1.0" - -extern void parseArguments(int argc, char *argv[]); - -extern void listDevices(void); - -extern unsigned int ARRAY_SIZE; -extern unsigned int NTIMES; - -extern bool useFloat; - -extern unsigned int deviceIndex; - - -template < typename T > -void check_solution(void* a_in, void* b_in, void* c_in) -{ - // Generate correct solution - T golda = 1.0; - T goldb = 2.0; - T goldc = 0.0; - - T * a = static_cast(a_in); - T * b = static_cast(b_in); - T * c = static_cast(c_in); - - const T scalar = 3.0; - - for (unsigned int i = 0; i < NTIMES; i++) - { - // Double - goldc = golda; - goldb = scalar * goldc; - goldc = golda + goldb; - golda = goldb + scalar * goldc; - } - - // Calculate average error - double erra = 0.0; - double errb = 0.0; - double errc = 0.0; - - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - erra += fabs(a[i] - golda); - errb += fabs(b[i] - goldb); - errc += fabs(c[i] - goldc); - } - - erra /= ARRAY_SIZE; - errb /= ARRAY_SIZE; - errc /= ARRAY_SIZE; - - double epsi = std::numeric_limits::epsilon() * 100; - - if (erra > epsi) - std::cout - << "Validation failed on a[]. Average error " << erra - << std::endl; - if (errb > epsi) - std::cout - << "Validation failed on b[]. Average error " << errb - << std::endl; - if (errc > epsi) - std::cout - << "Validation failed on c[]. Average error " << errc - << std::endl; -} - diff --git a/cuda-stream.cu b/cuda-stream.cu deleted file mode 100644 index 2ab3adbf..00000000 --- a/cuda-stream.cu +++ /dev/null @@ -1,397 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#include -#include "common.h" - -std::string getDeviceName(int device); -int getDriver(void); - -// Code to check CUDA errors -void check_cuda_error(void) -{ - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - std::cerr - << "Error: " - << cudaGetErrorString(err) - << std::endl; - exit(err); - } -} - -template -__global__ void copy(const T * a, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i]; -} - -template -__global__ void mul(T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - b[i] = scalar * c[i]; -} - -template -__global__ void add(const T * a, const T * b, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i] + b[i]; -} - -template -__global__ void triad(T * a, const T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] = b[i] + scalar * c[i]; -} - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: CUDA" << std::endl; - - parseArguments(argc, argv); - - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Check device index is in range - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - if (deviceIndex >= count) - throw std::runtime_error("Chosen device index is invalid"); - cudaSetDevice(deviceIndex); - check_cuda_error(); - - // Print out device name - std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; - - // Print out device CUDA driver version - std::cout << "Driver: " << getDriver() << std::endl; - - // Check buffers fit on the device - cudaDeviceProp props; - cudaGetDeviceProperties(&props, deviceIndex); - if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - void * d_a, * d_b, *d_c; - cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - - // Copy host memory to device - cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - - // Make sure the copies are finished - cudaDeviceSynchronize(); - check_cuda_error(); - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - copy<<>>((float*)d_a, (float*)d_c); - else - copy<<>>((double*)d_a, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - mul<<>>((float*)d_b, (float*)d_c); - else - mul<<>>((double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - add<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - add<<>>((double*)d_a, (double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - // Free cuda buffers - cudaFree(d_a); - check_cuda_error(); - cudaFree(d_b); - check_cuda_error(); - cudaFree(d_c); - check_cuda_error(); - -} - -std::string getDeviceName(int device) -{ - struct cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device); - check_cuda_error(); - return std::string(prop.name); -} - -int getDriver(void) -{ - int driver; - cudaDriverGetVersion(&driver); - check_cuda_error(); - return driver; -} - -void listDevices(void) -{ - // Get number of devices - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - - // Print device names - if (count == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < count; i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - check_cuda_error(); - } - std::cout << std::endl; - } -} - diff --git a/ocl-stream-kernels.cl b/ocl-stream-kernels.cl deleted file mode 100644 index e5af7ce5..00000000 --- a/ocl-stream-kernels.cl +++ /dev/null @@ -1,70 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#ifdef FLOAT - #define DATATYPE float - constant DATATYPE scalar = 3.0f; -#else - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #define DATATYPE double - constant DATATYPE scalar = 3.0; -#endif - - -kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i]; -} - -kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - b[i] = scalar * c[i]; -} - -kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i] + b[i]; -} - -kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - a[i] = b[i] + scalar * c[i]; -} diff --git a/ocl-stream.cpp b/ocl-stream.cpp deleted file mode 100644 index 1a462959..00000000 --- a/ocl-stream.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#define CL_HPP_ENABLE_EXCEPTIONS -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 -#define CL_HPP_TARGET_OPENCL_VERSION 110 -#include "CL/cl2.hpp" -#include "common.h" - -std::string getDeviceName(const cl::Device& device); -std::string getDeviceDriver(const cl::Device& device); -unsigned getDeviceList(std::vector& devices); - - -// Print error and exit -void die(std::string msg, cl::Error& e) -{ - std::cerr - << "Error: " - << msg - << ": " << e.what() - << "(" << e.err() << ")" - << std::endl; - exit(e.err()); -} - - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: OpenCL" << std::endl; - - std::string status; - - try - { - parseArguments(argc, argv); - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Open the Kernel source - std::string kernels; - { - std::ifstream in("ocl-stream-kernels.cl"); - if (!in.is_open()) - throw std::runtime_error("Cannot open kernel file"); - kernels = std::string (std::istreambuf_iterator(in), (std::istreambuf_iterator())); - } - - - // Setup OpenCL - - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Check device index is in range - if (deviceIndex >= devices.size()) - throw std::runtime_error("Chosen device index is invalid"); - - cl::Device device = devices[deviceIndex]; - - status = "Creating context"; - cl::Context context(device); - - status = "Creating queue"; - cl::CommandQueue queue(context); - - status = "Creating program"; - cl::Program program(context, kernels); - - // Print out device name - std::string name = getDeviceName(device); - std::cout << "Using OpenCL device " << name << std::endl; - - // Print out OpenCL driver version for this device - std::string driver = getDeviceDriver(device); - std::cout << "Driver: " << driver << std::endl; - - // Check device can do double precision if requested - if (!useFloat && !device.getInfo()) - throw std::runtime_error("Device does not support double precision, please use --float"); - - // Check buffers fit on the device - status = "Getting device memory sizes"; - cl_ulong totalmem = device.getInfo(); - cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - try - { - std::string options = ""; - if (useFloat) - options = "-DFLOAT"; - program.build(options.c_str()); - } - catch (cl::Error& e) - { - std::vector devices = context.getInfo(); - std::string buildlog = program.getBuildInfo(devices[0]); - std::cerr - << "Build error:" - << buildlog - << std::endl; - throw e; - } - - status = "Making kernel copy"; - auto copy = cl::KernelFunctor(program, "copy"); - status = "Making kernel mul"; - auto mul = cl::KernelFunctor(program, "mul"); - status = "Making kernel add"; - auto add = cl::KernelFunctor(program, "add"); - status = "Making kernel triad"; - auto triad = cl::KernelFunctor(program, "triad"); - - // Create host vectors - void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - status = "Creating buffers"; - cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - - - // Copy host memory to device - status = "Copying buffers"; - queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - - // Make sure the copies are finished - queue.finish(); - - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - status = "Executing copy"; - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - copy( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing mul"; - t1 = std::chrono::high_resolution_clock::now(); - mul( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing add"; - t1 = std::chrono::high_resolution_clock::now(); - add( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing triad"; - t1 = std::chrono::high_resolution_clock::now(); - triad( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - status = "Copying back buffers"; - queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - queue.finish(); - - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - } - catch (cl::Error &e) - { - die(status, e); - } - catch (std::exception& e) - { - std::cerr - << "Error: " - << e.what() - << std::endl; - exit(EXIT_FAILURE); - } - -} - - -unsigned getDeviceList(std::vector& devices) -{ - // Get list of platforms - std::vector platforms; - try - { - cl::Platform::get(&platforms); - } - catch (cl::Error &e) - { - die("Getting platforms", e); - } - - // Enumerate devices - for (unsigned int i = 0; i < platforms.size(); i++) - { - std::vector plat_devices; - try - { - platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); - } - catch (cl::Error &e) - { - die("Getting devices", e); - } - devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); - } - - return devices.size(); -} - - -std::string getDeviceName(const cl::Device& device) -{ - std::string name; - cl_device_info info = CL_DEVICE_NAME; - - try - { - - // Special case for AMD -#ifdef CL_DEVICE_BOARD_NAME_AMD - device.getInfo(CL_DEVICE_VENDOR, &name); - if (strstr(name.c_str(), "Advanced Micro Devices")) - info = CL_DEVICE_BOARD_NAME_AMD; -#endif - - device.getInfo(info, &name); - } - catch (cl::Error &e) - { - die("Getting device name", e); - } - - return name; -} - -std::string getDeviceDriver(const cl::Device& device) -{ - std::string driver; - try - { - device.getInfo(CL_DRIVER_VERSION, &driver); - } - catch (cl::Error &e) - { - die("Getting device driver", e); - } - - return driver; -} - - -void listDevices(void) -{ - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Print device names - if (devices.size() == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (unsigned i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(devices[i]) << std::endl; - } - std::cout << std::endl; - } -} -