Skip to content

Commit

Permalink
Move host-side allocation to benchmarks and reuse device with UVM
Browse files Browse the repository at this point in the history
This commit puts benchmarks in control of allocating the host
memory used for verifying the results.

This enables benchmarks that use Unified Memory for the device
allocations, to avoid the host-side allocation and just pass
pointers to the device allocation to the benchmark driver.

Closes #128 .
  • Loading branch information
gonzalobg committed Jun 3, 2024
1 parent 321ba62 commit d5b776c
Show file tree
Hide file tree
Showing 43 changed files with 578 additions and 509 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
endif ()

option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)

# setup some defaults flags for everything
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
set(DEFAULT_RELEASE_FLAGS -O3)
if (BUILD_NATIVE)
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
endif()

macro(hint_flag FLAG DESCRIPTION)
if (NOT DEFINED ${FLAG})
Expand Down
13 changes: 4 additions & 9 deletions src/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,10 @@

#pragma once

#include <array>
#include <vector>
#include <string>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)
#include "benchmark.h"

template <class T>
class Stream
Expand All @@ -31,9 +27,8 @@ class Stream
virtual void nstream() = 0;
virtual T dot() = 0;

// Copy memory between host and device
virtual void init_arrays(T initA, T initB, T initC) = 0;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
// Set pointers to read from arrays
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
};

// Implementation specific device functions
Expand Down
34 changes: 17 additions & 17 deletions src/StreamModels.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,67 +35,67 @@
#include "FutharkStream.h"
#endif

template <typename T>
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
template <typename T, typename...Args>
std::unique_ptr<Stream<T>> make_stream(Args... args) {
#if defined(CUDA)
// Use the CUDA implementation
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
return std::make_unique<CUDAStream<T>>(args...);

#elif defined(HIP)
// Use the HIP implementation
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
return std::make_unique<HIPStream<T>>(args...);

#elif defined(HC)
// Use the HC implementation
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
return std::make_unique<HCStream<T>>(args...);

#elif defined(OCL)
// Use the OpenCL implementation
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
return std::make_unique<OCLStream<T>>(args...);

#elif defined(USE_RAJA)
// Use the RAJA implementation
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
return std::make_unique<RAJAStream<T>>(args...);

#elif defined(KOKKOS)
// Use the Kokkos implementation
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
return std::make_unique<KokkosStream<T>>(args...);

#elif defined(STD_DATA)
// Use the C++ STD data-oriented implementation
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
return std::make_unique<STDDataStream<T>>(args...);

#elif defined(STD_INDICES)
// Use the C++ STD index-oriented implementation
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDIndicesStream<T>>(args...);

#elif defined(STD_RANGES)
// Use the C++ STD ranges implementation
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDRangesStream<T>>(args...);

#elif defined(TBB)
// Use the C++20 implementation
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
return std::make_unique<TBBStream<T>>(args...);

#elif defined(THRUST)
// Use the Thrust implementation
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
return std::make_unique<ThrustStream<T>>(args...);

#elif defined(ACC)
// Use the OpenACC implementation
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
return std::make_unique<ACCStream<T>>(args...);

#elif defined(SYCL) || defined(SYCL2020)
// Use the SYCL implementation
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
return std::make_unique<SYCLStream<T>>(args...);

#elif defined(OMP)
// Use the OpenMP implementation
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
return std::make_unique<OMPStream<T>>(args...);

#elif defined(FUTHARK)
// Use the Futhark implementation
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
return std::make_unique<FutharkStream<T>>(args...);

#else

Expand Down
20 changes: 10 additions & 10 deletions src/acc/ACCStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
#include "ACCStream.h"

template <class T>
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC)
: array_size{array_size}
{
acc_device_t device_type = acc_get_device_type();
acc_set_device_num(device, device_type);
acc_set_device_num(device_id, device_type);

// Set up data region on device
this->a = new T[array_size];
Expand All @@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)

#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}

init_arrays(initA, initB, initC);
}

template <class T>
Expand Down Expand Up @@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
}

template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}

for (intptr_t i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
h_a = a;
h_b = b;
h_c = c;
}

template <class T>
Expand Down
33 changes: 13 additions & 20 deletions src/acc/ACCStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,25 @@
template <class T>
class ACCStream : public Stream<T>
{
struct A{
T *a;
T *b;
T *c;
};

protected:
// Size of arrays
intptr_t array_size;
A aa;
// Device side pointers
T *a;
T *b;
T *c;
T* restrict a;
T* restrict b;
T* restrict c;

public:
ACCStream(const intptr_t, int);
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC);
~ACCStream();

virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
void copy() override;
void add() override;
void mul() override;
void triad() override;
void nstream() override;
T dot() override;

virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c) override;
void init_arrays(T initA, T initB, T initC);
};
66 changes: 66 additions & 0 deletions src/benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#pragma once

#include <algorithm>
#include <array>
#include <initializer_list>
#include <iostream>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)

// Benchmark Identifier: identifies individual & groups of benchmarks:
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
// - All: all kernels.
// - Individual kernels only.
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};

struct Benchmark {
BenchId id;
char const* label;
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
size_t weight;
// Is it one of: Copy, Mul, Add, Triad, Dot?
bool classic = false;
};

// Benchmarks in the order in which - if present - should be run for validation purposes:
constexpr size_t num_benchmarks = 6;
constexpr std::array<Benchmark, num_benchmarks> bench = {
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
};

// Which buffers are needed by each benchmark
inline bool needs_buffer(BenchId id, char n) {
auto in = [n](std::initializer_list<char> values) {
return std::find(values.begin(), values.end(), n) != values.end();
};
switch(id) {
case BenchId::All: return in({'a','b','c'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::Copy: return in({'a','c'});
case BenchId::Mul: return in({'b','c'});
case BenchId::Add: return in({'a','b','c'});
case BenchId::Triad: return in({'a','b','c'});
case BenchId::Dot: return in({'a','b'});
case BenchId::Nstream: return in({'a','b','c'});
default:
std::cerr << "Unknown benchmark" << std::endl;
abort();
}
}

// Returns true if the benchmark needs to be run:
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
if (selection == BenchId::All) return true;
if (selection == BenchId::Classic && b.classic) return true;
return selection == b.id;
}
40 changes: 26 additions & 14 deletions src/cuda/CUDAStream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ void free_host(T* p) {
}

template <class T>
CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index,
T initA, T initB, T initC)
: array_size(array_size)
{
// Set device
Expand Down Expand Up @@ -131,14 +132,20 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;

// Check buffers fit on the device
if (dprop.totalGlobalMem < total_bytes)
if (dprop.totalGlobalMem < total_bytes) {
std::cerr << "Requested array size of " << total_bytes * 1e-9
<< " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl;
throw std::runtime_error("Device does not have enough memory for all buffers");
}

// Allocate buffers:
d_a = alloc_device<T>(array_size);
d_b = alloc_device<T>(array_size);
d_c = alloc_device<T>(array_size);
sums = alloc_host<T>(dot_num_blocks);

// Initialize buffers:
init_arrays(initA, initB, initC);
}

template <class T>
Expand Down Expand Up @@ -204,21 +211,26 @@ void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
}

template <class T>
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
{
// Copy device memory to host
#if defined(PAGEFAULT) || defined(MANAGED)
CU(cudaStreamSynchronize(stream));
for (intptr_t i = 0; i < array_size; ++i)
{
a[i] = d_a[i];
b[i] = d_b[i];
c[i] = d_c[i];
}
#if defined(PAGEFAULT) || defined(MANAGED)
// Unified memory: return pointers to device memory
a = d_a;
b = d_b;
c = d_c;
#else
CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost));
CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost));
CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost));
// No Unified memory: copy data to the host
size_t nbytes = array_size * sizeof(T);
h_a.resize(array_size);
h_b.resize(array_size);
h_c.resize(array_size);
a = h_a.data();
b = h_b.data();
c = h_c.data();
CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
#endif
}

Expand Down
Loading

0 comments on commit d5b776c

Please sign in to comment.