From 9695b3b22a91362614af32d30f45b054624dfb08 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi
Date: Thu, 14 Mar 2024 13:47:16 -0700
Subject: [PATCH] Support large arrays

---
 src/StreamModels.h                   |  2 +-
 src/acc/ACCStream.cpp                | 42 ++++++++++++++++------------
 src/acc/ACCStream.h                  | 18 +++++-------
 src/cuda/CUDAStream.cu               | 34 +++++++++++-----------
 src/cuda/CUDAStream.h                |  6 ++--
 src/hip/HIPStream.cpp                | 33 +++++++++++-----------
 src/hip/HIPStream.h                  |  6 ++--
 src/kokkos/KokkosStream.cpp          |  4 +--
 src/kokkos/KokkosStream.hpp          |  4 +--
 src/legacy/HCStream.cpp              |  2 +-
 src/legacy/HCStream.h                |  4 +--
 src/main.cpp                         |  5 ++--
 src/ocl/OCLStream.cpp                |  9 +++---
 src/ocl/OCLStream.h                  |  6 ++--
 src/omp/OMPStream.cpp                | 34 +++++++++++-----------
 src/omp/OMPStream.h                  |  7 ++---
 src/raja/RAJAStream.cpp              | 13 ++++++---
 src/raja/RAJAStream.hpp              |  6 ++--
 src/raja/model.cmake                 |  4 +--
 src/std-data/STDDataStream.cpp       |  2 +-
 src/std-data/STDDataStream.h         |  4 +--
 src/std-data/model.cmake             |  4 +--
 src/std-indices/STDIndicesStream.cpp | 10 +++----
 src/std-indices/STDIndicesStream.h   |  6 ++--
 src/std-ranges/STDRangesStream.cpp   | 26 ++++++++---------
 src/std-ranges/STDRangesStream.hpp   |  4 +--
 src/std-ranges/model.cmake           | 22 +++++++++++++++
 src/sycl/SYCLStream.cpp              |  2 +-
 src/sycl/SYCLStream.h                |  2 +-
 src/sycl2020-acc/SYCLStream2020.cpp  |  2 +-
 src/sycl2020-acc/SYCLStream2020.h    |  2 +-
 src/sycl2020-usm/SYCLStream2020.cpp  |  2 +-
 src/sycl2020-usm/SYCLStream2020.h    |  2 +-
 src/tbb/TBBStream.cpp                |  4 +--
 src/tbb/TBBStream.hpp                |  5 +---
 src/thrust/ThrustStream.cu           |  4 +--
 src/thrust/ThrustStream.h            |  4 +--
 37 files changed, 184 insertions(+), 162 deletions(-)

diff --git a/src/StreamModels.h b/src/StreamModels.h
index 0a4649b9..556beb4d 100644
--- a/src/StreamModels.h
+++ b/src/StreamModels.h
@@ -36,7 +36,7 @@
 #endif
 
 template <class T>
-std::unique_ptr<Stream<T>> make_stream(int array_size, int deviceIndex) {
+std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
 #if defined(CUDA)
   // Use the CUDA implementation
   return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
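The type change above is the heart of the patch: a 32-bit `int` tops out at 2^31 - 1 elements, so a double-precision buffer of 16 GiB or more cannot even be described, while the signed, pointer-wide `intptr_t` can index anything that fits in memory. A minimal standalone sketch of the arithmetic (not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      std::intptr_t n = std::intptr_t{1} << 31;  // 2^31 elements, one past INT_MAX
      // 2^31 doubles are 16 GiB, comfortably representable as intptr_t/size_t:
      std::printf("%zu bytes\n", (size_t)n * sizeof(double)); // 17179869184
      // the same count narrows to a negative value on typical targets:
      std::printf("%d\n", (int)n);
    }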
diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp
index 48b9f2de..a346a39c 100644
--- a/src/acc/ACCStream.cpp
+++ b/src/acc/ACCStream.cpp
@@ -8,13 +8,12 @@
 #include "ACCStream.h"
 
 template <class T>
-ACCStream<T>::ACCStream(const int ARRAY_SIZE, int device)
+ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
+  : array_size{ARRAY_SIZE}
 {
   acc_device_t device_type = acc_get_device_type();
   acc_set_device_num(device, device_type);
 
-  array_size = ARRAY_SIZE;
-
   // Set up data region on device
   this->a = new T[array_size];
   this->b = new T[array_size];
@@ -32,7 +31,7 @@ template <class T>
 ACCStream<T>::~ACCStream()
 {
   // End data region on device
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
 
   T * restrict a = this->a;
   T * restrict b = this->b;
@@ -49,12 +48,12 @@ template <class T>
 void ACCStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict a = this->a;
   T * restrict b = this->b;
   T * restrict c = this->c;
   #pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] = initA;
     b[i] = initB;
     c[i] = initC;
@@ -70,16 +69,23 @@ void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
   T *c = this->c;
   #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
   {}
+
+  for (intptr_t i = 0; i < array_size; i++)
+  {
+    h_a[i] = a[i];
+    h_b[i] = b[i];
+    h_c[i] = c[i];
+  }
 }
 
 template <class T>
 void ACCStream<T>::copy()
 {
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict a = this->a;
   T * restrict c = this->c;
   #pragma acc parallel loop present(a[0:array_size], c[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     c[i] = a[i];
   }
@@ -90,11 +96,11 @@ void ACCStream<T>::mul()
 {
   const T scalar = startScalar;
 
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict b = this->b;
   T * restrict c = this->c;
   #pragma acc parallel loop present(b[0:array_size], c[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     b[i] = scalar * c[i];
   }
@@ -103,12 +109,12 @@ template <class T>
 void ACCStream<T>::add()
 {
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict a = this->a;
   T * restrict b = this->b;
   T * restrict c = this->c;
   #pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     c[i] = a[i] + b[i];
   }
@@ -119,12 +125,12 @@ void ACCStream<T>::triad()
 {
   const T scalar = startScalar;
 
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict a = this->a;
   T * restrict b = this->b;
   T * restrict c = this->c;
   #pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] = b[i] + scalar * c[i];
   }
@@ -135,12 +141,12 @@ void ACCStream<T>::nstream()
 {
   const T scalar = startScalar;
 
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict a = this->a;
   T * restrict b = this->b;
   T * restrict c = this->c;
   #pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] += b[i] + scalar * c[i];
   }
@@ -151,11 +157,11 @@ T ACCStream<T>::dot()
 {
   T sum{};
 
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T * restrict a = this->a;
   T * restrict b = this->b;
   #pragma acc parallel loop reduction(+:sum) present(a[0:array_size], b[0:array_size]) wait
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     sum += a[i] * b[i];
   }
diff --git a/src/acc/ACCStream.h b/src/acc/ACCStream.h
index 28fb8333..1b053cb4 100644
--- a/src/acc/ACCStream.h
+++ b/src/acc/ACCStream.h
@@ -19,16 +19,15 @@ template <class T>
 class ACCStream : public Stream<T>
 {
-
-  struct A{
-    T *a;
-    T *b;
-    T *c;
-  };
+  struct A{
+    T *a;
+    T *b;
+    T *c;
+  };
 
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
     A aa;
     // Device side pointers
     T *a;
@@ -36,7 +35,7 @@ class ACCStream : public Stream<T>
     T *c;
 
   public:
-    ACCStream(const int, int);
+    ACCStream(const intptr_t, int);
     ~ACCStream();
 
     virtual void copy() override;
@@ -48,7 +47,4 @@ class ACCStream : public Stream<T>
 
     virtual void init_arrays(T initA, T initB, T initC) override;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
-
-
-
 };
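Note that the host-side loop indices above become `intptr_t` rather than `size_t`: OpenACC and OpenMP loop directives want a plain integer induction variable, and a signed, pointer-wide type both avoids the 2^31 overflow and sidesteps the unsigned-wraparound analysis that some compilers struggle with when parallelizing. A standalone sketch of the pattern (hypothetical function, not from the patch):

    #include <cstdint>

    // b[i] = scalar * c[i], indexed with a signed, pointer-wide type
    void mul(double *b, const double *c, std::intptr_t n) {
      const double scalar = 0.4;
      #pragma acc parallel loop present(b[0:n], c[0:n])
      for (std::intptr_t i = 0; i < n; i++)
        b[i] = scalar * c[i];
    }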
diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu
index a6361139..24d05794 100644
--- a/src/cuda/CUDAStream.cu
+++ b/src/cuda/CUDAStream.cu
@@ -20,7 +20,7 @@ __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a +
 cudaStream_t stream;
 
 template <class T>
-CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
+CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   : array_size(array_size)
 {
   // Set device
@@ -96,9 +96,9 @@ CUDAStream<T>::~CUDAStream()
 }
 
 template <class T>
-__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
+__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size)
 {
-  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     a[i] = initA;
     b[i] = initB;
     c[i] = initC;
@@ -120,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
   // Copy device memory to host
 #if defined(PAGEFAULT) || defined(MANAGED)
   CU(cudaStreamSynchronize(stream));
-  for (int i = 0; i < array_size; ++i)
+  for (intptr_t i = 0; i < array_size; ++i)
   {
     a[i] = d_a[i];
     b[i] = d_b[i];
@@ -134,9 +134,9 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
 }
 
 template <class T>
-__global__ void copy_kernel(const T * a, T * c, int array_size)
+__global__ void copy_kernel(const T * a, T * c, size_t array_size)
 {
-  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     c[i] = a[i];
   }
 }
@@ -151,10 +151,10 @@ void CUDAStream<T>::copy()
 }
 
 template <class T>
-__global__ void mul_kernel(T * b, const T * c, int array_size)
+__global__ void mul_kernel(T * b, const T * c, size_t array_size)
 {
   const T scalar = startScalar;
-  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     b[i] = scalar * c[i];
   }
 }
@@ -169,9 +169,9 @@ void CUDAStream<T>::mul()
 }
 
 template <class T>
-__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
+__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size)
 {
-  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     c[i] = a[i] + b[i];
   }
 }
@@ -186,10 +186,10 @@ void CUDAStream<T>::add()
 }
 
 template <class T>
-__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
+__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size)
 {
   const T scalar = startScalar;
-  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     a[i] = b[i] + scalar * c[i];
   }
 }
@@ -204,10 +204,10 @@ void CUDAStream<T>::triad()
 }
 
 template <class T>
-__global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size)
+__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size)
 {
   const T scalar = startScalar;
-  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     a[i] += b[i] + scalar * c[i];
   }
 }
@@ -222,12 +222,12 @@ void CUDAStream<T>::nstream()
 }
 
 template <class T>
-__global__ void dot_kernel(const T * a, const T * b, T* sums, int array_size)
+__global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size)
 {
   __shared__ T smem[TBSIZE];
   T tmp = T(0.);
   const size_t tidx = threadIdx.x;
-  for (int i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     tmp += a[i] * b[i];
   }
   smem[tidx] = tmp;
@@ -249,7 +249,7 @@ T CUDAStream<T>::dot()
   CU(cudaStreamSynchronize(stream));
 
   T sum = 0.0;
-  for (int i = 0; i < dot_num_blocks; ++i) sum += sums[i];
+  for (intptr_t i = 0; i < dot_num_blocks; ++i) sum += sums[i];
 
   return sum;
 }
diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 54bf2a18..4b4a1a3a 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -22,7 +22,7 @@ class CUDAStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // Host array for partial sums for dot kernel
     T *sums;
@@ -33,10 +33,10 @@ class CUDAStream : public Stream<T>
     T *d_c;
 
     // Number of blocks for dot kernel
-    int dot_num_blocks;
+    intptr_t dot_num_blocks;
 
   public:
-    CUDAStream(const int, const int);
+    CUDAStream(const intptr_t, const int);
     ~CUDAStream();
 
     virtual void copy() override;
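The rewritten CUDA kernels share one shape: the signature takes `size_t array_size`, and the grid-stride loop promotes every index computation to `size_t` before multiplying, because `threadIdx.x`, `blockDim.x`, `blockIdx.x` and `gridDim.x` are 32-bit and their product wraps modulo 2^32 on large grids. A self-contained sketch of the same pattern (hypothetical kernel, not from the patch):

    #include <cstddef>

    template <typename T>
    __global__ void scale_kernel(T *y, const T *x, T s, size_t n)
    {
      // (size_t)blockDim.x * blockIdx.x forces 64-bit math; the plain
      // 32-bit product would wrap once the linear thread id passes 2^32,
      // silently aliasing low indices instead of touching high ones.
      for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x;
           i < n;
           i += (size_t)gridDim.x * blockDim.x)
        y[i] = s * x[i];
    }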
diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp
index 79ea9467..ec02425a 100644
--- a/src/hip/HIPStream.cpp
+++ b/src/hip/HIPStream.cpp
@@ -25,7 +25,7 @@ void check_error(void)
 __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; }
 
 template <class T>
-HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
+HIPStream<T>::HIPStream(const intptr_t ARRAY_SIZE, const int device_index)
 {
   // Set device
   int count;
@@ -107,9 +107,9 @@ HIPStream<T>::~HIPStream()
 
 
 template <class T>
-__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
+__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size)
 {
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     a[i] = initA;
     b[i] = initB;
     c[i] = initC;
@@ -133,7 +133,7 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
   // Copy device memory to host
 #if defined(PAGEFAULT) || defined(MANAGED)
   hipDeviceSynchronize();
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] = d_a[i];
     b[i] = d_b[i];
@@ -150,9 +150,9 @@ void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector
 }
 
 template <class T>
-__global__ void copy_kernel(const T * a, T * c, int array_size)
+__global__ void copy_kernel(const T * a, T * c, size_t array_size)
 {
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     c[i] = a[i];
   }
 }
@@ -168,10 +168,10 @@ void HIPStream<T>::copy()
 }
 
 template <class T>
-__global__ void mul_kernel(T * b, const T * c, int array_size)
+__global__ void mul_kernel(T * b, const T * c, size_t array_size)
 {
   const T scalar = startScalar;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     b[i] = scalar * c[i];
   }
 }
@@ -187,10 +186,9 @@ void HIPStream<T>::mul()
 }
 
 template <class T>
-__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
+__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size)
 {
-  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     c[i] = a[i] + b[i];
   }
 }
@@ -206,10 +205,10 @@ void HIPStream<T>::add()
 }
 
 template <class T>
-__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
+__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size)
 {
   const T scalar = startScalar;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     a[i] = b[i] + scalar * c[i];
   }
 }
@@ -225,10 +224,10 @@ void HIPStream<T>::triad()
 }
 
 template <class T>
-__global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size)
+__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size)
 {
   const T scalar = startScalar;
-  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
+  for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
     a[i] += b[i] + scalar * c[i];
   }
 }
@@ -244,7 +243,7 @@ void HIPStream<T>::nstream()
 }
 
 template <class T>
-__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size)
+__global__ void dot_kernel(const T * a, const T * b, T * sum, size_t array_size)
 {
   __shared__ T tb_sum[TBSIZE];
 
@@ -277,7 +276,7 @@ T HIPStream<T>::dot()
   check_error();
 
   T sum{};
-  for (int i = 0; i < dot_num_blocks; i++)
+  for (intptr_t i = 0; i < dot_num_blocks; i++)
     sum += sums[i];
 
   return sum;
diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h
index 3c603e0b..76ef7df4 100644
--- a/src/hip/HIPStream.h
+++ b/src/hip/HIPStream.h
@@ -37,8 +37,8 @@ class HIPStream : public Stream<T>
 
   protected:
     // Size of arrays
-    int array_size;
-    int dot_num_blocks;
+    intptr_t array_size;
+    intptr_t dot_num_blocks;
 
     // Host array for partial sums for dot kernel
     T *sums;
@@ -51,7 +51,7 @@ class HIPStream : public Stream<T>
 
   public:
-    HIPStream(const int, const int);
+    HIPStream(const intptr_t, const int);
     ~HIPStream();
 
     virtual void copy() override;
diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp
index c6054140..e49d5bcc 100644
--- a/src/kokkos/KokkosStream.cpp
+++ b/src/kokkos/KokkosStream.cpp
@@ -9,7 +9,7 @@
 
 template <class T>
 KokkosStream<T>::KokkosStream(
-        const int ARRAY_SIZE, const int device_index)
+        const intptr_t ARRAY_SIZE, const int device_index)
     : array_size(ARRAY_SIZE)
 {
   Kokkos::initialize(Kokkos::InitializationSettings().set_device_id(device_index));
@@ -53,7 +53,7 @@ void KokkosStream<T>::read_arrays(
   deep_copy(*hm_a, *d_a);
   deep_copy(*hm_b, *d_b);
   deep_copy(*hm_c, *d_c);
-  for(int ii = 0; ii < array_size; ++ii)
+  for(intptr_t ii = 0; ii < array_size; ++ii)
   {
     a[ii] = (*hm_a)(ii);
     b[ii] = (*hm_b)(ii);
diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp
index a410a868..8e40119c 100644
--- a/src/kokkos/KokkosStream.hpp
+++ b/src/kokkos/KokkosStream.hpp
@@ -19,7 +19,7 @@ class KokkosStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // Device side pointers to arrays
     typename Kokkos::View<T*>* d_a;
@@ -31,7 +31,7 @@ class KokkosStream : public Stream<T>
 
   public:
-    KokkosStream(const int, const int);
+    KokkosStream(const intptr_t, const int);
     ~KokkosStream();
 
     virtual void copy() override;
diff --git a/src/legacy/HCStream.cpp b/src/legacy/HCStream.cpp
index fb909687..0de0bb3a 100644
--- a/src/legacy/HCStream.cpp
+++ b/src/legacy/HCStream.cpp
@@ -52,7 +52,7 @@ void listDevices(void)
 
 template <class T>
-HCStream<T>::HCStream(const int ARRAY_SIZE, const int device_index):
+HCStream<T>::HCStream(const intptr_t ARRAY_SIZE, const int device_index):
   array_size(ARRAY_SIZE),
   d_a(ARRAY_SIZE),
   d_b(ARRAY_SIZE),
diff --git a/src/legacy/HCStream.h b/src/legacy/HCStream.h
index 13d07a90..a931cdb6 100644
--- a/src/legacy/HCStream.h
+++ b/src/legacy/HCStream.h
@@ -21,7 +21,7 @@ class HCStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
     // Device side pointers to arrays
     hc::array<T> d_a;
     hc::array<T> d_b;
@@ -30,7 +30,7 @@ class HCStream : public Stream<T>
 
   public:
-    HCStream(const int, const int);
+    HCStream(const intptr_t, const int);
     ~HCStream();
 
     virtual void copy() override;
diff --git a/src/main.cpp b/src/main.cpp
index 877127af..ee091259 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,3 +1,4 @@
+
 // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
 // University of Bristol HPC
 //
@@ -22,7 +23,7 @@
 #include "Unit.h"
 
 // Default size of 2^25
-int ARRAY_SIZE = 33554432;
+intptr_t ARRAY_SIZE = 33554432;
 size_t num_times = 100;
 size_t deviceIndex = 0;
 bool use_float = false;
@@ -367,7 +368,7 @@ void check_solution(const size_t num_times,
   size_t failed = 0;
   T epsi = std::numeric_limits<T>::epsilon() * T(100000.0);
   auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) {
-    if (e > epsi) {
+    if (e > epsi || std::isnan(e) || std::isnan(is)) {
       ++failed;
       if (failed > 10) return;
      std::cerr << "FAILED validation of " << name;
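The extra `std::isnan` tests in `check_solution` close a real hole: every ordered comparison involving NaN is false, so an error that comes back as NaN never satisfies `e > epsi` and the run would previously validate. A small standalone demonstration (not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main() {
      double epsi = std::numeric_limits<double>::epsilon() * 100000.0;
      double e = std::numeric_limits<double>::quiet_NaN();
      std::printf("e > epsi: %d\n", e > epsi ? 1 : 0);      // 0: NaN never compares greater
      std::printf("isnan(e): %d\n", std::isnan(e) ? 1 : 0); // 1: caught by the new test
    }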
diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp
index 26b525a8..c70a701d 100644
--- a/src/ocl/OCLStream.cpp
+++ b/src/ocl/OCLStream.cpp
@@ -75,7 +75,7 @@ std::string kernels{R"CLC(
     global const TYPE * restrict b,
     global TYPE * restrict sum,
     local TYPE * restrict wg_sum,
-    int array_size)
+    long array_size)
   {
     size_t i = get_global_id(0);
     const size_t local_i = get_local_id(0);
@@ -100,7 +100,8 @@ std::string kernels{R"CLC(
 
 
 template <class T>
-OCLStream<T>::OCLStream(const int ARRAY_SIZE, const int device_index)
+OCLStream<T>::OCLStream(const intptr_t ARRAY_SIZE, const int device_index)
+  : array_size{ARRAY_SIZE}
 {
   if (!cached)
     getDeviceList();
@@ -166,9 +167,7 @@ OCLStream<T>::OCLStream(const int ARRAY_SIZE, const int device_index)
   add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
   triad_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "triad");
   nstream_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "nstream");
-  dot_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_int>(program, "stream_dot");
-
-  array_size = ARRAY_SIZE;
+  dot_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_long>(program, "stream_dot");
 
   // Check buffers fit on the device
   cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
diff --git a/src/ocl/OCLStream.h b/src/ocl/OCLStream.h
index bcdf9acd..e2366dad 100644
--- a/src/ocl/OCLStream.h
+++ b/src/ocl/OCLStream.h
@@ -26,7 +26,7 @@ class OCLStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // Host array for partial sums for dot kernel
     std::vector<T> sums;
@@ -48,7 +48,7 @@ class OCLStream : public Stream<T>
     cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
     cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *triad_kernel;
     cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *nstream_kernel;
-    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_int> *dot_kernel;
+    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl_long> *dot_kernel;
 
     // NDRange configuration for the dot kernel
     size_t dot_num_groups;
@@ -56,7 +56,7 @@ class OCLStream : public Stream<T>
 
   public:
-    OCLStream(const int, const int);
+    OCLStream(const intptr_t, const int);
     ~OCLStream();
 
     virtual void copy() override;
diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp
index 774f61bf..09b749fd 100644
--- a/src/omp/OMPStream.cpp
+++ b/src/omp/OMPStream.cpp
@@ -13,7 +13,7 @@
 #endif
 
 template <class T>
-OMPStream<T>::OMPStream(const int ARRAY_SIZE, int device)
+OMPStream<T>::OMPStream(const intptr_t ARRAY_SIZE, int device)
 {
   array_size = ARRAY_SIZE;
 
@@ -39,7 +39,7 @@ OMPStream<T>::~OMPStream()
 {
 #ifdef OMP_TARGET_GPU
   // End data region on device
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
@@ -54,7 +54,7 @@ template <class T>
 void OMPStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
 #ifdef OMP_TARGET_GPU
   T *a = this->a;
   T *b = this->b;
@@ -63,7 +63,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
 #else
   #pragma omp parallel for
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] = initA;
     b[i] = initB;
@@ -89,7 +89,7 @@ void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
 #endif
 
   #pragma omp parallel for
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     h_a[i] = a[i];
     h_b[i] = b[i];
@@ -102,14 +102,14 @@ template <class T>
 void OMPStream<T>::copy()
 {
 #ifdef OMP_TARGET_GPU
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *a = this->a;
   T *c = this->c;
   #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     c[i] = a[i];
   }
@@ -126,14 +126,14 @@ void OMPStream<T>::mul()
   const T scalar = startScalar;
 
 #ifdef OMP_TARGET_GPU
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *b = this->b;
   T *c = this->c;
   #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     b[i] = scalar * c[i];
   }
@@ -148,7 +148,7 @@ template <class T>
 void OMPStream<T>::add()
 {
 #ifdef OMP_TARGET_GPU
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
@@ -156,7 +156,7 @@ void OMPStream<T>::add()
 #else
   #pragma omp parallel for
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     c[i] = a[i] + b[i];
   }
@@ -173,7 +173,7 @@ void OMPStream<T>::triad()
   const T scalar = startScalar;
 
 #ifdef OMP_TARGET_GPU
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
@@ -181,7 +181,7 @@ void OMPStream<T>::triad()
 #else
   #pragma omp parallel for
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] = b[i] + scalar * c[i];
   }
@@ -198,7 +198,7 @@ void OMPStream<T>::nstream()
   const T scalar = startScalar;
 
 #ifdef OMP_TARGET_GPU
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
@@ -206,7 +206,7 @@ void OMPStream<T>::nstream()
 #else
   #pragma omp parallel for
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     a[i] += b[i] + scalar * c[i];
   }
@@ -223,14 +223,14 @@ T OMPStream<T>::dot()
   T sum{};
 
 #ifdef OMP_TARGET_GPU
-  int array_size = this->array_size;
+  intptr_t array_size = this->array_size;
   T *a = this->a;
   T *b = this->b;
   #pragma omp target teams distribute parallel for simd map(tofrom: sum) reduction(+:sum)
 #else
   #pragma omp parallel for reduction(+:sum)
 #endif
-  for (int i = 0; i < array_size; i++)
+  for (intptr_t i = 0; i < array_size; i++)
   {
     sum += a[i] * b[i];
   }
diff --git a/src/omp/OMPStream.h b/src/omp/OMPStream.h
index 5a5622fd..40770005 100644
--- a/src/omp/OMPStream.h
+++ b/src/omp/OMPStream.h
@@ -21,7 +21,7 @@ class OMPStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // Device side pointers
     T *a;
@@ -29,7 +29,7 @@ class OMPStream : public Stream<T>
     T *c;
 
   public:
-    OMPStream(const int, int);
+    OMPStream(const intptr_t, int);
     ~OMPStream();
 
     virtual void copy() override;
@@ -41,7 +41,4 @@ class OMPStream : public Stream<T>
 
     virtual void init_arrays(T initA, T initB, T initC) override;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
-
-
-
 };
diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp
index 6a99999d..6d6e8342 100644
--- a/src/raja/RAJAStream.cpp
+++ b/src/raja/RAJAStream.cpp
@@ -16,7 +16,7 @@ using RAJA::forall;
 #endif
 
 template <class T>
-RAJAStream<T>::RAJAStream(const int ARRAY_SIZE, const int device_index)
+RAJAStream<T>::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index)
   : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE)
 {
@@ -120,9 +120,14 @@ void RAJAStream<T>::triad()
 template <class T>
 void RAJAStream<T>::nstream()
 {
-  // TODO implement me!
-  std::cerr << "Not implemented yet!" << std::endl;
-  throw std::runtime_error("Not implemented yet!");
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  const T scalar = startScalar;
+  forall<policy>(range, [=] RAJA_DEVICE (RAJA::Index_type index)
+  {
+    a[index] += b[index] + scalar * c[index];
+  });
 }
 
 template <class T>
diff --git a/src/raja/RAJAStream.hpp b/src/raja/RAJAStream.hpp
index a7167266..e98b0778 100644
--- a/src/raja/RAJAStream.hpp
+++ b/src/raja/RAJAStream.hpp
@@ -41,8 +41,8 @@ class RAJAStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    const int array_size;
-    const RangeSegment range;
+    const intptr_t array_size;
+    const RangeSegment range;
 
     // Device side pointers to arrays
     T* d_a;
@@ -51,7 +51,7 @@ class RAJAStream : public Stream<T>
 
   public:
-    RAJAStream(const int, const int);
+    RAJAStream(const intptr_t, const int);
     ~RAJAStream();
 
     virtual void copy() override;
diff --git a/src/raja/model.cmake b/src/raja/model.cmake
index bf306313..6e9f6bc6 100644
--- a/src/raja/model.cmake
+++ b/src/raja/model.cmake
@@ -99,7 +99,8 @@ macro(setup)
     message(STATUS "Building using packaged Raja at `${RAJA_IN_PACKAGE}`")
     find_package(RAJA REQUIRED)
     register_link_library(RAJA)
-
+
+    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
   else ()
     message(FATAL_ERROR "Neither `${RAJA_IN_TREE}` or `${RAJA_IN_PACKAGE}` exists")
   endif ()
@@ -112,7 +113,6 @@ macro(setup)
     set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
   endif ()
 
-
   register_append_compiler_and_arch_specific_cxx_flags(
     RAJA_FLAGS_CPU
     ${CMAKE_CXX_COMPILER_ID}
diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp
index a234d617..3efeb1b3 100644
--- a/src/std-data/STDDataStream.cpp
+++ b/src/std-data/STDDataStream.cpp
@@ -7,7 +7,7 @@
 #include "STDDataStream.h"
 
 template <class T>
-STDDataStream<T>::STDDataStream(const int ARRAY_SIZE, int device)
+STDDataStream<T>::STDDataStream(const intptr_t ARRAY_SIZE, int device)
 noexcept : array_size{ARRAY_SIZE},
   a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 {
diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h
index 65e1acee..d92864be 100644
--- a/src/std-data/STDDataStream.h
+++ b/src/std-data/STDDataStream.h
@@ -19,13 +19,13 @@ class STDDataStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // Device side pointers
     T *a, *b, *c;
 
   public:
-    STDDataStream(const int, int) noexcept;
+    STDDataStream(const intptr_t, int) noexcept;
     ~STDDataStream();
 
     virtual void copy() override;
diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake
index e9e70998..837d26bf 100644
--- a/src/std-data/model.cmake
+++ b/src/std-data/model.cmake
@@ -4,7 +4,7 @@ register_flag_optional(CMAKE_CXX_COMPILER
     "c++")
 
 register_flag_optional(NVHPC_OFFLOAD
-    "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK.
+    "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK.
     The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
 
     Possible values are:
@@ -38,7 +38,7 @@ register_flag_optional(USE_ONEDPL
 macro(setup)
   set(CMAKE_CXX_STANDARD 17)
   if (NVHPC_OFFLOAD)
-    set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD})
+    set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD})
     # propagate flags to linker so that it links with the gpu stuff as well
     register_append_cxx_flags(ANY ${NVHPC_FLAGS})
     register_append_link_flags(${NVHPC_FLAGS})
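For context, `-stdpar=gpu` is the explicit spelling of nvc++'s stdpar offload: it compiles ordinary C++ parallel algorithms, like those this model is built from, to run on the device. An illustrative sketch of the kind of code the flag offloads (standalone, assuming the NVHPC toolchain):

    #include <algorithm>
    #include <execution>
    #include <vector>

    int main() {
      std::vector<double> c(1 << 20, 1.0), b(1 << 20);
      // With `nvc++ -stdpar=gpu` this transform executes on the GPU;
      // with a plain host compiler it runs as a parallel CPU algorithm.
      std::transform(std::execution::par_unseq, c.begin(), c.end(), b.begin(),
                     [](double x) { return 0.4 * x; });
    }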
diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp
index fc9f3806..473d93d0 100644
--- a/src/std-indices/STDIndicesStream.cpp
+++ b/src/std-indices/STDIndicesStream.cpp
@@ -11,7 +11,7 @@
 #endif
 
 template <class T>
-STDIndicesStream<T>::STDIndicesStream(const int ARRAY_SIZE, int device)
+STDIndicesStream<T>::STDIndicesStream(const intptr_t ARRAY_SIZE, int device)
 noexcept : array_size{ARRAY_SIZE}, range(0, array_size),
   a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 {
@@ -65,7 +65,7 @@ template <class T>
 void STDIndicesStream<T>::mul()
 {
   //  b[i] = scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](intptr_t i) {
     return scalar * c[i];
   });
 }
@@ -74,7 +74,7 @@ template <class T>
 void STDIndicesStream<T>::add()
 {
   //  c[i] = a[i] + b[i];
-  std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](intptr_t i) {
     return a[i] + b[i];
   });
 }
@@ -83,7 +83,7 @@ template <class T>
 void STDIndicesStream<T>::triad()
 {
   //  a[i] = b[i] + scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](intptr_t i) {
     return b[i] + scalar * c[i];
   });
 }
@@ -95,7 +95,7 @@ void STDIndicesStream<T>::nstream()
   //  Need to do in two stages with C++11 STL.
   //  1: a[i] += b[i]
   //  2: a[i] += scalar * c[i];
-  std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) {
+  std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](intptr_t i) {
     return a[i] + b[i] + scalar * c[i];
   });
 }
diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h
index ffab9103..8a8f5de8 100644
--- a/src/std-indices/STDIndicesStream.h
+++ b/src/std-indices/STDIndicesStream.h
@@ -71,16 +71,16 @@ class STDIndicesStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // induction range
-    ranged<int> range;
+    ranged<intptr_t> range;
 
     // Device side pointers
     T *a, *b, *c;
 
   public:
-    STDIndicesStream(const int, int) noexcept;
+    STDIndicesStream(const intptr_t, int) noexcept;
     ~STDIndicesStream();
 
     virtual void copy() override;
diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp
index b29d0c42..8b7ada4b 100644
--- a/src/std-ranges/STDRangesStream.cpp
+++ b/src/std-ranges/STDRangesStream.cpp
@@ -12,7 +12,7 @@
 #endif
 
 template <class T>
-STDRangesStream<T>::STDRangesStream(const int ARRAY_SIZE, int device)
+STDRangesStream<T>::STDRangesStream(const intptr_t ARRAY_SIZE, int device)
 noexcept : array_size{ARRAY_SIZE},
   a(alloc_raw<T>(ARRAY_SIZE)), b(alloc_raw<T>(ARRAY_SIZE)), c(alloc_raw<T>(ARRAY_SIZE))
 {
@@ -44,8 +44,8 @@ void STDRangesStream<T>::init_arrays(T initA, T initB, T initC)
 {
   std::for_each_n(
     exe_policy,
-    std::views::iota(0).begin(), array_size, // loop range
-    [&] (int i) {
+    std::views::iota((intptr_t)0).begin(), array_size, // loop range
+    [=, this] (intptr_t i) {
      a[i] = initA;
      b[i] = initB;
      c[i] = initC;
@@ -67,8 +67,8 @@ void STDRangesStream<T>::copy()
 {
   std::for_each_n(
     exe_policy,
-    std::views::iota(0).begin(), array_size,
-    [&] (int i) {
+    std::views::iota((intptr_t)0).begin(), array_size,
+    [=, this] (intptr_t i) {
       c[i] = a[i];
     }
   );
@@ -81,8 +81,8 @@ void STDRangesStream<T>::mul()
 
   std::for_each_n(
     exe_policy,
-    std::views::iota(0).begin(), array_size,
-    [&] (int i) {
+    std::views::iota((intptr_t)0).begin(), array_size,
+    [=, this] (intptr_t i) {
       b[i] = scalar * c[i];
     }
   );
@@ -93,8 +93,8 @@ void STDRangesStream<T>::add()
 {
   std::for_each_n(
     exe_policy,
-    std::views::iota(0).begin(), array_size,
-    [&] (int i) {
+    std::views::iota((intptr_t)0).begin(), array_size,
+    [=, this] (intptr_t i) {
       c[i] = a[i] + b[i];
     }
   );
@@ -107,8 +107,8 @@ void STDRangesStream<T>::triad()
 
   std::for_each_n(
     exe_policy,
-    std::views::iota(0).begin(), array_size,
-    [&] (int i) {
+    std::views::iota((intptr_t)0).begin(), array_size,
+    [=, this] (intptr_t i) {
       a[i] = b[i] + scalar * c[i];
     }
   );
@@ -121,8 +121,8 @@ void STDRangesStream<T>::nstream()
 
   std::for_each_n(
     exe_policy,
-    std::views::iota(0).begin(), array_size,
-    [&] (int i) {
+    std::views::iota((intptr_t)0).begin(), array_size,
+    [=, this] (intptr_t i) {
       a[i] += b[i] + scalar * c[i];
     }
   );
diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp
index 6e7c29c6..51680c62 100644
--- a/src/std-ranges/STDRangesStream.hpp
+++ b/src/std-ranges/STDRangesStream.hpp
@@ -18,13 +18,13 @@ class STDRangesStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
     // Device side pointers
     T *a, *b, *c;
 
   public:
-    STDRangesStream(const int, int) noexcept;
+    STDRangesStream(const intptr_t, int) noexcept;
     ~STDRangesStream();
 
     virtual void copy() override;
diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake
index 8f735010..d7fd6a8b 100644
--- a/src/std-ranges/model.cmake
+++ b/src/std-ranges/model.cmake
@@ -3,6 +3,22 @@ register_flag_optional(CMAKE_CXX_COMPILER
     "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges"
     "c++")
 
+register_flag_optional(NVHPC_OFFLOAD
+    "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK.
+     The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`)
+
+     Possible values are:
+       cc35  - Compile for compute capability 3.5
+       cc50  - Compile for compute capability 5.0
+       cc60  - Compile for compute capability 6.0
+       cc62  - Compile for compute capability 6.2
+       cc70  - Compile for compute capability 7.0
+       cc72  - Compile for compute capability 7.2
+       cc75  - Compile for compute capability 7.5
+       cc80  - Compile for compute capability 8.0
+       ccall - Compile for all supported compute capabilities"
+    "")
+
 register_flag_optional(USE_TBB
     "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details."
     "OFF")
@@ -29,6 +45,12 @@ macro(setup)
   unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default
   # and append our own:
   register_append_cxx_flags(ANY -std=c++20)
+  if (NVHPC_OFFLOAD)
+    set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD})
+    # propagate flags to linker so that it links with the gpu stuff as well
+    register_append_cxx_flags(ANY ${NVHPC_FLAGS})
+    register_append_link_flags(${NVHPC_FLAGS})
+  endif ()
   if (USE_TBB)
     register_link_library(TBB::tbb)
   endif ()
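Two details of the std-ranges hunks are easy to miss: `std::views::iota(0)` deduces `int` as its value type, so its iterator overflows past 2^31 - 1 even on a 64-bit build, which is why the seed is cast to `intptr_t`; and the captures move from `[&]` to `[=, this]` because an offloaded lambda must carry the pointers by value rather than reference host stack slots. A condensed sketch of the corrected pattern (hypothetical free function, not from the patch):

    #include <algorithm>
    #include <cstdint>
    #include <execution>
    #include <ranges>

    void fill(double *a, std::intptr_t n, double v) {
      std::for_each_n(std::execution::par_unseq,
                      std::views::iota(std::intptr_t{0}).begin(), n,
                      // 64-bit index; `a` and `v` captured by value for offload
                      [=](std::intptr_t i) { a[i] = v; });
    }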
diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp
index 512517b6..e99454e6 100644
--- a/src/sycl/SYCLStream.cpp
+++ b/src/sycl/SYCLStream.cpp
@@ -17,7 +17,7 @@ std::vector<cl::sycl::device> devices;
 void getDeviceList(void);
 
 template <class T>
-SYCLStream<T>::SYCLStream(const int ARRAY_SIZE, const int device_index)
+SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
 {
   if (!cached)
     getDeviceList();
diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h
index d3fa18d0..1a40242d 100644
--- a/src/sycl/SYCLStream.h
+++ b/src/sycl/SYCLStream.h
@@ -54,7 +54,7 @@ class SYCLStream : public Stream<T>
 
   public:
-    SYCLStream(const int, const int);
+    SYCLStream(const intptr_t, const int);
     ~SYCLStream();
 
     virtual void copy() override;
diff --git a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp
index 0de24bbb..742be95b 100644
--- a/src/sycl2020-acc/SYCLStream2020.cpp
+++ b/src/sycl2020-acc/SYCLStream2020.cpp
@@ -15,7 +15,7 @@ std::vector<sycl::device> devices;
 void getDeviceList(void);
 
 template <class T>
-SYCLStream<T>::SYCLStream(const size_t ARRAY_SIZE, const int device_index)
+SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
 : array_size {ARRAY_SIZE},
   d_a {ARRAY_SIZE},
   d_b {ARRAY_SIZE},
diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h
index caaeae9e..cd515f87 100644
--- a/src/sycl2020-acc/SYCLStream2020.h
+++ b/src/sycl2020-acc/SYCLStream2020.h
@@ -35,7 +35,7 @@ class SYCLStream : public Stream<T>
 
   public:
-    SYCLStream(const size_t, const int);
+    SYCLStream(const intptr_t, const int);
     ~SYCLStream() = default;
 
     virtual void copy() override;
diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp
index 21a8a47b..e4c6ec27 100644
--- a/src/sycl2020-usm/SYCLStream2020.cpp
+++ b/src/sycl2020-usm/SYCLStream2020.cpp
@@ -15,7 +15,7 @@ std::vector<sycl::device> devices;
 void getDeviceList(void);
 
 template <class T>
-SYCLStream<T>::SYCLStream(const size_t ARRAY_SIZE, const int device_index)
+SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
 : array_size {ARRAY_SIZE}
 {
   if (!cached)
diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h
index 0b2dc0db..811c26ef 100644
--- a/src/sycl2020-usm/SYCLStream2020.h
+++ b/src/sycl2020-usm/SYCLStream2020.h
@@ -35,7 +35,7 @@ class SYCLStream : public Stream<T>
 
   public:
-    SYCLStream(const size_t, const int);
+    SYCLStream(const intptr_t, const int);
     ~SYCLStream();
 
     virtual void copy() override;
diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp
index c5e9d905..75af6141 100644
--- a/src/tbb/TBBStream.cpp
+++ b/src/tbb/TBBStream.cpp
@@ -20,8 +20,8 @@
 #endif
 
 template <class T>
-TBBStream<T>::TBBStream(const int ARRAY_SIZE, int device)
-  : partitioner(), range(0, ARRAY_SIZE),
+TBBStream<T>::TBBStream(const intptr_t ARRAY_SIZE, int device)
+  : partitioner(), range(0, (size_t)ARRAY_SIZE),
 #ifdef USE_VECTOR
     a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE)
 #else
diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp
index 2744afc2..80f11c17 100644
--- a/src/tbb/TBBStream.hpp
+++ b/src/tbb/TBBStream.hpp
@@ -47,10 +47,8 @@ class TBBStream : public Stream<T>
     T *a, *b, *c;
 #endif
 
-
-
   public:
-    TBBStream(const int, int);
+    TBBStream(const intptr_t, int);
     ~TBBStream() = default;
 
     virtual void copy() override;
@@ -62,6 +60,5 @@ class TBBStream : public Stream<T>
 
     virtual void init_arrays(T initA, T initB, T initC) override;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
-
 };
diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index f15a3924..84b27b8e 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -19,8 +19,8 @@ static inline void synchronise()
 }
 
 template <class T>
-ThrustStream<T>::ThrustStream(const int ARRAY_SIZE, int device)
-  : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) {
+ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
+  : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
   std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index a2a4b72f..b0acd80f 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -23,7 +23,7 @@ class ThrustStream : public Stream<T>
 {
   protected:
     // Size of arrays
-    int array_size;
+    intptr_t array_size;
 
 #if defined(MANAGED)
   thrust::universal_vector<T> a;
@@ -36,7 +36,7 @@ class ThrustStream : public Stream<T>
 #endif
 
   public:
-    ThrustStream(const int, int);
+    ThrustStream(const intptr_t, int);
     ~ThrustStream() = default;
 
     virtual void copy() override;