Skip to content

Commit

Permalink
Merge pull request #188 from gonzalobg/large_index
Browse files Browse the repository at this point in the history
  • Loading branch information
tomdeakin authored May 26, 2024
2 parents 01224e7 + 9695b3b commit e1fffc0
Show file tree
Hide file tree
Showing 37 changed files with 184 additions and 162 deletions.
2 changes: 1 addition & 1 deletion src/StreamModels.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#endif

template <typename T>
std::unique_ptr<Stream<T>> make_stream(int array_size, int deviceIndex) {
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
#if defined(CUDA)
// Use the CUDA implementation
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
Expand Down
42 changes: 24 additions & 18 deletions src/acc/ACCStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
#include "ACCStream.h"

template <class T>
ACCStream<T>::ACCStream(const int ARRAY_SIZE, int device)
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
{
acc_device_t device_type = acc_get_device_type();
acc_set_device_num(device, device_type);

array_size = ARRAY_SIZE;

// Set up data region on device
this->a = new T[array_size];
this->b = new T[array_size];
Expand All @@ -32,7 +31,7 @@ template <class T>
ACCStream<T>::~ACCStream()
{
// End data region on device
int array_size = this->array_size;
intptr_t array_size = this->array_size;

T * restrict a = this->a;
T * restrict b = this->b;
Expand All @@ -49,12 +48,12 @@ ACCStream<T>::~ACCStream()
template <class T>
void ACCStream<T>::init_arrays(T initA, T initB, T initC)
{
int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
a[i] = initA;
b[i] = initB;
Expand All @@ -70,16 +69,23 @@ void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}

for (intptr_t i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
}

template <class T>
void ACCStream<T>::copy()
{
int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict a = this->a;
T * restrict c = this->c;
#pragma acc parallel loop present(a[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
c[i] = a[i];
}
Expand All @@ -90,11 +96,11 @@ void ACCStream<T>::mul()
{
const T scalar = startScalar;

int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc parallel loop present(b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
b[i] = scalar * c[i];
}
Expand All @@ -103,12 +109,12 @@ void ACCStream<T>::mul()
template <class T>
void ACCStream<T>::add()
{
int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
c[i] = a[i] + b[i];
}
Expand All @@ -119,12 +125,12 @@ void ACCStream<T>::triad()
{
const T scalar = startScalar;

int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
a[i] = b[i] + scalar * c[i];
}
Expand All @@ -135,12 +141,12 @@ void ACCStream<T>::nstream()
{
const T scalar = startScalar;

int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc parallel loop present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
a[i] += b[i] + scalar * c[i];
}
Expand All @@ -151,11 +157,11 @@ T ACCStream<T>::dot()
{
T sum{};

int array_size = this->array_size;
intptr_t array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
#pragma acc parallel loop reduction(+:sum) present(a[0:array_size], b[0:array_size]) wait
for (int i = 0; i < array_size; i++)
for (intptr_t i = 0; i < array_size; i++)
{
sum += a[i] * b[i];
}
Expand Down
18 changes: 7 additions & 11 deletions src/acc/ACCStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,23 @@
template <class T>
class ACCStream : public Stream<T>
{

struct A{
T *a;
T *b;
T *c;
};
struct A{
T *a;
T *b;
T *c;
};

protected:
// Size of arrays
int array_size;
intptr_t array_size;
A aa;
// Device side pointers
T *a;
T *b;
T *c;

public:
ACCStream(const int, int);
ACCStream(const intptr_t, int);
~ACCStream();

virtual void copy() override;
Expand All @@ -48,7 +47,4 @@ class ACCStream : public Stream<T>

virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;



};
34 changes: 17 additions & 17 deletions src/cuda/CUDAStream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a +
cudaStream_t stream;

template <class T>
CUDAStream<T>::CUDAStream(const int array_size, const int device_index)
CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
: array_size(array_size)
{
// Set device
Expand Down Expand Up @@ -96,9 +96,9 @@ CUDAStream<T>::~CUDAStream()
}

template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, int array_size)
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size)
{
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
a[i] = initA;
b[i] = initB;
c[i] = initC;
Expand All @@ -120,7 +120,7 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
// Copy device memory to host
#if defined(PAGEFAULT) || defined(MANAGED)
CU(cudaStreamSynchronize(stream));
for (int i = 0; i < array_size; ++i)
for (intptr_t i = 0; i < array_size; ++i)
{
a[i] = d_a[i];
b[i] = d_b[i];
Expand All @@ -134,9 +134,9 @@ void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vecto
}

template <typename T>
__global__ void copy_kernel(const T * a, T * c, int array_size)
__global__ void copy_kernel(const T * a, T * c, size_t array_size)
{
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
c[i] = a[i];
}
}
Expand All @@ -151,10 +151,10 @@ void CUDAStream<T>::copy()
}

template <typename T>
__global__ void mul_kernel(T * b, const T * c, int array_size)
__global__ void mul_kernel(T * b, const T * c, size_t array_size)
{
const T scalar = startScalar;
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
b[i] = scalar * c[i];
}
}
Expand All @@ -169,9 +169,9 @@ void CUDAStream<T>::mul()
}

template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c, int array_size)
__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size)
{
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
c[i] = a[i] + b[i];
}
}
Expand All @@ -186,10 +186,10 @@ void CUDAStream<T>::add()
}

template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c, int array_size)
__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size)
{
const T scalar = startScalar;
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
a[i] = b[i] + scalar * c[i];
}
}
Expand All @@ -204,10 +204,10 @@ void CUDAStream<T>::triad()
}

template <typename T>
__global__ void nstream_kernel(T * a, const T * b, const T * c, int array_size)
__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size)
{
const T scalar = startScalar;
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = (size_t)threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
a[i] += b[i] + scalar * c[i];
}
}
Expand All @@ -222,12 +222,12 @@ void CUDAStream<T>::nstream()
}

template <class T>
__global__ void dot_kernel(const T * a, const T * b, T* sums, int array_size)
__global__ void dot_kernel(const T * a, const T * b, T* sums, size_t array_size)
{
__shared__ T smem[TBSIZE];
T tmp = T(0.);
const size_t tidx = threadIdx.x;
for (int i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += gridDim.x * blockDim.x) {
for (size_t i = tidx + (size_t)blockDim.x * blockIdx.x; i < array_size; i += (size_t)gridDim.x * blockDim.x) {
tmp += a[i] * b[i];
}
smem[tidx] = tmp;
Expand All @@ -249,7 +249,7 @@ T CUDAStream<T>::dot()
CU(cudaStreamSynchronize(stream));

T sum = 0.0;
for (int i = 0; i < dot_num_blocks; ++i) sum += sums[i];
for (intptr_t i = 0; i < dot_num_blocks; ++i) sum += sums[i];

return sum;
}
Expand Down
6 changes: 3 additions & 3 deletions src/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class CUDAStream : public Stream<T>
{
protected:
// Size of arrays
int array_size;
intptr_t array_size;

// Host array for partial sums for dot kernel
T *sums;
Expand All @@ -33,10 +33,10 @@ class CUDAStream : public Stream<T>
T *d_c;

// Number of blocks for dot kernel
int dot_num_blocks;
intptr_t dot_num_blocks;

public:
CUDAStream(const int, const int);
CUDAStream(const intptr_t, const int);
~CUDAStream();

virtual void copy() override;
Expand Down
Loading

0 comments on commit e1fffc0

Please sign in to comment.