From b5ad6c0eae1606245720ae6a0f85856e0aa45385 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmx.de>
Date: Wed, 26 Jun 2024 14:24:15 +0200
Subject: [PATCH 1/3] Fix compilation for newer thrust versions

Recent changes in Thrust, for the CUDA backend, no longer allow
including `<thrust/device_vector.h>` into a translation unit that is
not compiled by a CUDA compiler. As a workaround, the vectors
containing the data are moved from the header into the benchmark
implementation file.
---
 src/thrust/ThrustStream.cu | 74 ++++++++++++++++++++++++--------------
 src/thrust/ThrustStream.h  | 27 ++++-----------
 2 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 84b27b8e..a9cae339 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2024 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -10,6 +10,25 @@
 #include <thrust/inner_product.h>
 #include <thrust/zip_function.h>
 
+#if defined(MANAGED)
+#include <thrust/universal_vector.h>
+#else
+#include <thrust/device_vector.h>
+#endif
+
+template <class T>
+using vector =
+#if defined(MANAGED)
+  thrust::universal_vector<T>;
+#else
+  thrust::device_vector<T>;
+#endif
+
+template <class T>
+struct ThrustStream<T>::Impl{
+  vector<T> a, b, c;
+};
+
 static inline void synchronise()
 {
   // rocThrust doesn't synchronise between thrust calls
@@ -20,7 +39,7 @@ static inline void synchronise()
 
 template <class T>
 ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
-  : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
+  : array_size{array_size}, impl(new Impl{vector<T>(array_size), vector<T>(array_size), vector<T>(array_size)}) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
   std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
@@ -50,27 +69,30 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
 }
 
+template <class T>
+ThrustStream<T>::~ThrustStream() = default;
+
 template <class T>
 void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  thrust::fill(a.begin(), a.end(), initA);
-  thrust::fill(b.begin(), b.end(), initB);
-  thrust::fill(c.begin(), c.end(), initC);
+  thrust::fill(impl->a.begin(), impl->a.end(), initA);
+  thrust::fill(impl->b.begin(), impl->b.end(), initB);
+  thrust::fill(impl->c.begin(), impl->c.end(), initC);
   synchronise();
 }
 
 template <class T>
 void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  thrust::copy(a.begin(), a.end(), h_a.begin());
-  thrust::copy(b.begin(), b.end(), h_b.begin());
-  thrust::copy(c.begin(), c.end(), h_c.begin());
+  thrust::copy(impl->a.begin(), impl->a.end(), h_a.begin());
+  thrust::copy(impl->b.begin(), impl->b.end(), h_b.begin());
+  thrust::copy(impl->c.begin(), impl->c.end(), h_c.begin());
 }
 
 template <class T>
 void ThrustStream<T>::copy()
 {
-  thrust::copy(a.begin(), a.end(),c.begin());
+  thrust::copy(impl->a.begin(), impl->a.end(),impl->c.begin());
   synchronise();
 }
 
 template <class T>
@@ -79,9 +101,9 @@ void ThrustStream<T>::mul()
 {
   const T scalar = startScalar;
   thrust::transform(
-    c.begin(),
-    c.end(),
-    b.begin(),
+    impl->c.begin(),
+    impl->c.end(),
+    impl->b.begin(),
     [=] __device__ __host__ (const T &ci){
       return ci * scalar;
     }
@@ -93,9 +115,9 @@ template <class T>
 void ThrustStream<T>::add()
 {
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
-    c.begin(),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end())),
+    impl->c.begin(),
     thrust::make_zip_function(
       [] __device__ __host__ (const T& ai, const T& bi){
         return ai + bi;
@@ -109,9 +131,9 @@ void ThrustStream<T>::triad()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
-    a.begin(),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->b.begin(), impl->c.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->b.end(), impl->c.end())),
+    impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& bi, const T& ci){
        return bi + scalar * ci;
@@ -125,9 +147,9 @@ void ThrustStream<T>::nstream()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
-    a.begin(),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin(), impl->c.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end(), impl->c.end())),
+    impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
         return ai + bi + scalar * ci;
@@ -139,7 +161,7 @@ void ThrustStream<T>::nstream()
 template <class T>
 T ThrustStream<T>::dot()
 {
-  return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
+  return thrust::inner_product(impl->a.begin(), impl->a.end(), impl->b.begin(), T{});
 }
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
@@ -155,7 +177,7 @@ T ThrustStream<T>::dot()
 # error Unsupported compiler for Thrust
 #endif
 
-void check_error(void)
+void check_error()
 {
   IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
   if (err != IMPL_FN__(Success))
@@ -165,7 +187,7 @@ void check_error()
   }
 }
 
-void listDevices(void)
+void listDevices()
 {
   // Get number of devices
   int count;
@@ -213,7 +235,7 @@ std::string getDeviceDriver(const int device)
 
 #else
 
-void listDevices(void)
+void listDevices()
 {
   std::cout << "0: CPU" << std::endl;
 }
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index b0acd80f..57a0d818 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2024 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -8,11 +8,7 @@
 #include <iostream>
 #include <stdexcept>
 
-#if defined(MANAGED)
-#include <thrust/universal_vector.h>
-#else
-#include <thrust/device_vector.h>
-#endif
+#include <memory>
 
 #include "Stream.h"
 
@@ -22,22 +18,13 @@ template <class T>
 class ThrustStream : public Stream<T>
 {
   protected:
-    // Size of arrays
+    struct Impl;
+    std::unique_ptr<Impl> impl; // avoid thrust vectors leaking into non-CUDA translation units
     intptr_t array_size;
 
-    #if defined(MANAGED)
-    thrust::universtal_vector<T> a;
-    thrust::universtal_vector<T> b;
-    thrust::universtal_vector<T> c;
-    #else
-    thrust::device_vector<T> a;
-    thrust::device_vector<T> b;
-    thrust::device_vector<T> c;
-    #endif
-
   public:
-    ThrustStream(const intptr_t, int);
-    ~ThrustStream() = default;
+    ThrustStream(intptr_t array_size, int device);
+    ~ThrustStream();
 
     virtual void copy() override;
     virtual void add() override;

From f7871672562f7dc7905f55d01df9d08089f72088 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmx.de>
Date: Wed, 26 Jun 2024 17:14:10 +0200
Subject: [PATCH 2/3] Avoid tuple creation when making zip iterator

---
 src/thrust/ThrustStream.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index a9cae339..de9fa8ff 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -115,8 +115,8 @@ template <class T>
 void ThrustStream<T>::add()
 {
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end())),
+    thrust::make_zip_iterator(impl->a.begin(), impl->b.begin()),
+    thrust::make_zip_iterator(impl->a.end(), impl->b.end()),
     impl->c.begin(),
     thrust::make_zip_function(
       [] __device__ __host__ (const T& ai, const T& bi){
@@ -131,8 +131,8 @@ void ThrustStream<T>::triad()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(impl->b.begin(), impl->c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(impl->b.end(), impl->c.end())),
+    thrust::make_zip_iterator(impl->b.begin(), impl->c.begin()),
+    thrust::make_zip_iterator(impl->b.end(), impl->c.end()),
     impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& bi, const T& ci){
@@ -147,8 +147,8 @@ void ThrustStream<T>::nstream()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin(), impl->c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end(), impl->c.end())),
+    thrust::make_zip_iterator(impl->a.begin(), impl->b.begin(), impl->c.begin()),
+    thrust::make_zip_iterator(impl->a.end(), impl->b.end(), impl->c.end()),
     impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){

From be3132becf0b3d9020b95b8eae67db95ce417a07 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmx.de>
Date: Wed, 26 Jun 2024 17:12:50 +0200
Subject: [PATCH 3/3] Enable using Thrust from CCCL

---
 src/thrust/model.cmake | 61 +++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake
index 6b82ef59..904b24e5 100644
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@@ -1,13 +1,13 @@
 register_flag_optional(THRUST_IMPL
         "Which Thrust implementation to use, supported options include:
-         - CUDA (via https://github.com/NVIDIA/thrust)
+         - CUDA (via https://github.com/NVIDIA/thrust or https://github.com/NVIDIA/CCCL)
          - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
         "
         "CUDA")
 
 register_flag_optional(SDK_DIR
-        "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
+        "Path to the installation prefix for CCCL or Thrust (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/12.4/lib64/cmake` for NVHPC, or `/usr/local/cuda-12.5/lib64/cmake` for nvcc, or `/usr/local/cuda-11.4/include` for older nvcc, or `/opt/rocm` for ROCm)"
         "")
 
 register_flag_optional(BACKEND
         "
@@ -18,7 +18,7 @@ register_flag_optional(BACKEND
         "
         "CUDA")
 
-   register_flag_optional(MANAGED "Enabled managed memory mode."
+register_flag_optional(MANAGED "Enabled managed memory mode."
         "OFF")
 
 register_flag_optional(CMAKE_CUDA_COMPILER
@@ -34,6 +34,9 @@ register_flag_optional(CUDA_EXTRA_FLAGS
         "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
         "")
 
+option(FETCH_CCCL "Fetch (download) the CCCL library. This uses CMake's FetchContent feature.
+       Specify version by setting FETCH_CCCL_VERSION" OFF)
+set(FETCH_CCCL_VERSION "v2.4.0" CACHE STRING "Specify version of CCCL to use if FETCH_CCCL is ON")
 
 macro(setup)
     set(CMAKE_CXX_STANDARD 14)
@@ -42,44 +45,48 @@ macro(setup)
     endif ()
 
     if (${THRUST_IMPL} STREQUAL "CUDA")
-
-        # see CUDA.cmake, we're only adding a few Thrust related libraries here
-
         if (POLICY CMP0104)
             cmake_policy(SET CMP0104 NEW)
         endif ()
-
         set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-
-        # add -forward-unknown-to-host-compiler for compatibility reasons
         set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
         enable_language(CUDA)
-        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
-        # appended later
+        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG appended later
        wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
 
         message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
-
-        # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake`
-        # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
-        # same thing for thrust
         if (SDK_DIR)
+            # CMake tries several subdirectories below SDK_DIR, see documentation:
+            # https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-search-procedure
             list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
-            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
-            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
-        else ()
-            find_package(CUB REQUIRED CONFIG)
-            find_package(Thrust REQUIRED CONFIG)
         endif ()
 
         message(STATUS "Using Thrust backend: ${BACKEND}")
-
-        # this creates the interface that we can link to
-        thrust_create_target(Thrust${BACKEND}
-                HOST CPP
-                DEVICE ${BACKEND})
-
-        register_link_library(Thrust${BACKEND})
+        set(CCCL_THRUST_DEVICE_SYSTEM ${BACKEND} CACHE STRING "" FORCE)
+
+        # fetch CCCL if user wants to
+        if (FETCH_CCCL)
+            FetchContent_Declare(
+                    CCCL
+                    GIT_REPOSITORY https://github.com/nvidia/cccl.git
+                    GIT_TAG "${FETCH_CCCL_VERSION}"
+            )
+            FetchContent_MakeAvailable(CCCL)
+            register_link_library(CCCL::CCCL)
+        else()
+            # try to find CCCL locally
+            find_package(CCCL CONFIG)
+            if (CCCL_FOUND)
+                register_link_library(CCCL::CCCL)
+            else()
+                # backup: find legacy projects separately
+                message(WARNING "No CCCL found on your system. Trying Thrust and CUB legacy targets.")
+                find_package(CUB REQUIRED CONFIG)
+                find_package(Thrust REQUIRED CONFIG)
+                thrust_create_target(Thrust${BACKEND} HOST CPP DEVICE ${BACKEND})
+                register_link_library(Thrust${BACKEND})
+            endif()
+        endif()
    elseif (${THRUST_IMPL} STREQUAL "ROCM")
        if (SDK_DIR)
            find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)