From b5ad6c0eae1606245720ae6a0f85856e0aa45385 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmx.de>
Date: Wed, 26 Jun 2024 14:24:15 +0200
Subject: [PATCH 1/3] Fix compilation for newer thrust versions

Recent changes in Thrust, for the CUDA backend, no longer allow
including `<thrust/device_vector.h>` into a translation unit that is
not compiled by a CUDA compiler. As a workaround, the vectors
containing the data are moved from the header into the benchmark
implementation file.
---
 src/thrust/ThrustStream.cu | 74 ++++++++++++++++++++++++--------------
 src/thrust/ThrustStream.h  | 27 ++++-----------
 2 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 84b27b8e..a9cae339 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2024 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -10,6 +10,25 @@
 #include <thrust/inner_product.h>
 #include <thrust/zip_function.h>
 
+#if defined(MANAGED)
+#include <thrust/universal_vector.h>
+#else
+#include <thrust/device_vector.h>
+#endif
+
+template <class T>
+using vector =
+#if defined(MANAGED)
+  thrust::universal_vector<T>;
+#else
+  thrust::device_vector<T>;
+#endif
+
+template <class T>
+struct ThrustStream<T>::Impl{
+  vector<T> a, b, c;
+};
+
 static inline void synchronise()
 {
   // rocThrust doesn't synchronise between thrust calls
@@ -20,7 +39,7 @@ static inline void synchronise()
 
 template <class T>
 ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
-  : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
+  : array_size{array_size}, impl(new Impl{vector<T>(array_size), vector<T>(array_size), vector<T>(array_size)}) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
   std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
@@ -50,27 +69,30 @@ ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
 }
 
+template <class T>
+ThrustStream<T>::~ThrustStream() = default;
+
 template <class T>
 void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  thrust::fill(a.begin(), a.end(), initA);
-  thrust::fill(b.begin(), b.end(), initB);
-  thrust::fill(c.begin(), c.end(), initC);
+  thrust::fill(impl->a.begin(), impl->a.end(), initA);
+  thrust::fill(impl->b.begin(), impl->b.end(), initB);
+  thrust::fill(impl->c.begin(), impl->c.end(), initC);
   synchronise();
 }
 
 template <class T>
 void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
-  thrust::copy(a.begin(), a.end(), h_a.begin());
-  thrust::copy(b.begin(), b.end(), h_b.begin());
-  thrust::copy(c.begin(), c.end(), h_c.begin());
+  thrust::copy(impl->a.begin(), impl->a.end(), h_a.begin());
+  thrust::copy(impl->b.begin(), impl->b.end(), h_b.begin());
+  thrust::copy(impl->c.begin(), impl->c.end(), h_c.begin());
 }
 
 template <class T>
 void ThrustStream<T>::copy()
 {
-  thrust::copy(a.begin(), a.end(),c.begin());
+  thrust::copy(impl->a.begin(), impl->a.end(),impl->c.begin());
   synchronise();
 }
 
 template <class T>
@@ -79,9 +101,9 @@ void ThrustStream<T>::mul()
 {
   const T scalar = startScalar;
   thrust::transform(
-    c.begin(),
-    c.end(),
-    b.begin(),
+    impl->c.begin(),
+    impl->c.end(),
+    impl->b.begin(),
     [=] __device__ __host__ (const T &ci){
       return ci * scalar;
     }
@@ -93,9 +115,9 @@ template <class T>
 void ThrustStream<T>::add()
 {
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end())),
-    c.begin(),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end())),
+    impl->c.begin(),
     thrust::make_zip_function(
       [] __device__ __host__ (const T& ai, const T& bi){
         return ai + bi;
@@ -109,9 +131,9 @@ void ThrustStream<T>::triad()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(b.begin(), c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(b.end(), c.end())),
-    a.begin(),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->b.begin(), impl->c.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->b.end(), impl->c.end())),
+    impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& bi, const T& ci){
        return bi + scalar * ci;
@@ -125,9 +147,9 @@ void ThrustStream<T>::nstream()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
-    a.begin(),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin(), impl->c.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end(), impl->c.end())),
+    impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
         return ai + bi + scalar * ci;
@@ -139,7 +161,7 @@ void ThrustStream<T>::nstream()
 template <class T>
 T ThrustStream<T>::dot()
 {
-  return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
+  return thrust::inner_product(impl->a.begin(), impl->a.end(), impl->b.begin(), T{});
 }
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
@@ -155,7 +177,7 @@ T ThrustStream<T>::dot()
 # error Unsupported compiler for Thrust
 #endif
 
-void check_error(void)
+void check_error()
 {
   IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
   if (err != IMPL_FN__(Success))
@@ -165,7 +187,7 @@ void check_error()
   }
 }
 
-void listDevices(void)
+void listDevices()
 {
   // Get number of devices
   int count;
@@ -213,7 +235,7 @@ std::string getDeviceDriver(const int device)
 
 #else
 
-void listDevices(void)
+void listDevices()
 {
   std::cout << "0: CPU" << std::endl;
 }
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index b0acd80f..57a0d818 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2024 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -8,11 +8,7 @@
 #include <iostream>
 #include <stdexcept>
 
-#if defined(MANAGED)
-#include <thrust/universal_vector.h>
-#else
-#include <thrust/device_vector.h>
-#endif
+#include <memory>
 
 #include "Stream.h"
 
@@ -22,22 +18,13 @@ template <class T>
 class ThrustStream : public Stream<T>
 {
   protected:
-    // Size of arrays
+    struct Impl;
+    std::unique_ptr<Impl> impl; // avoid thrust vectors leaking into non-CUDA translation units
     intptr_t array_size;
 
-    #if defined(MANAGED)
-    thrust::universtal_vector<T> a;
-    thrust::universtal_vector<T> b;
-    thrust::universtal_vector<T> c;
-    #else
-    thrust::device_vector<T> a;
-    thrust::device_vector<T> b;
-    thrust::device_vector<T> c;
-    #endif
-
   public:
-    ThrustStream(const intptr_t, int);
-    ~ThrustStream() = default;
+    ThrustStream(intptr_t array_size, int device);
+    ~ThrustStream();
 
     virtual void copy() override;
     virtual void add() override;

From f7871672562f7dc7905f55d01df9d08089f72088 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmx.de>
Date: Wed, 26 Jun 2024 17:14:10 +0200
Subject: [PATCH 2/3] Avoid tuple creation when making zip iterator

---
 src/thrust/ThrustStream.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index a9cae339..de9fa8ff 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -115,8 +115,8 @@ template <class T>
 void ThrustStream<T>::add()
 {
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end())),
+    thrust::make_zip_iterator(impl->a.begin(), impl->b.begin()),
+    thrust::make_zip_iterator(impl->a.end(), impl->b.end()),
     impl->c.begin(),
     thrust::make_zip_function(
       [] __device__ __host__ (const T& ai, const T& bi){
@@ -131,8 +131,8 @@ void ThrustStream<T>::triad()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(impl->b.begin(), impl->c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(impl->b.end(), impl->c.end())),
+    thrust::make_zip_iterator(impl->b.begin(), impl->c.begin()),
+    thrust::make_zip_iterator(impl->b.end(), impl->c.end()),
     impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& bi, const T& ci){
@@ -147,8 +147,8 @@ void ThrustStream<T>::nstream()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.begin(), impl->b.begin(), impl->c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(impl->a.end(), impl->b.end(), impl->c.end())),
+    thrust::make_zip_iterator(impl->a.begin(), impl->b.begin(), impl->c.begin()),
+    thrust::make_zip_iterator(impl->a.end(), impl->b.end(), impl->c.end()),
     impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){

From be3132becf0b3d9020b95b8eae67db95ce417a07 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmx.de>
Date: Wed, 26 Jun 2024 17:12:50 +0200
Subject: [PATCH 3/3] Enable using Thrust from CCCL

---
 src/thrust/model.cmake | 61 +++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake
index 6b82ef59..904b24e5 100644
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@@ -1,13 +1,13 @@
 register_flag_optional(THRUST_IMPL
         "Which Thrust implementation to use, supported options include:
-         - CUDA (via https://github.com/NVIDIA/thrust)
+         - CUDA (via https://github.com/NVIDIA/thrust or https://github.com/NVIDIA/CCCL)
          - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
         "
         "CUDA")
 
 register_flag_optional(SDK_DIR
-        "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
+        "Path to the installation prefix for CCCL or Thrust (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/12.4/lib64/cmake` for NVHPC, or `/usr/local/cuda-12.5/lib64/cmake` for nvcc, or `/usr/local/cuda-11.4/include` for older nvcc, or `/opt/rocm` for ROCm)"
         "")
 
 register_flag_optional(BACKEND
         "
@@ -18,7 +18,7 @@ register_flag_optional(BACKEND
         "
         "CUDA")
 
-   register_flag_optional(MANAGED "Enabled managed memory mode."
+register_flag_optional(MANAGED "Enabled managed memory mode."
         "OFF")
 
 register_flag_optional(CMAKE_CUDA_COMPILER
@@ -34,6 +34,9 @@ register_flag_optional(CUDA_EXTRA_FLAGS
         "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
         "")
 
+option(FETCH_CCCL "Fetch (download) the CCCL library. This uses CMake's FetchContent feature.
+       Specify version by setting FETCH_CCCL_VERSION" OFF)
+set(FETCH_CCCL_VERSION "v2.4.0" CACHE STRING "Specify version of CCCL to use if FETCH_CCCL is ON")
 
 macro(setup)
     set(CMAKE_CXX_STANDARD 14)
@@ -42,44 +45,48 @@ macro(setup)
     endif ()
 
     if (${THRUST_IMPL} STREQUAL "CUDA")
-
-        # see CUDA.cmake, we're only adding a few Thrust related libraries here
-
         if (POLICY CMP0104)
             cmake_policy(SET CMP0104 NEW)
         endif ()
-
         set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-
-        # add -forward-unknown-to-host-compiler for compatibility reasons
         set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
         enable_language(CUDA)
-        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
-        # appended later
+        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG appended later
        wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
 
         message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
-
-        # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake`
-        # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
-        # same thing for thrust
         if (SDK_DIR)
+            # CMake tries several subdirectories below SDK_DIR, see documentation:
+            # https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-search-procedure
             list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
-            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
-            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
-        else ()
-            find_package(CUB REQUIRED CONFIG)
-            find_package(Thrust REQUIRED CONFIG)
         endif ()
 
         message(STATUS "Using Thrust backend: ${BACKEND}")
-
-        # this creates the interface that we can link to
-        thrust_create_target(Thrust${BACKEND}
-                HOST CPP
-                DEVICE ${BACKEND})
-
-        register_link_library(Thrust${BACKEND})
+        set(CCCL_THRUST_DEVICE_SYSTEM ${BACKEND} CACHE STRING "" FORCE)
+
+        # fetch CCCL if user wants to
+        if (FETCH_CCCL)
+            FetchContent_Declare(
+                    CCCL
+                    GIT_REPOSITORY https://github.com/nvidia/cccl.git
+                    GIT_TAG "${FETCH_CCCL_VERSION}"
+            )
+            FetchContent_MakeAvailable(CCCL)
+            register_link_library(CCCL::CCCL)
+        else()
+            # try to find CCCL locally
+            find_package(CCCL CONFIG)
+            if (CCCL_FOUND)
+                register_link_library(CCCL::CCCL)
+            else()
+                # backup: find legacy projects separately
+                message(WARNING "No CCCL found on your system. Trying Thrust and CUB legacy targets.")
+                find_package(CUB REQUIRED CONFIG)
+                find_package(Thrust REQUIRED CONFIG)
+                thrust_create_target(Thrust${BACKEND} HOST CPP DEVICE ${BACKEND})
+                register_link_library(Thrust${BACKEND})
+            endif()
+        endif()
    elseif (${THRUST_IMPL} STREQUAL "ROCM")
        if (SDK_DIR)
            find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)