From f459480f78164328214b75b16ffef338f1d4bc89 Mon Sep 17 00:00:00 2001 From: Nara Date: Wed, 22 Nov 2023 00:06:38 +0100 Subject: [PATCH] StreamHPC 2023-11-21 (DeviceMemcpy::Batched) (#314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci: use build instead rocm-build and nvcc-build tags This allows the build job to be performed by any runner configured for building, instead of the ROCm-specialized builder. As the target architectures are specified ahead of time, the GPU is not needed during the build process, and may be performed by any builder. * feat: Add interface for batched memcpy from rocPRIM and CUB * style(device_memcpy): improve formatting --------- Co-authored-by: Robin Voetter Co-authored-by: Gergely Mészáros --- .clang-format | 2 - .gitlab-ci.yml | 12 +- CHANGELOG.md | 3 + benchmark/CMakeLists.txt | 3 +- benchmark/benchmark_device_batch_memcpy.cpp | 410 ++++++++++++++++++ benchmark/benchmark_utils.hpp | 62 ++- .../backend/cub/device/device_memcpy.hpp | 64 +++ hipcub/include/hipcub/backend/cub/hipcub.hpp | 3 +- .../backend/rocprim/device/device_memcpy.hpp | 65 +++ .../include/hipcub/backend/rocprim/hipcub.hpp | 3 +- .../include/hipcub/device/device_memcpy.hpp | 38 ++ test/hipcub/CMakeLists.txt | 1 + test/hipcub/test_hipcub_device_memcpy.cpp | 327 ++++++++++++++ 13 files changed, 980 insertions(+), 13 deletions(-) create mode 100644 benchmark/benchmark_device_batch_memcpy.cpp create mode 100644 hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp create mode 100644 hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp create mode 100644 hipcub/include/hipcub/device/device_memcpy.hpp create mode 100644 test/hipcub/test_hipcub_device_memcpy.cpp diff --git a/.clang-format b/.clang-format index 1351dcc9..bf4a9927 100644 --- a/.clang-format +++ b/.clang-format @@ -59,8 +59,6 @@ BraceWrapping: AfterStruct: true AfterUnion: true AfterExternBlock: false - BeforeCatch: true - BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 68be9cde..b370708c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,7 +45,7 @@ clang-format: stage: lint needs: [] tags: - - rocm-build + - build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" @@ -62,7 +62,7 @@ copyright-date: stage: lint needs: [] tags: - - rocm-build + - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: @@ -107,7 +107,7 @@ build:rocm: - .rules:build stage: build tags: - - rocm-build + - build needs: [] script: - cmake @@ -145,7 +145,7 @@ build:rocm-benchmark: - .rules:build stage: build tags: - - rocm-build + - build needs: [] script: - cmake @@ -304,7 +304,7 @@ build:nvcc: - .nvcc - .rules:build tags: - - nvcc-build + - build needs: [] script: - cmake @@ -339,7 +339,7 @@ build:nvcc-benchmark: - .nvcc - .rules:build tags: - - nvcc-build + - build needs: [] script: - cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index e0fc6873..950525f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ See README.md on how to build the hipCUB documentation using Doxygen. ### Fixed - Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage. +### Added +- Added interface `DeviceMemcpy::Batched` for batched memcpy from rocPRIM and CUB. + ## (Unreleased) hipCUB-2.13.1 for ROCm 5.7.0 ### Changed - CUB backend references CUB and Thrust version 2.0.1. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5fff6007..2d5e2188 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -80,6 +80,7 @@ add_hipcub_benchmark(benchmark_block_run_length_decode.cpp) add_hipcub_benchmark(benchmark_block_scan.cpp) add_hipcub_benchmark(benchmark_block_shuffle.cpp) add_hipcub_benchmark(benchmark_device_adjacent_difference.cpp) +add_hipcub_benchmark(benchmark_device_batch_memcpy.cpp) add_hipcub_benchmark(benchmark_device_histogram.cpp) add_hipcub_benchmark(benchmark_device_memory.cpp) add_hipcub_benchmark(benchmark_device_merge_sort.cpp) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp new file mode 100644 index 00000000..3d72e349 --- /dev/null +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -0,0 +1,410 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark/benchmark.h" +#include "cmdparser.hpp" +#include "common_benchmark_header.hpp" + +#include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_store.hpp" +#include "hipcub/device/device_memcpy.hpp" +#include "hipcub/hipcub.hpp" + +#ifdef __HIP_PLATFORM_AMD__ + // Only include this on AMD as it contains specialized config information + #include +#endif + +#include + +#include +#include +#include +#include +#include + +#include + +constexpr uint32_t warmup_size = 5; +constexpr int32_t max_size = 1024 * 1024; + +constexpr int32_t wlev_min_size = 128; +constexpr int32_t blev_min_size = 1024; + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +using offset_type = size_t; + +template +struct BatchMemcpyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData&) = delete; + + BatchMemcpyData(BatchMemcpyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchMemcpyData& operator=(BatchMemcpyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; + +template +BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, + num_tlev_buffers, + 1, + wlev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_wlev_buffers, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_blev_buffers, + blev_min_elems, + max_elems, + rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; +} + +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); +} + +#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ + ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ + ",num_blev:" #num_blev ",cfg:default_config}", \ + [=](benchmark::State& state) \ + { \ + run_benchmark, \ + size_type>(state, stream, num_tlev, num_wlev, num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index b8b08957..deadd3dd 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -357,10 +357,68 @@ template struct DeviceSelectWarpSize { static constexpr unsigned value = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize - ? LogicalWarpSize - : HIPCUB_DEVICE_WARP_THREADS; + ? LogicalWarpSize + : HIPCUB_DEVICE_WARP_THREADS; }; +template +using it_value_t = typename std::iterator_traits::value_type; + +using engine_type = std::default_random_engine; + +// generate_random_data_n() generates only part of sequence and replicates it, +// because benchmarks usually do not need "true" random sequence. +template +inline auto generate_random_data_n( + OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) + -> typename std::enable_if_t>::value, OutputIter> +{ + using T = it_value_t; + + using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; +} + +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, OutputIterator> +{ + using T = typename std::iterator_traits::value_type; + + std::uniform_real_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; +} + +template +struct alignas(Alignment) custom_aligned_type +{ + unsigned char data[Size]; +}; + +template::value && std::is_unsigned::value, int> = 0> +inline constexpr auto ceiling_div(const T a, const U b) +{ + return a / b + (a % b > 0 ? 1 : 0); +} + } // end benchmark_util namespace // Need for hipcub::DeviceReduce::Min/Max etc. diff --git a/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp b/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp new file mode 100644 index 00000000..835e5dd0 --- /dev/null +++ b/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp @@ -0,0 +1,64 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_ + +#include "../../../config.hpp" + +#include + +#include + +BEGIN_HIPCUB_NAMESPACE + +struct DeviceMemcpy +{ + template + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + uint32_t num_buffers, + hipStream_t stream = 0, + bool /* debug_synchronous */ = false) + { + return hipCUDAErrorTohipError(::cub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + input_buffer_it, + output_buffer_it, + buffer_sizes, + num_buffers, + stream)); + } +}; + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/hipcub/include/hipcub/backend/cub/hipcub.hpp b/hipcub/include/hipcub/backend/cub/hipcub.hpp index 1424e201..8b70cc1a 100644 --- a/hipcub/include/hipcub/backend/cub/hipcub.hpp +++ b/hipcub/include/hipcub/backend/cub/hipcub.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2022, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,6 +53,7 @@ // hipError_t instead of cudaError_t #include "device/device_adjacent_difference.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp new file mode 100644 index 00000000..93e6fd11 --- /dev/null +++ b/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp @@ -0,0 +1,65 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ + +#include "../../../config.hpp" + +#include + +#include + +BEGIN_HIPCUB_NAMESPACE + +struct DeviceMemcpy +{ + template + static hipError_t Batched(void* d_temp_storage, + size_t& temp_storage_bytes, + InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + uint32_t num_buffers, + hipStream_t stream = 0, + bool debug_synchronous = false) + { + return rocprim::batch_memcpy(d_temp_storage, + temp_storage_bytes, + input_buffer_it, + output_buffer_it, + buffer_sizes, + num_buffers, + stream, + debug_synchronous); + } +}; + +END_HIPCUB_NAMESPACE + +#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/hipcub/include/hipcub/backend/rocprim/hipcub.hpp index 236bc9c1..6de82c45 100644 --- a/hipcub/include/hipcub/backend/rocprim/hipcub.hpp +++ b/hipcub/include/hipcub/backend/rocprim/hipcub.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2022, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +52,7 @@ // Device #include "device/device_adjacent_difference.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" diff --git a/hipcub/include/hipcub/device/device_memcpy.hpp b/hipcub/include/hipcub/device/device_memcpy.hpp new file mode 100644 index 00000000..2a7864c7 --- /dev/null +++ b/hipcub/include/hipcub/device/device_memcpy.hpp @@ -0,0 +1,38 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ + +#ifdef __HIP_PLATFORM_AMD__ + #include "../backend/rocprim/device/device_memcpy.hpp" +#elif defined(__HIP_PLATFORM_NVIDIA__) + #include "../backend/cub/device/device_memcpy.hpp" +#endif + +#endif // HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/test/hipcub/CMakeLists.txt b/test/hipcub/CMakeLists.txt index d225952e..e1d67d85 100644 --- a/test/hipcub/CMakeLists.txt +++ b/test/hipcub/CMakeLists.txt @@ -206,6 +206,7 @@ add_hipcub_test("hipcub.BlockScan" test_hipcub_block_scan.cpp) add_hipcub_test("hipcub.BlockShuffle" test_hipcub_block_shuffle.cpp) add_hipcub_test("hipcub.DeviceAdjacentDifference" test_hipcub_device_adjacent_difference.cpp) add_hipcub_test("hipcub.DeviceHistogram" test_hipcub_device_histogram.cpp) +add_hipcub_test("hipcub.DeviceMemcpy" test_hipcub_device_memcpy.cpp) add_hipcub_test("hipcub.DeviceMergeSort" test_hipcub_device_merge_sort.cpp) add_hipcub_test_parallel("hipcub.DeviceRadixSort" test_hipcub_device_radix_sort.cpp.in) add_hipcub_test("hipcub.DeviceReduce" test_hipcub_device_reduce.cpp) diff --git a/test/hipcub/test_hipcub_device_memcpy.cpp b/test/hipcub/test_hipcub_device_memcpy.cpp new file mode 100644 index 00000000..46e54cad --- /dev/null +++ b/test/hipcub/test_hipcub_device_memcpy.cpp @@ -0,0 +1,327 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "common_test_header.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "hipcub/device/device_memcpy.hpp" +#include "hipcub/thread/thread_operators.hpp" + +#include +#include + +#include +#include +#include +#include + +#include + +template +struct DeviceBatchMemcpyParams +{ + using value_type = ValueType; + using size_type = SizeType; + static constexpr bool shuffled = Shuffled; + static constexpr uint32_t num_buffers = NumBuffers; + static constexpr uint32_t max_size = MaxSize; +}; + +template +struct DeviceBatchMemcpyTests : public ::testing::Test +{ + using value_type = typename Params::value_type; + using size_type = typename Params::size_type; + static constexpr bool shuffled = Params::shuffled; + static constexpr uint32_t num_buffers = Params::num_buffers; + static constexpr uint32_t max_size = Params::max_size; +}; + +typedef ::testing::Types< + // Ignore copy/move + + // Unshuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // weird amount of buffers + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // Shuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams> + DeviceBatchMemcpyTestsParams; + +TYPED_TEST_SUITE(DeviceBatchMemcpyTests, DeviceBatchMemcpyTestsParams); + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const size_t n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + T sum = 0; + for(size_t i = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) +{ + // While on rocPRIM these can be variable via the config. CUB does not allow this. + // Therefore we assume fixed size. Otherwise we would use: + // - rocprim::batch_memcpy_config<>::wlev_size_threshold + // - rocprim::batch_memcpy_config<>::blev_size_threshold; + constexpr int32_t wlev_min_size = 128; + constexpr int32_t blev_min_size = 1024; + + constexpr int32_t num_buffers = TestFixture::num_buffers; + constexpr int32_t max_size = TestFixture::max_size; + constexpr bool shuffled = TestFixture::shuffled; + + constexpr int32_t num_tlev_buffers = num_buffers / 3; + constexpr int32_t num_wlev_buffers = num_buffers / 3; + + using value_type = typename TestFixture::value_type; + using buffer_size_type = typename TestFixture::size_type; + using buffer_offset_type = uint32_t; + using byte_offset_type = size_t; + + using value_alias = + typename std::conditional::value, + typename test_utils::inner_type::type, + value_type>::type; + + // Get random buffer sizes + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + // Total number of bytes. + byte_offset_type total_num_bytes = 0; + byte_offset_type total_num_elements = 0; + + uint32_t seed = 0; + SCOPED_TRACE(testing::Message() << "with seed= " << seed); + std::default_random_engine rng{seed}; + + for(buffer_offset_type i = 0; i < num_buffers; ++i) + { + buffer_size_type size; + if(i < num_tlev_buffers) + { + size = test_utils::get_random_value(1, wlev_min_size - 1, rng()); + } else if(i < num_tlev_buffers + num_wlev_buffers) + { + size = test_utils::get_random_value(wlev_min_size, + blev_min_size - 1, + rng()); + } else + { + size = test_utils::get_random_value(blev_min_size, max_size, rng()); + } + + // convert from number of bytes to number of elements + size = test_utils::max(1, size / sizeof(value_type)); + size = test_utils::min(size, max_size); + + h_buffer_num_elements[i] = size; + total_num_elements += size; + } + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(value_type); + } + + // And the total byte size + total_num_bytes = total_num_elements * sizeof(value_type); + + // Device pointers + value_type* d_input{}; + value_type* d_output{}; + value_type** d_buffer_srcs{}; + value_type** d_buffer_dsts{}; + buffer_size_type* d_buffer_sizes{}; + + // Calculate temporary storage + + size_t temp_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers)); + + void* d_temp_storage{}; + + // Allocate memory. + HIP_CHECK(hipMalloc(&d_input, total_num_bytes)); + HIP_CHECK(hipMalloc(&d_output, total_num_bytes)); + + HIP_CHECK(hipMalloc(&d_buffer_srcs, num_buffers * sizeof(*d_buffer_srcs))); + HIP_CHECK(hipMalloc(&d_buffer_dsts, num_buffers * sizeof(*d_buffer_dsts))); + HIP_CHECK(hipMalloc(&d_buffer_sizes, num_buffers * sizeof(*d_buffer_sizes))); + + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Generate data. + std::vector h_input + = test_utils::get_random_data(total_num_elements, + test_utils::numeric_limits::min(), + test_utils::numeric_limits::max(), + rng()); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffled) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + 0, + src_offsets.begin(), + hipcub::Sum{}); + test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + 0, + dst_offsets.begin(), + hipcub::Sum{}); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(int32_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = d_input + src_offsets[i]; + h_buffer_dsts[i] = d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK(hipMemcpy(d_input, h_input.data(), total_num_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(*d_buffer_srcs), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(*d_buffer_dsts), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(*d_buffer_sizes), + hipMemcpyHostToDevice)); + + // Run batched memcpy. + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers, + hipStreamDefault)); + // Verify results. + std::vector h_output(total_num_elements); + HIP_CHECK(hipMemcpy(h_output.data(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); + + for(int32_t i = 0; i < num_buffers; ++i) + { + for(buffer_size_type j = 0; j < h_buffer_num_elements[i]; ++j) + { + auto input_index = src_offsets[i] + j; + auto output_index = dst_offsets[i] + j; + + ASSERT_TRUE(test_utils::bit_equal(h_input[input_index], h_output[output_index])); + } + } +}