diff --git a/.clang-format b/.clang-format index 1351dcc9..bf4a9927 100644 --- a/.clang-format +++ b/.clang-format @@ -59,8 +59,6 @@ BraceWrapping: AfterStruct: true AfterUnion: true AfterExternBlock: false - BeforeCatch: true - BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 68be9cde..b370708c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,7 +45,7 @@ clang-format: stage: lint needs: [] tags: - - rocm-build + - build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" @@ -62,7 +62,7 @@ copyright-date: stage: lint needs: [] tags: - - rocm-build + - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: @@ -107,7 +107,7 @@ build:rocm: - .rules:build stage: build tags: - - rocm-build + - build needs: [] script: - cmake @@ -145,7 +145,7 @@ build:rocm-benchmark: - .rules:build stage: build tags: - - rocm-build + - build needs: [] script: - cmake @@ -304,7 +304,7 @@ build:nvcc: - .nvcc - .rules:build tags: - - nvcc-build + - build needs: [] script: - cmake @@ -339,7 +339,7 @@ build:nvcc-benchmark: - .nvcc - .rules:build tags: - - nvcc-build + - build needs: [] script: - cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index e0fc6873..950525f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ See README.md on how to build the hipCUB documentation using Doxygen. ### Fixed - Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage. +### Added +- Added interface `DeviceMemcpy::Batched` for batched memcpy from rocPRIM and CUB. + ## (Unreleased) hipCUB-2.13.1 for ROCm 5.7.0 ### Changed - CUB backend references CUB and Thrust version 2.0.1. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5fff6007..2d5e2188 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -80,6 +80,7 @@ add_hipcub_benchmark(benchmark_block_run_length_decode.cpp) add_hipcub_benchmark(benchmark_block_scan.cpp) add_hipcub_benchmark(benchmark_block_shuffle.cpp) add_hipcub_benchmark(benchmark_device_adjacent_difference.cpp) +add_hipcub_benchmark(benchmark_device_batch_memcpy.cpp) add_hipcub_benchmark(benchmark_device_histogram.cpp) add_hipcub_benchmark(benchmark_device_memory.cpp) add_hipcub_benchmark(benchmark_device_merge_sort.cpp) diff --git a/benchmark/benchmark_device_batch_memcpy.cpp b/benchmark/benchmark_device_batch_memcpy.cpp new file mode 100644 index 00000000..3d72e349 --- /dev/null +++ b/benchmark/benchmark_device_batch_memcpy.cpp @@ -0,0 +1,410 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark/benchmark.h" +#include "cmdparser.hpp" +#include "common_benchmark_header.hpp" + +#include "hipcub/block/block_load.hpp" +#include "hipcub/block/block_store.hpp" +#include "hipcub/device/device_memcpy.hpp" +#include "hipcub/hipcub.hpp" + +#ifdef __HIP_PLATFORM_AMD__ + // Only include this on AMD as it contains specialized config information + #include +#endif + +#include + +#include +#include +#include +#include +#include + +#include + +constexpr uint32_t warmup_size = 5; +constexpr int32_t max_size = 1024 * 1024; + +constexpr int32_t wlev_min_size = 128; +constexpr int32_t blev_min_size = 1024; + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! 
+// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const auto n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + for(T i = 0, sum = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +using offset_type = size_t; + +template +struct BatchMemcpyData +{ + size_t total_num_elements = 0; + ValueType* d_input = nullptr; + ValueType* d_output = nullptr; + ValueType** d_buffer_srcs = nullptr; + ValueType** d_buffer_dsts = nullptr; + BufferSizeType* d_buffer_sizes = nullptr; + + BatchMemcpyData() = default; + BatchMemcpyData(const BatchMemcpyData&) = delete; + + BatchMemcpyData(BatchMemcpyData&& other) + : total_num_elements{std::exchange(other.total_num_elements, 0)} + , d_input{std::exchange(other.d_input, nullptr)} + , d_output{std::exchange(other.d_output, nullptr)} + , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} + , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} + , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} + {} + + BatchMemcpyData& operator=(BatchMemcpyData&& other) + { + total_num_elements = std::exchange(other.total_num_elements, 0); + d_input = std::exchange(other.d_input, nullptr); + d_output = std::exchange(other.d_output, nullptr); + d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); + d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); + d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); + return *this; + }; + + BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; + + size_t total_num_bytes() const + { + return total_num_elements * sizeof(ValueType); + } + + ~BatchMemcpyData() + { + HIP_CHECK(hipFree(d_buffer_sizes)); + HIP_CHECK(hipFree(d_buffer_srcs)); + HIP_CHECK(hipFree(d_buffer_dsts)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); + } +}; + +template +BatchMemcpyData prepare_data(const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const bool shuffle_buffers = false; + + BatchMemcpyData result; + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + constexpr int32_t wlev_min_elems + = benchmark_utils::ceiling_div(wlev_min_size, sizeof(ValueType)); + constexpr int32_t blev_min_elems + = benchmark_utils::ceiling_div(blev_min_size, sizeof(ValueType)); + constexpr int32_t max_elems = max_size / sizeof(ValueType); + + // Generate data + std::mt19937_64 rng(std::random_device{}()); + + // Number of elements in each buffer. 
+ std::vector h_buffer_num_elements(num_buffers); + + auto iter = h_buffer_num_elements.begin(); + + iter = benchmark_utils::generate_random_data_n(iter, + num_tlev_buffers, + 1, + wlev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_wlev_buffers, + wlev_min_elems, + blev_min_elems - 1, + rng); + iter = benchmark_utils::generate_random_data_n(iter, + num_blev_buffers, + blev_min_elems, + max_elems, + rng); + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); + } + + result.total_num_elements + = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); + + // Generate data. + std::independent_bits_engine bits_engine{rng}; + + const size_t num_ints + = benchmark_utils::ceiling_div(result.total_num_bytes(), sizeof(uint64_t)); + auto h_input = std::make_unique(num_ints * sizeof(uint64_t)); + + std::for_each(reinterpret_cast(h_input.get()), + reinterpret_cast(h_input.get() + num_ints * sizeof(uint64_t)), + [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); + + HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); + HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); + + HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); + HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); + + // Generate the source and shuffled destination offsets. + std::vector src_offsets; + std::vector dst_offsets; + + if(shuffle_buffers) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + // Consecutive offsets (no shuffling). + // src/dst offsets first element is 0, so skip that! + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + src_offsets.begin() + 1); + std::partial_sum(h_buffer_num_elements.begin(), + h_buffer_num_elements.end() - 1, + dst_offsets.begin() + 1); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = result.d_input + src_offsets[i]; + h_buffer_dsts[i] = result.d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. 
+ HIP_CHECK( + hipMemcpy(result.d_input, h_input.get(), result.total_num_bytes(), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(ValueType*), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(result.d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(BufferSizeType), + hipMemcpyHostToDevice)); + + return result; +} + +template +void run_benchmark(benchmark::State& state, + hipStream_t stream, + const int32_t num_tlev_buffers = 1024, + const int32_t num_wlev_buffers = 1024, + const int32_t num_blev_buffers = 1024) +{ + const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; + + size_t temp_storage_bytes = 0; + BatchMemcpyData data; + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers)); + + void* d_temp_storage = nullptr; + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + data = prepare_data(num_tlev_buffers, + num_wlev_buffers, + num_blev_buffers); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + data.d_buffer_srcs, + data.d_buffer_dsts, + data.d_buffer_sizes, + num_buffers, + stream)); + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); + state.SetItemsProcessed(state.iterations() * data.total_num_elements); + + HIP_CHECK(hipFree(d_temp_storage)); +} + +#define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ + benchmark::RegisterBenchmark( \ + "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ + ",size_type:" #size_type ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ + ",num_blev:" #num_blev ",cfg:default_config}", \ + [=](benchmark::State& state) \ + { \ + run_benchmark, \ + size_type>(state, stream, num_tlev, num_wlev, num_blev); \ + }) + +#define BENCHMARK_TYPE(item_size, item_alignment) \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ + CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) + +int32_t main(int32_t argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", 1024, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + + 
parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int32_t trials = parser.get("trials"); + + // HIP + hipStream_t stream = hipStreamDefault; // default + + // Benchmark info + benchmark::AddCustomContext("size", std::to_string(size)); + + // Add benchmarks + std::vector benchmarks; + + benchmarks = {BENCHMARK_TYPE(1, 1), + BENCHMARK_TYPE(1, 2), + BENCHMARK_TYPE(1, 4), + BENCHMARK_TYPE(1, 8), + BENCHMARK_TYPE(2, 2), + BENCHMARK_TYPE(4, 4), + BENCHMARK_TYPE(8, 8)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index b8b08957..deadd3dd 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -357,10 +357,68 @@ template struct DeviceSelectWarpSize { static constexpr unsigned value = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize - ? LogicalWarpSize - : HIPCUB_DEVICE_WARP_THREADS; + ? LogicalWarpSize + : HIPCUB_DEVICE_WARP_THREADS; }; +template +using it_value_t = typename std::iterator_traits::value_type; + +using engine_type = std::default_random_engine; + +// generate_random_data_n() generates only part of sequence and replicates it, +// because benchmarks usually do not need "true" random sequence. +template +inline auto generate_random_data_n( + OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) + -> typename std::enable_if_t>::value, OutputIter> +{ + using T = it_value_t; + + using dis_type = typename std::conditional<(sizeof(T) == 1), short, T>::type; + std::uniform_int_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; +} + +template +inline auto generate_random_data_n(OutputIterator it, + size_t size, + U min, + V max, + Generator& gen, + size_t max_random_size = 1024 * 1024) + -> std::enable_if_t>::value, OutputIterator> +{ + using T = typename std::iterator_traits::value_type; + + std::uniform_real_distribution distribution((T)min, (T)max); + std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); + for(size_t i = max_random_size; i < size; i += max_random_size) + { + std::copy_n(it, std::min(size - i, max_random_size), it + i); + } + return it + size; +} + +template +struct alignas(Alignment) custom_aligned_type +{ + unsigned char data[Size]; +}; + +template::value && std::is_unsigned::value, int> = 0> +inline constexpr auto ceiling_div(const T a, const U b) +{ + return a / b + (a % b > 0 ? 1 : 0); +} + } // end benchmark_util namespace // Need for hipcub::DeviceReduce::Min/Max etc. diff --git a/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp b/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp new file mode 100644 index 00000000..835e5dd0 --- /dev/null +++ b/hipcub/include/hipcub/backend/cub/device/device_memcpy.hpp @@ -0,0 +1,64 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_
+#define HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_
+
+#include "../../../config.hpp"
+
+#include <cub/device/device_memcpy.cuh>
+
+#include <hip/hip_runtime.h>
+
+BEGIN_HIPCUB_NAMESPACE
+
+struct DeviceMemcpy
+{
+    template<class InputBufferIt, class OutputBufferIt, class BufferSizeIteratorT>
+    static hipError_t Batched(void*               d_temp_storage,
+                              size_t&             temp_storage_bytes,
+                              InputBufferIt       input_buffer_it,
+                              OutputBufferIt      output_buffer_it,
+                              BufferSizeIteratorT buffer_sizes,
+                              uint32_t            num_buffers,
+                              hipStream_t         stream = 0,
+                              bool /* debug_synchronous */ = false)
+    {
+        return hipCUDAErrorTohipError(::cub::DeviceMemcpy::Batched(d_temp_storage,
+                                                                   temp_storage_bytes,
+                                                                   input_buffer_it,
+                                                                   output_buffer_it,
+                                                                   buffer_sizes,
+                                                                   num_buffers,
+                                                                   stream));
+    }
+};
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_CUB_DEVICE_DEVICE_MEMCPY_HPP_
diff --git a/hipcub/include/hipcub/backend/cub/hipcub.hpp b/hipcub/include/hipcub/backend/cub/hipcub.hpp
index 1424e201..8b70cc1a 100644
--- a/hipcub/include/hipcub/backend/cub/hipcub.hpp
+++ b/hipcub/include/hipcub/backend/cub/hipcub.hpp
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,6 +53,7 @@ // hipError_t instead of cudaError_t #include "device/device_adjacent_difference.hpp" #include "device/device_histogram.hpp" +#include "device/device_memcpy.hpp" #include "device/device_merge_sort.hpp" #include "device/device_partition.hpp" #include "device/device_radix_sort.hpp" diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp new file mode 100644 index 00000000..93e6fd11 --- /dev/null +++ b/hipcub/include/hipcub/backend/rocprim/device/device_memcpy.hpp @@ -0,0 +1,65 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+#ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_
+#define HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_
+
+#include "../../../config.hpp"
+
+#include <rocprim/device/device_memcpy.hpp>
+
+#include <hip/hip_runtime.h>
+
+BEGIN_HIPCUB_NAMESPACE
+
+struct DeviceMemcpy
+{
+    template<class InputBufferIt, class OutputBufferIt, class BufferSizeIteratorT>
+    static hipError_t Batched(void*               d_temp_storage,
+                              size_t&             temp_storage_bytes,
+                              InputBufferIt       input_buffer_it,
+                              OutputBufferIt      output_buffer_it,
+                              BufferSizeIteratorT buffer_sizes,
+                              uint32_t            num_buffers,
+                              hipStream_t         stream = 0,
+                              bool                debug_synchronous = false)
+    {
+        return rocprim::batch_memcpy(d_temp_storage,
+                                     temp_storage_bytes,
+                                     input_buffer_it,
+                                     output_buffer_it,
+                                     buffer_sizes,
+                                     num_buffers,
+                                     stream,
+                                     debug_synchronous);
+    }
+};
+
+END_HIPCUB_NAMESPACE
+
+#endif // HIPCUB_ROCPRIM_DEVICE_DEVICE_MEMCPY_HPP_
diff --git a/hipcub/include/hipcub/backend/rocprim/hipcub.hpp b/hipcub/include/hipcub/backend/rocprim/hipcub.hpp
index 236bc9c1..6de82c45 100644
--- a/hipcub/include/hipcub/backend/rocprim/hipcub.hpp
+++ b/hipcub/include/hipcub/backend/rocprim/hipcub.hpp
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -52,6 +52,7 @@
 // Device
 #include "device/device_adjacent_difference.hpp"
 #include "device/device_histogram.hpp"
+#include "device/device_memcpy.hpp"
 #include "device/device_merge_sort.hpp"
 #include "device/device_partition.hpp"
 #include "device/device_radix_sort.hpp"
diff --git a/hipcub/include/hipcub/device/device_memcpy.hpp b/hipcub/include/hipcub/device/device_memcpy.hpp
new file mode 100644
index 00000000..2a7864c7
--- /dev/null
+++ b/hipcub/include/hipcub/device/device_memcpy.hpp
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ +#define HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ + +#ifdef __HIP_PLATFORM_AMD__ + #include "../backend/rocprim/device/device_memcpy.hpp" +#elif defined(__HIP_PLATFORM_NVIDIA__) + #include "../backend/cub/device/device_memcpy.hpp" +#endif + +#endif // HIPCUB_DEVICE_DEVICE_MEMCPY_HPP_ diff --git a/test/hipcub/CMakeLists.txt b/test/hipcub/CMakeLists.txt index d225952e..e1d67d85 100644 --- a/test/hipcub/CMakeLists.txt +++ b/test/hipcub/CMakeLists.txt @@ -206,6 +206,7 @@ add_hipcub_test("hipcub.BlockScan" test_hipcub_block_scan.cpp) add_hipcub_test("hipcub.BlockShuffle" test_hipcub_block_shuffle.cpp) add_hipcub_test("hipcub.DeviceAdjacentDifference" test_hipcub_device_adjacent_difference.cpp) add_hipcub_test("hipcub.DeviceHistogram" test_hipcub_device_histogram.cpp) +add_hipcub_test("hipcub.DeviceMemcpy" test_hipcub_device_memcpy.cpp) add_hipcub_test("hipcub.DeviceMergeSort" test_hipcub_device_merge_sort.cpp) add_hipcub_test_parallel("hipcub.DeviceRadixSort" test_hipcub_device_radix_sort.cpp.in) add_hipcub_test("hipcub.DeviceReduce" test_hipcub_device_reduce.cpp) diff --git a/test/hipcub/test_hipcub_device_memcpy.cpp b/test/hipcub/test_hipcub_device_memcpy.cpp new file mode 100644 index 00000000..46e54cad --- /dev/null +++ b/test/hipcub/test_hipcub_device_memcpy.cpp @@ -0,0 +1,327 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "common_test_header.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "hipcub/device/device_memcpy.hpp" +#include "hipcub/thread/thread_operators.hpp" + +#include +#include + +#include +#include +#include +#include + +#include + +template +struct DeviceBatchMemcpyParams +{ + using value_type = ValueType; + using size_type = SizeType; + static constexpr bool shuffled = Shuffled; + static constexpr uint32_t num_buffers = NumBuffers; + static constexpr uint32_t max_size = MaxSize; +}; + +template +struct DeviceBatchMemcpyTests : public ::testing::Test +{ + using value_type = typename Params::value_type; + using size_type = typename Params::size_type; + static constexpr bool shuffled = Params::shuffled; + static constexpr uint32_t num_buffers = Params::num_buffers; + static constexpr uint32_t max_size = Params::max_size; +}; + +typedef ::testing::Types< + // Ignore copy/move + + // Unshuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // weird amount of buffers + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + + // Shuffled inputs and outputs + // Variable value_type + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams, + // size_type: uint16_t + DeviceBatchMemcpyParams, + // size_type: int64_t + DeviceBatchMemcpyParams, + DeviceBatchMemcpyParams> + DeviceBatchMemcpyTestsParams; + +TYPED_TEST_SUITE(DeviceBatchMemcpyTests, DeviceBatchMemcpyTestsParams); + +// Used for generating offsets. We generate a permutation map and then derive +// offsets via a sum scan over the sizes in the order of the permutation. This +// allows us to keep the order of buffers we pass to batch_memcpy, but still +// have source and destinations mappings not be the identity function: +// +// batch_memcpy( +// [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) +// [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) +// [3 , 2 , 1 , 2 ]) // size +// +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d +// └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! +// ───┬─── ─────┬───── ───┬─── ─── +// └─────────┼─────────┼───┐ +// ┌───┘ ┌───┘ │ what batch_memcpy does +// ▼ ▼ ▼ +// ─── ─────────── ─────── ─────── +// ┌───┬───┬───┬───┬───┬───┬───┬───┐ +// │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' +// └───┴───┴───┴───┴───┴───┴───┴───┘ +template +std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) +{ + const size_t n = input.size(); + assert(n > 0); + + std::vector result(n); + std::vector permute(n); + + std::iota(permute.begin(), permute.end(), 0); + std::shuffle(permute.begin(), permute.end(), rng); + + T sum = 0; + for(size_t i = 0; i < n; ++i) + { + result[permute[i]] = sum; + sum += input[permute[i]]; + } + + return result; +} + +TYPED_TEST(DeviceBatchMemcpyTests, SizeAndTypeVariation) +{ + // While on rocPRIM these can be variable via the config. CUB does not allow this. + // Therefore we assume fixed size. 
Otherwise we would use: + // - rocprim::batch_memcpy_config<>::wlev_size_threshold + // - rocprim::batch_memcpy_config<>::blev_size_threshold; + constexpr int32_t wlev_min_size = 128; + constexpr int32_t blev_min_size = 1024; + + constexpr int32_t num_buffers = TestFixture::num_buffers; + constexpr int32_t max_size = TestFixture::max_size; + constexpr bool shuffled = TestFixture::shuffled; + + constexpr int32_t num_tlev_buffers = num_buffers / 3; + constexpr int32_t num_wlev_buffers = num_buffers / 3; + + using value_type = typename TestFixture::value_type; + using buffer_size_type = typename TestFixture::size_type; + using buffer_offset_type = uint32_t; + using byte_offset_type = size_t; + + using value_alias = + typename std::conditional::value, + typename test_utils::inner_type::type, + value_type>::type; + + // Get random buffer sizes + + // Number of elements in each buffer. + std::vector h_buffer_num_elements(num_buffers); + + // Total number of bytes. + byte_offset_type total_num_bytes = 0; + byte_offset_type total_num_elements = 0; + + uint32_t seed = 0; + SCOPED_TRACE(testing::Message() << "with seed= " << seed); + std::default_random_engine rng{seed}; + + for(buffer_offset_type i = 0; i < num_buffers; ++i) + { + buffer_size_type size; + if(i < num_tlev_buffers) + { + size = test_utils::get_random_value(1, wlev_min_size - 1, rng()); + } else if(i < num_tlev_buffers + num_wlev_buffers) + { + size = test_utils::get_random_value(wlev_min_size, + blev_min_size - 1, + rng()); + } else + { + size = test_utils::get_random_value(blev_min_size, max_size, rng()); + } + + // convert from number of bytes to number of elements + size = test_utils::max(1, size / sizeof(value_type)); + size = test_utils::min(size, max_size); + + h_buffer_num_elements[i] = size; + total_num_elements += size; + } + + // Shuffle the sizes so that size classes aren't clustered + std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); + + // Get the byte size of each buffer + std::vector h_buffer_num_bytes(num_buffers); + for(size_t i = 0; i < num_buffers; ++i) + { + h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(value_type); + } + + // And the total byte size + total_num_bytes = total_num_elements * sizeof(value_type); + + // Device pointers + value_type* d_input{}; + value_type* d_output{}; + value_type** d_buffer_srcs{}; + value_type** d_buffer_dsts{}; + buffer_size_type* d_buffer_sizes{}; + + // Calculate temporary storage + + size_t temp_storage_bytes = 0; + + HIP_CHECK(hipcub::DeviceMemcpy::Batched(nullptr, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers)); + + void* d_temp_storage{}; + + // Allocate memory. + HIP_CHECK(hipMalloc(&d_input, total_num_bytes)); + HIP_CHECK(hipMalloc(&d_output, total_num_bytes)); + + HIP_CHECK(hipMalloc(&d_buffer_srcs, num_buffers * sizeof(*d_buffer_srcs))); + HIP_CHECK(hipMalloc(&d_buffer_dsts, num_buffers * sizeof(*d_buffer_dsts))); + HIP_CHECK(hipMalloc(&d_buffer_sizes, num_buffers * sizeof(*d_buffer_sizes))); + + HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); + + // Generate data. + std::vector h_input + = test_utils::get_random_data(total_num_elements, + test_utils::numeric_limits::min(), + test_utils::numeric_limits::max(), + rng()); + + // Generate the source and shuffled destination offsets. 
+ std::vector src_offsets; + std::vector dst_offsets; + + if(shuffled) + { + src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); + } else + { + src_offsets = std::vector(num_buffers); + dst_offsets = std::vector(num_buffers); + + test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + 0, + src_offsets.begin(), + hipcub::Sum{}); + test_utils::host_exclusive_scan(h_buffer_num_elements.begin(), + h_buffer_num_elements.end(), + 0, + dst_offsets.begin(), + hipcub::Sum{}); + } + + // Generate the source and destination pointers. + std::vector h_buffer_srcs(num_buffers); + std::vector h_buffer_dsts(num_buffers); + + for(int32_t i = 0; i < num_buffers; ++i) + { + h_buffer_srcs[i] = d_input + src_offsets[i]; + h_buffer_dsts[i] = d_output + dst_offsets[i]; + } + + // Prepare the batch memcpy. + HIP_CHECK(hipMemcpy(d_input, h_input.data(), total_num_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_srcs, + h_buffer_srcs.data(), + h_buffer_srcs.size() * sizeof(*d_buffer_srcs), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_dsts, + h_buffer_dsts.data(), + h_buffer_dsts.size() * sizeof(*d_buffer_dsts), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_buffer_sizes, + h_buffer_num_bytes.data(), + h_buffer_num_bytes.size() * sizeof(*d_buffer_sizes), + hipMemcpyHostToDevice)); + + // Run batched memcpy. + HIP_CHECK(hipcub::DeviceMemcpy::Batched(d_temp_storage, + temp_storage_bytes, + d_buffer_srcs, + d_buffer_dsts, + d_buffer_sizes, + num_buffers, + hipStreamDefault)); + // Verify results. + std::vector h_output(total_num_elements); + HIP_CHECK(hipMemcpy(h_output.data(), d_output, total_num_bytes, hipMemcpyDeviceToHost)); + + for(int32_t i = 0; i < num_buffers; ++i) + { + for(buffer_size_type j = 0; j < h_buffer_num_elements[i]; ++j) + { + auto input_index = src_offsets[i] + j; + auto output_index = dst_offsets[i] + j; + + ASSERT_TRUE(test_utils::bit_equal(h_input[input_index], h_output[output_index])); + } + } +}
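
The new hipcub::DeviceMemcpy::Batched interface follows the same two-phase pattern the benchmark and test above exercise: a first call with a null temporary-storage pointer reports the required scratch size, and a second call performs every copy described by the pointer and size arrays. A minimal sketch, assuming d_srcs, d_dsts, d_sizes and num_buffers (illustrative names, not taken from this diff) already describe device-resident source pointers, destination pointers and per-buffer byte counts; error checking is omitted:

    // Query the temporary storage required by the batched copy.
    size_t temp_storage_bytes = 0;
    hipcub::DeviceMemcpy::Batched(nullptr, temp_storage_bytes,
                                  d_srcs, d_dsts, d_sizes, num_buffers);

    // Allocate the scratch space, then issue all num_buffers copies in one call.
    void* d_temp_storage = nullptr;
    hipMalloc(&d_temp_storage, temp_storage_bytes);
    hipcub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes,
                                  d_srcs, d_dsts, d_sizes, num_buffers,
                                  hipStreamDefault);
    hipFree(d_temp_storage);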