From 6cb593806d95a6579c345fae9173ecb0b22be548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Tue, 19 Mar 2024 20:59:59 +0100 Subject: [PATCH] StreamHPC 2024-01-16 (#327) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor(device_scan.hpp): use hipCUB for sum by key instead of explicitly calling rocPRIM * Add const qualifier to operator overloading methods of iterators * fix(device_scan.hpp): derive internal accumulator type from scan operator instead of using the input iterator type * docs(changelog.md): document the change in accumulator type for device scan * style(device_scan.hpp): update copyright date * style(device_scan.hpp): improve styling * fix: use public function 'rocprim::invoke_result' instead of 'rocprim::detail::invoke_result' * build: Update CUB/Thrust/libcu++ to 2.2.0 * build: Respect CMAKE_CUDA_FLAGS on the NVIDIA platform Previously hipCUB would ignore CMAKE_CUDA_FLAGS specified by the user. This was probably a workaround to the broken logic that would repeat these flags multiple times when re-configuring the project. I fixed the same issue in rocRAND by not running this code on subsequent configurations. Use the same solution here too. * build(test): Don't mark cmake generated source files to be cleaned This resulted on them being deleted by `make/ninja clean`, and would not be re-generated until the next configuration of cmake, resulting in compilation errors when trying to build after the clean. When the parallelization is changed this might result in some lose files sticking around, but they should not cause harm. * docs(CHANGELOG.md): Add update to CUB version to the changelog [CI skip] * build: Fix NVCC warnings Fix some NVCC and GCC warnings. Some of these were false positives, but the resulting code is cleaner IMO, so it's not worth fighting against the tools. Some warnings remain in ROCm 6.0 coming from the HIP headers. * fix: update github urls to point to new ROCm organization * docs: fixed changelog style * ci: Specifying SPHINX_DIR * build: Updated version in CMakeLists and Changelog --------- Co-authored-by: Nara Prasetya Co-authored-by: Jaap Co-authored-by: Gergely Mészáros --- .gitlab-ci.yml | 2 + .jenkins/precheckin.groovy | 2 +- .jenkins/staticanalysis.groovy | 2 +- CHANGELOG.md | 8 + CMakeLists.txt | 2 +- README.md | 1 - .../benchmark_device_segmented_reduce.cpp | 4 +- benchmark/benchmark_device_select.cpp | 8 +- cmake/Dependencies.cmake | 83 ++++----- cmake/SetupNVCC.cmake | 40 +++-- hipcub/CMakeLists.txt | 10 +- .../backend/rocprim/device/device_scan.hpp | 163 +++++++++++++----- .../device/device_segmented_reduce.hpp | 5 +- .../cache_modified_input_iterator.hpp | 6 +- .../cache_modified_output_iterator.hpp | 6 +- .../iterator/discard_output_iterator.hpp | 6 +- .../rocprim/thread/thread_operators.hpp | 8 +- test/extra/CMakeLists.txt | 94 ++++------ test/hipcub/CMakeLists.txt | 3 +- test/hipcub/identity_iterator.hpp | 5 +- .../test_hipcub_block_run_length_decode.cpp | 5 +- test/hipcub/test_hipcub_block_shuffle.cpp | 20 ++- test/hipcub/test_hipcub_warp_reduce.cpp | 4 +- 23 files changed, 272 insertions(+), 215 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b370708c..2ba53d62 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -411,6 +411,8 @@ test:nvcc_install: test:doc: stage: test + variables: + SPHINX_DIR: $DOCS_DIR/sphinx extends: - .rules:test - .build:docs diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index 573ac33a..7fdcc3e8 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -1,5 +1,5 @@ #!/usr/bin/env groovy -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ +// This shared library is available at https://github.com/ROCm/rocJENKINS/ @Library('rocJenkins@pong') _ // This file is for internal AMD use. diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy index 0dbf1bc6..5c0b9490 100644 --- a/.jenkins/staticanalysis.groovy +++ b/.jenkins/staticanalysis.groovy @@ -1,5 +1,5 @@ #!/usr/bin/env groovy -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ +// This shared library is available at https://github.com/ROCm/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d61493f..aecb85b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ Documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/). +## (Unreleased) hipCUB-3.2.0 for ROCm 6.2.0 + +### Fixed + +* Fixed the derivation for the accumulator type for device scan algorithms in the rocPRIM backend being different compared to CUB. + It now derives the accumulator type as the result of the binary operator. +* The NVIDIA backend now requires CUB, Thrust and libcu++ 2.2.0. If it is not found it will be downloaded from the NVIDIA CCCL repository. + ## (Unreleased) hipCUB-3.1.0 for ROCm 6.1.0 ### Changes diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a897165..516b1598 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ if(BUILD_ADDRESS_SANITIZER) endif() # Setup VERSION -set(VERSION_STRING "3.0.0") +set(VERSION_STRING "3.2.0") rocm_setup_version(VERSION ${VERSION_STRING}) # Print configuration summary diff --git a/README.md b/README.md index 1459d001..adb777c4 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,6 @@ python3 -m http.server * The [rocPRIM](https://github.com/ROCm/rocPRIM) library * Automatically downloaded and built by the CMake script * Requires CMake 3.16.9 or later - * For NVIDIA GPUs: * CUDA Toolkit * CUB library diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 43649efc..e00e2a86 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -90,8 +90,6 @@ void run_benchmark(benchmark::State& state, OutputT * d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); - hipcub::Sum reduce_op; - void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index c3b9f36d..d1617c79 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -386,8 +386,8 @@ void run_unique_by_key_benchmark(benchmark::State& state, } } - const auto input_values = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); - unsigned int selected_count_output = 0; + const auto input_values + = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); KeyT* d_keys_input; ValueT* d_values_input; @@ -399,7 +399,7 @@ void run_unique_by_key_benchmark(benchmark::State& state, HIP_CHECK(hipMalloc(&d_values_input, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(selected_count_output))); + HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); HIP_CHECK( hipMemcpy( diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index c1738491..edcfe835 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -139,62 +139,49 @@ endif(USER_BUILD_BENCHMARK) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - + set(CCCL_MINIMUM_VERSION 2.2.0) if(NOT DOWNLOAD_CUB) - find_package(cub QUIET) - find_package(thrust QUIET) + find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG) + find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG) + find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG) endif() - if(NOT DEFINED CUB_INCLUDE_DIR) - file( - DOWNLOAD https://github.com/NVIDIA/cub/archive/2.1.0.zip - ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip - STATUS cub_download_status LOG cub_download_log - ) - list(GET cub_download_status 0 cub_download_error_code) - if(cub_download_error_code) - message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/cub/archive/2.1.0.zip failed " - "error_code: ${cub_download_error_code} " - "log: ${cub_download_log} " - ) + if (NOT CUB_FOUND OR NOT Thrust_FOUND OR NOT libcudacxx_FOUND) + if(CUB_FOUND OR Thrust_FOUND OR libcudacxx_FOUND) + message(WARNING "Found one of CUB, Thrust or libcu++, but not all of them. + This can lead to mixing different potentially incompatible versions.") endif() - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - RESULT_VARIABLE cub_unpack_error_code - ) - if(cub_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip failed") - endif() - set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0/ CACHE PATH "") - endif() + message(STATUS "CUB, Thrust or libcu++ not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}") + file(DOWNLOAD https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip + ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip + STATUS cccl_download_status LOG cccl_download_log) - if(NOT DEFINED THRUST_INCLUDE_DIR) - file( - DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.1.0.zip - ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip - STATUS thrust_download_status LOG thrust_download_log - ) - list(GET thrust_download_status 0 thrust_download_error_code) - if(thrust_download_error_code) + list(GET cccl_download_status 0 cccl_download_error_code) + if(cccl_download_error_code) message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/thrust/archive/2.1.0.zip failed " - "error_code: ${thrust_download_error_code} " - "log: ${thrust_download_log} " - ) + "https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip failed " + "error_code: ${cccl_download_error_code} " + "log: ${cccl_download_log}") endif() - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - RESULT_VARIABLE thrust_unpack_error_code - ) - if(thrust_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip failed") + if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + file(ARCHIVE_EXTRACT INPUT ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip) + else() + execute_process(COMMAND "${CMAKE_COMMAND}" -E tar xf ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + RESULT_VARIABLE cccl_unpack_error_code) + if(cccl_unpack_error_code) + message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip failed") + endif() endif() - set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0/ CACHE PATH "") + + find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/cub) + find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/thrust) + find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/libcudacxx) endif() else() # rocPRIM (only for ROCm platform) @@ -206,7 +193,7 @@ else() message(STATUS "rocPRIM not found. Fetching...") FetchContent_Declare( prim - GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git + GIT_REPOSITORY https://github.com/ROCm/rocPRIM.git GIT_TAG develop ) FetchContent_MakeAvailable(prim) diff --git a/cmake/SetupNVCC.cmake b/cmake/SetupNVCC.cmake index 1b84996c..5ec377da 100644 --- a/cmake/SetupNVCC.cmake +++ b/cmake/SetupNVCC.cmake @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -98,26 +98,30 @@ set(NVGPU_TARGETS "${DEFAULT_NVGPU_TARGETS}" CACHE STRING "List of NVIDIA GPU targets (compute capabilities), for example \"35;50\"" ) set(CMAKE_CUDA_ARCHITECTURES ${NVGPU_TARGETS}) -# Generate compiler flags based on targeted CUDA architectures if CMake doesn't. (Controlled by policy CP0104, on by default after 3.18) -if(CMAKE_VERSION VERSION_LESS "3.18") - foreach(CUDA_ARCH ${NVGPU_TARGETS}) - list(APPEND HIP_NVCC_FLAGS "--generate-code arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH} ") - list(APPEND HIP_NVCC_FLAGS "--generate-code arch=compute_${CUDA_ARCH},code=compute_${CUDA_ARCH} ") - endforeach() -endif() -execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --cpp_config - OUTPUT_VARIABLE HIP_CPP_CONFIG_FLAGS - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE -) +if (NOT _HIPCUB_HIP_NVCC_FLAGS_SET) + execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --cpp_config + OUTPUT_VARIABLE HIP_CPP_CONFIG_FLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE + ) -# Update list parameter -string(REPLACE ";" " " HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS}) + # Generate compiler flags based on targeted CUDA architectures if CMake doesn't. (Controlled by policy CP0104, on by default after 3.18) + if(CMAKE_VERSION VERSION_LESS "3.18") + foreach(CUDA_ARCH ${NVGPU_TARGETS}) + list(APPEND HIP_NVCC_FLAGS "--generate-code" "arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}") + list(APPEND HIP_NVCC_FLAGS "--generate-code" "arch=compute_${CUDA_ARCH},code=compute_${CUDA_ARCH}") + endforeach() + endif() + + # Update list parameter + list(JOIN HIP_NVCC_FLAGS " " HIP_NVCC_FLAGS) -set(CMAKE_CUDA_FLAGS "${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}" - CACHE STRING "Cuda compile flags" FORCE) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}" + CACHE STRING "Cuda compile flags" FORCE) + set(_HIPCUB_HIP_NVCC_FLAGS_SET ON CACHE INTERNAL "") +endif() # Ignore warnings about #pragma unroll # and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h diff --git a/hipcub/CMakeLists.txt b/hipcub/CMakeLists.txt index 1ac64fcd..67a348e3 100644 --- a/hipcub/CMakeLists.txt +++ b/hipcub/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -61,13 +61,7 @@ else() # hipcub_cub target is only for internal use. add_library(hipcub_cub INTERFACE) target_link_libraries(hipcub_cub - INTERFACE - hipcub - ) - target_include_directories(hipcub_cub - INTERFACE - ${CUB_INCLUDE_DIR} - ${THRUST_INCLUDE_DIR} + INTERFACE hipcub CUB::CUB Thrust::Thrust libcudacxx::libcudacxx ) endif() diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp index 4af294a4..6966cf09 100644 --- a/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp +++ b/hipcub/include/hipcub/backend/rocprim/device/device_scan.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,13 +30,14 @@ #ifndef HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_ #define HIPCUB_ROCPRIM_DEVICE_DEVICE_SCAN_HPP_ -#include #include "../../../config.hpp" - #include "../thread/thread_operators.hpp" +#include #include #include +#include +#include BEGIN_HIPCUB_NAMESPACE @@ -78,12 +79,22 @@ class DeviceScan hipStream_t stream = 0, bool debug_synchronous = false) { - return ::rocprim::inclusive_scan( - d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, - scan_op, - stream, debug_synchronous - ); + using acc_t = ::rocprim::invoke_result_binary_op_t< + typename std::iterator_traits::value_type, + ScanOpT>; + + return ::rocprim::inclusive_scan<::rocprim::default_config, + InputIteratorT, + OutputIteratorT, + ScanOpT, + acc_t>(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + scan_op, + stream, + debug_synchronous); } template < @@ -124,12 +135,24 @@ class DeviceScan hipStream_t stream = 0, bool debug_synchronous = false) { - return ::rocprim::exclusive_scan( - d_temp_storage, temp_storage_bytes, - d_in, d_out, init_value, num_items, - scan_op, - stream, debug_synchronous - ); + using acc_t + = ::rocprim::invoke_result_binary_op_t, + ScanOpT>; + + return ::rocprim::exclusive_scan<::rocprim::default_config, + InputIteratorT, + OutputIteratorT, + InitValueT, + ScanOpT, + acc_t>(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + init_value, + num_items, + scan_op, + stream, + debug_synchronous); } template < @@ -150,12 +173,24 @@ class DeviceScan hipStream_t stream = 0, bool debug_synchronous = false) { - return ::rocprim::exclusive_scan( - d_temp_storage, temp_storage_bytes, - d_in, d_out, init_value, num_items, - scan_op, - stream, debug_synchronous - ); + using acc_t + = ::rocprim::invoke_result_binary_op_t, + ScanOpT>; + + return ::rocprim::exclusive_scan<::rocprim::default_config, + InputIteratorT, + OutputIteratorT, + InitValueT, + ScanOpT, + acc_t>(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + init_value, + num_items, + scan_op, + stream, + debug_synchronous); } template < @@ -177,12 +212,17 @@ class DeviceScan { using in_value_type = typename std::iterator_traits::value_type; - return ::rocprim::exclusive_scan_by_key( - d_temp_storage, temp_storage_bytes, - d_keys_in, d_values_in, d_values_out, - static_cast(0), static_cast(num_items), - ::hipcub::Sum(), equality_op, stream, debug_synchronous - ); + return ExclusiveScanByKey(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + ::hipcub::Sum(), + static_cast(0), + static_cast(num_items), + equality_op, + stream, + debug_synchronous); } template < @@ -206,12 +246,27 @@ class DeviceScan hipStream_t stream = 0, bool debug_synchronous = false) { - return ::rocprim::exclusive_scan_by_key( - d_temp_storage, temp_storage_bytes, - d_keys_in, d_values_in, d_values_out, - init_value, static_cast(num_items), - scan_op, equality_op, stream, debug_synchronous - ); + using acc_t = rocprim::invoke_result_binary_op_t, + ScanOpT>; + + return ::rocprim::exclusive_scan_by_key<::rocprim::default_config, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + InitValueT, + ScanOpT, + EqualityOpT, + acc_t>(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + init_value, + static_cast(num_items), + scan_op, + equality_op, + stream, + debug_synchronous); } template < @@ -231,12 +286,16 @@ class DeviceScan hipStream_t stream = 0, bool debug_synchronous = false) { - return ::rocprim::inclusive_scan_by_key( - d_temp_storage, temp_storage_bytes, - d_keys_in, d_values_in, d_values_out, - static_cast(num_items), ::hipcub::Sum(), - equality_op, stream, debug_synchronous - ); + return InclusiveScanByKey(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + ::hipcub::Sum(), + num_items, + equality_op, + stream, + debug_synchronous); } template < @@ -258,12 +317,26 @@ class DeviceScan hipStream_t stream = 0, bool debug_synchronous = false) { - return ::rocprim::inclusive_scan_by_key( - d_temp_storage, temp_storage_bytes, - d_keys_in, d_values_in, d_values_out, - static_cast(num_items), scan_op, - equality_op, stream, debug_synchronous - ); + using acc_t = ::rocprim::invoke_result_binary_op_t< + typename std::iterator_traits::value_type, + ScanOpT>; + + return ::rocprim::inclusive_scan_by_key<::rocprim::default_config, + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + ScanOpT, + EqualityOpT, + acc_t>(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_values_out, + static_cast(num_items), + scan_op, + equality_op, + stream, + debug_synchronous); } }; diff --git a/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp b/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp index 74fcbb23..87ee725f 100644 --- a/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp +++ b/hipcub/include/hipcub/backend/rocprim/device/device_segmented_reduce.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include "../iterator/arg_index_input_iterator.hpp" #include "../thread/thread_operators.hpp" #include "device_reduce.hpp" +#include "rocprim/type_traits.hpp" #include @@ -131,7 +132,7 @@ inline hipError_t segmented_arg_minmax(void* temporary_storage, { using input_type = typename std::iterator_traits::value_type; using result_type = - typename ::rocprim::detail::match_result_type::type; + typename ::rocprim::invoke_result_binary_op::type; using config = ::rocprim::detail::wrapped_reduce_config; diff --git a/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_input_iterator.hpp b/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_input_iterator.hpp index 156a4191..673db63b 100644 --- a/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_input_iterator.hpp +++ b/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_input_iterator.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -145,13 +145,13 @@ class CacheModifiedInputIterator } /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const { return (ptr == rhs.ptr); } /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const { return (ptr != rhs.ptr); } diff --git a/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_output_iterator.hpp b/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_output_iterator.hpp index e330d25f..c9e7ebff 100644 --- a/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_output_iterator.hpp +++ b/hipcub/include/hipcub/backend/rocprim/iterator/cache_modified_output_iterator.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -162,13 +162,13 @@ class CacheModifiedOutputIterator } /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const { return (ptr == rhs.ptr); } /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const { return (ptr != rhs.ptr); } diff --git a/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp b/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp index b0c835c3..3254c4b1 100644 --- a/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp +++ b/hipcub/include/hipcub/backend/rocprim/iterator/discard_output_iterator.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2020-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -201,7 +201,7 @@ class DiscardOutputIterator * @typedef self_type * @brief Equal to */ - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const { return (offset == rhs.offset); } @@ -210,7 +210,7 @@ class DiscardOutputIterator * @typedef self_type * @brief Not equal to */ - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const { return (offset != rhs.offset); } diff --git a/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp b/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp index 5297770a..028796fa 100644 --- a/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp +++ b/hipcub/include/hipcub/backend/rocprim/thread/thread_operators.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2017-2023, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2017-2024, Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,7 @@ #include "../util_type.hpp" -#include +#include BEGIN_HIPCUB_NAMESPACE @@ -276,7 +276,7 @@ using non_void_value_t = // Invoke result type. template -using invoke_result_t = typename ::rocprim::detail::invoke_result::type; +using invoke_result_t = ::rocprim::invoke_result_t; /// Intermediate accumulator type. template @@ -293,7 +293,7 @@ using accumulator_t = std::decay_t>; // rocPRIM (as well as Thrust) uses result type of BinaryFunction instead (if not void): // // using input_type = typename std::iterator_traits::value_type; -// using result_type = typename ::rocprim::detail::match_result_type< +// using result_type = typename ::rocprim::invoke_result_binary_op< // input_type, BinaryFunction // >::type; // diff --git a/test/extra/CMakeLists.txt b/test/extra/CMakeLists.txt index 93329abd..5602171e 100644 --- a/test/extra/CMakeLists.txt +++ b/test/extra/CMakeLists.txt @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -42,62 +42,49 @@ include(VerifyCompiler) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") - + set(CCCL_MINIMUM_VERSION 2.2.0) if(NOT DOWNLOAD_CUB) - find_package(cub QUIET) - find_package(thrust QUIET) + find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG) + find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG) + find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG) endif() - if(NOT DEFINED CUB_INCLUDE_DIR) - file( - DOWNLOAD https://github.com/NVIDIA/cub/archive/2.1.0.zip - ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip - STATUS cub_download_status LOG cub_download_log - ) - list(GET cub_download_status 0 cub_download_error_code) - if(cub_download_error_code) - message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/cub/archive/2.1.0.zip failed " - "error_code: ${cub_download_error_code} " - "log: ${cub_download_log} " - ) + if (NOT CUB_FOUND OR NOT Thrust_FOUND OR NOT libcudacxx_FOUND) + if(CUB_FOUND OR Thrust_FOUND OR libcudacxx_FOUND) + message(WARNING "Found one of CUB, Thrust or libcu++, but not all of them. + This can lead to mixing different potentially incompatible versions.") endif() - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - RESULT_VARIABLE cub_unpack_error_code - ) - if(cub_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip failed") - endif() - set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0/ CACHE PATH "") - endif() + message(STATUS "CUB, Thrust or libcu++ not found, downloading and extracting CCCL ${CCCL_MINIMUM_VERSION}") + file(DOWNLOAD https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip + ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip + STATUS cccl_download_status LOG cccl_download_log) - if(NOT DEFINED THRUST_INCLUDE_DIR) - file( - DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.1.0.zip - ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip - STATUS thrust_download_status LOG thrust_download_log - ) - list(GET thrust_download_status 0 thrust_download_error_code) - if(thrust_download_error_code) + list(GET cccl_download_status 0 cccl_download_error_code) + if(cccl_download_error_code) message(FATAL_ERROR "Error: downloading " - "https://github.com/NVIDIA/thrust/archive/2.1.0.zip failed " - "error_code: ${thrust_download_error_code} " - "log: ${thrust_download_log} " - ) + "https://github.com/NVIDIA/cccl/archive/refs/tags/v${CCCL_MINIMUM_VERSION}.zip failed " + "error_code: ${cccl_download_error_code} " + "log: ${cccl_download_log}") endif() - execute_process( - COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - RESULT_VARIABLE thrust_unpack_error_code - ) - if(thrust_unpack_error_code) - message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip failed") + if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + file(ARCHIVE_EXTRACT INPUT ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip) + else() + execute_process(COMMAND "${CMAKE_COMMAND}" -E tar xf ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + RESULT_VARIABLE cccl_unpack_error_code) + if(cccl_unpack_error_code) + message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}.zip failed") + endif() endif() - set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0/ CACHE PATH "") + + find_package(CUB ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/cub) + find_package(Thrust ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/thrust) + find_package(libcudacxx ${CCCL_MINIMUM_VERSION} CONFIG REQUIRED NO_DEFAULT_PATH + PATHS ${CMAKE_CURRENT_BINARY_DIR}/cccl-${CCCL_MINIMUM_VERSION}/libcudacxx) endif() else() # rocPRIM (only for ROCm platform) @@ -109,7 +96,7 @@ else() message(STATUS "rocPRIM not found. Fetching...") FetchContent_Declare( prim - GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git + GIT_REPOSITORY https://github.com/ROCm/rocPRIM.git GIT_TAG develop ) FetchContent_MakeAvailable(prim) @@ -140,15 +127,8 @@ function(add_hipcub_test TEST_NAME TEST_SOURCES) if(HIP_COMPILER STREQUAL "nvcc") set_property(TARGET ${TEST_TARGET} PROPERTY CUDA_STANDARD 14) set_source_files_properties(${TEST_SOURCES} PROPERTIES LANGUAGE CUDA) - target_link_libraries(${TEST_TARGET} - PRIVATE - hip::hipcub - ) - target_include_directories(${TEST_TARGET} - PRIVATE - $ - $ - ) + target_link_libraries(${TEST_TARGET} PRIVATE + hip::hipcub CUB::CUB Thrust::Thrust libcudacxx::libcudacxx) elseif(HIP_COMPILER STREQUAL "clang") target_link_libraries(${TEST_TARGET} PRIVATE diff --git a/test/hipcub/CMakeLists.txt b/test/hipcub/CMakeLists.txt index e1d67d85..5d6d4ca1 100644 --- a/test/hipcub/CMakeLists.txt +++ b/test/hipcub/CMakeLists.txt @@ -1,5 +1,5 @@ # MIT License # -# Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -179,7 +179,6 @@ ${TEST_TYPE_SLICE_COUNT} test type slice(s) for test target ${TEST_TARGET}") message(FATAL_ERROR "no .cpp files generated for target ${TEST_TARGET}") endif() - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${TEST_TARGET}.parallel") add_hipcub_test_internal(${TEST_NAME} "${SOURCES}" ${TEST_TARGET}) target_include_directories("${TEST_TARGET}" PRIVATE "../../test/hipcub") endfunction() diff --git a/test/hipcub/identity_iterator.hpp b/test/hipcub/identity_iterator.hpp index ba8b31a0..9446b06c 100644 --- a/test/hipcub/identity_iterator.hpp +++ b/test/hipcub/identity_iterator.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -45,8 +45,7 @@ class identity_iterator : ptr_(ptr) { } - HIPCUB_HOST_DEVICE inline - ~identity_iterator() = default; + inline ~identity_iterator() = default; HIPCUB_HOST_DEVICE inline identity_iterator& operator++() diff --git a/test/hipcub/test_hipcub_block_run_length_decode.cpp b/test/hipcub/test_hipcub_block_run_length_decode.cpp index ea5cc11e..58e264a8 100644 --- a/test/hipcub/test_hipcub_block_run_length_decode.cpp +++ b/test/hipcub/test_hipcub_block_run_length_decode.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -179,7 +179,10 @@ TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) std::numeric_limits::max(), seed_value ); + // Not strictly required, but fixes a spurious GCC warning and good practice anyways + run_items.reserve(run_items.size() + empty_run_items.size()); run_items.insert(run_items.end(), empty_run_items.begin(), empty_run_items.end()); + run_lengths.reserve(run_lengths.size() + num_trailing_empty_runs); run_lengths.insert(run_lengths.end(), num_trailing_empty_runs, static_cast(0)); std::vector expected; diff --git a/test/hipcub/test_hipcub_block_shuffle.cpp b/test/hipcub/test_hipcub_block_shuffle.cpp index 4b863ad5..6e0f79c4 100644 --- a/test/hipcub/test_hipcub_block_shuffle.cpp +++ b/test/hipcub/test_hipcub_block_shuffle.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,8 @@ // required test headers #include "test_utils.hpp" +#include + // Params for tests template< class T, @@ -110,7 +112,9 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockOffset) int distance = rand() % std::min(size_t(10), block_size/2) - std::min(size_t(10), block_size/2); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< input_data = test_utils::get_random_data(size, -100, 100, seed_value); + const int min_value = std::is_unsigned::value ? 0 : -100; + std::vector input_data + = test_utils::get_random_data(size, min_value, 100, seed_value); std::vector output_data(input_data); // Preparing device @@ -198,7 +202,9 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockRotate) int distance = rand() % std::min(size_t(5), block_size/2); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< input_data = test_utils::get_random_data(size, -100, 100, seed_value); + const int min_value = std::is_unsigned::value ? 0 : -100; + std::vector input_data + = test_utils::get_random_data(size, min_value, 100, seed_value); std::vector output_data(input_data); // Preparing device @@ -283,7 +289,9 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockUp) unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input_data = test_utils::get_random_data(ItemsPerThread * size, -100, 100, seed_value); + const int min_value = std::is_unsigned::value ? 0 : -100; + std::vector input_data + = test_utils::get_random_data(ItemsPerThread * size, min_value, 100, seed_value); std::vector output_data(input_data); std::vector arr_input(size); @@ -381,7 +389,9 @@ TYPED_TEST(HipcubBlockShuffleTests, BlockDown) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input_data = test_utils::get_random_data(ItemsPerThread * size, -100, 100, seed_value); + const int min_value = std::is_unsigned::value ? 0 : -100; + std::vector input_data + = test_utils::get_random_data(ItemsPerThread * size, min_value, 100, seed_value); std::vector output_data(input_data); std::vector arr_input(size); diff --git a/test/hipcub/test_hipcub_warp_reduce.cpp b/test/hipcub/test_hipcub_warp_reduce.cpp index 82202a81..5a6aa6ca 100644 --- a/test/hipcub/test_hipcub_warp_reduce.cpp +++ b/test/hipcub/test_hipcub_warp_reduce.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -318,7 +318,7 @@ TYPED_TEST(HipcubWarpReduceTests, ReduceValid) for(size_t i = 0; i < output.size(); i++) { T value = 0; - for(size_t j = 0; j < valid; j++) + for(int j = 0; j < valid; ++j) { auto idx = i * logical_warp_size + j; value += input[idx];