diff --git a/benchmark/benchmark_block_merge_sort.cpp b/benchmark/benchmark_block_merge_sort.cpp index 62ffbdfa..83167642 100644 --- a/benchmark/benchmark_block_merge_sort.cpp +++ b/benchmark/benchmark_block_merge_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -43,9 +43,8 @@ template -__global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, - T* output, - CompareOp compare_op) +__global__ __launch_bounds__(BlockSize) +void sort_keys_kernel(const T* input, T* output, CompareOp compare_op) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; @@ -68,9 +67,8 @@ template -__global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, - T* output, - CompareOp compare_op) +__global__ __launch_bounds__(BlockSize) +void sort_pairs_kernel(const T* input, T* output, CompareOp compare_op) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; @@ -111,16 +109,11 @@ void run_benchmark(benchmark::State& state, constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); + T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); @@ -143,7 +136,8 @@ void run_benchmark(benchmark::State& state, d_input, d_output, CompareOp()); - } else if(benchmark_kind == benchmark_kinds::sort_pairs) + } + else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), diff --git a/benchmark/benchmark_block_radix_rank.cpp b/benchmark/benchmark_block_radix_rank.cpp index 8578b75c..2dac0a5d 100644 --- a/benchmark/benchmark_block_radix_rank.cpp +++ b/benchmark/benchmark_block_radix_rank.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -109,18 +109,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000)); - } else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); + T* d_input; int* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index 7413214e..0bae7b87 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -189,16 +189,11 @@ void run_benchmark(benchmark::State& state, constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); - std::vector input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); + T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 5a879210..bb256ac5 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -268,16 +268,11 @@ template input; - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); - } else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); + T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index 506a8c04..fc64fc08 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -33,33 +33,28 @@ const size_t DEFAULT_N = 32 << 20; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; -template -std::vector generate_keys(size_t size) +template +struct CompareFunction { - using key_type = Key; - - if(std::is_floating_point::value) + HIPCUB_DEVICE + inline constexpr bool + operator()(const key_type& a, const key_type& b) { - return benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000), - size); - } else - { - return benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size); + return a < b; } -} +}; template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; - auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; }; + using key_type = Key; + + CompareFunction compare_function; - auto keys_input = generate_keys(size); + std::vector keys_input = benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); key_type* d_keys_input; key_type* d_keys_output; @@ -126,11 +121,16 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { - using key_type = Key; - using value_type = Value; - auto compare_function = [] __device__(const key_type& a, const key_type& b) { return a < b; }; + using key_type = Key; + using value_type = Value; + + CompareFunction compare_function; + + std::vector keys_input = benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); - auto keys_input = generate_keys(size); std::vector values_input(size); for(size_t i = 0; i < size; i++) { diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 366e62d9..c65abd81 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -40,19 +40,10 @@ std::vector generate_keys(size_t size) { using key_type = Key; - if(std::is_floating_point::value) - { - return benchmark_utils::get_random_data(size, - (key_type)-1000, - (key_type) + 1000, - size); - } else - { - return benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max(), - size); - } + return benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); } template diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index ad7f3075..05566d6e 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -83,18 +83,10 @@ void run_sort_keys_benchmark(benchmark::State& state, } offsets.push_back(size); - std::vector keys_input; - if(std::is_floating_point::value) - { - keys_input - = benchmark_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); - } else - { - keys_input - = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector keys_input = benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); @@ -230,18 +222,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, } offsets.push_back(size); - std::vector keys_input; - if(std::is_floating_point::value) - { - keys_input - = benchmark_utils::get_random_data(size, (key_type)-1000, (key_type) + 1000); - } else - { - keys_input - = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector keys_input = benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); diff --git a/benchmark/benchmark_device_segmented_sort.cpp b/benchmark/benchmark_device_segmented_sort.cpp index d98c7f42..e9bbaf30 100644 --- a/benchmark/benchmark_device_segmented_sort.cpp +++ b/benchmark/benchmark_device_segmented_sort.cpp @@ -83,19 +83,10 @@ void run_sort_keys_benchmark(benchmark::State& state, } offsets.push_back(size); - std::vector keys_input; - if(std::is_floating_point::value) - { - keys_input = benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000)); - } else - { - keys_input - = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector keys_input = benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); @@ -229,19 +220,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, } offsets.push_back(size); - std::vector keys_input; - if(std::is_floating_point::value) - { - keys_input = benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000)); - } else - { - keys_input - = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } + std::vector keys_input = benchmark_utils::get_random_data( + size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index c0921d54..2cd5e4cf 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -35,18 +35,13 @@ void run_flagged_benchmark(benchmark::State& state, const hipStream_t stream, float true_probability) { - std::vector input; + std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); + std::vector flags = benchmark_utils::get_random_data01(size, true_probability); - if(std::is_floating_point::value) - { - input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); - } else - { - input = benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); - } T* d_input; FlagType* d_flags; @@ -126,6 +121,19 @@ void run_flagged_benchmark(benchmark::State& state, HIP_CHECK(hipDeviceSynchronize()); } +template +struct SelectOperator +{ + float true_probability; + SelectOperator(float true_probability_) : true_probability(true_probability_) {} + HIPCUB_DEVICE + inline constexpr bool + operator()(const T& value) + { + return value < T(1000 * true_probability); + } +}; + template void run_selectop_benchmark(benchmark::State& state, size_t size, @@ -134,12 +142,7 @@ void run_selectop_benchmark(benchmark::State& state, { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); - auto select_op = [true_probability] __device__(const T& value) -> bool - { - if(value < T(1000 * true_probability)) - return true; - return false; - }; + SelectOperator select_op(true_probability); T* d_input; T* d_output; diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index fa3da901..48a46b8b 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -295,6 +295,50 @@ struct custom_type_decomposer } }; +template +struct generate_limits; + +template +struct generate_limits::value>> +{ + static inline T min() + { + return std::numeric_limits::min(); + } + static inline T max() + { + return std::numeric_limits::max(); + } +}; + +template +struct generate_limits::value>> +{ + using F = typename T::first_type; + using S = typename T::second_type; + static inline T min() + { + return T(generate_limits::min(), generate_limits::min()); + } + static inline T max() + { + return T(generate_limits::max(), generate_limits::max()); + } +}; + +template +struct generate_limits::value>> +{ + static inline T min() + { + return T(-1000); + } + static inline T max() + { + return T(1000); + } +}; + template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type diff --git a/benchmark/benchmark_warp_merge_sort.cpp b/benchmark/benchmark_warp_merge_sort.cpp index e31f68eb..5b2d87ca 100644 --- a/benchmark/benchmark_warp_merge_sort.cpp +++ b/benchmark/benchmark_warp_merge_sort.cpp @@ -309,13 +309,10 @@ void run_benchmark(benchmark::State& state, constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); - const auto input = std::is_floating_point::value - ? benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000)) - : benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); + const std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); T* d_input = nullptr; T* d_output = nullptr; @@ -380,13 +377,10 @@ void run_segmented_benchmark(benchmark::State& state, const auto num_segments = num_blocks * segments_per_block; const auto size = num_blocks * items_per_block; - const auto input = std::is_floating_point::value - ? benchmark_utils::get_random_data(size, - static_cast(-1000), - static_cast(1000)) - : benchmark_utils::get_random_data(size, - std::numeric_limits::min(), - std::numeric_limits::max()); + const std::vector input + = benchmark_utils::get_random_data(size, + benchmark_utils::generate_limits::min(), + benchmark_utils::generate_limits::max()); const auto segment_sizes = benchmark_utils::get_random_data(num_segments, 0, max_segment_size);