diff --git a/HIPStream.cpp b/HIPStream.cpp index 750aa943..492eff5c 100644 --- a/HIPStream.cpp +++ b/HIPStream.cpp @@ -84,13 +84,6 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const bool event_timing, throw std::runtime_error(ss.str()); } -#ifdef __HIP_PLATFORM_NVCC__ - dot_block_cnt = 256; -#else - dot_block_cnt = block_cnt; -#endif - - std::cerr << "dot block count " << dot_block_cnt << std::endl; std::cerr << "block count " << block_cnt << std::endl; // Set device @@ -110,7 +103,7 @@ HIPStream::HIPStream(const unsigned int ARRAY_SIZE, const bool event_timing, // TODO would like to use hipHostMallocNonCoherent here, but it appears to // be broken with hipExtLaunchKernelGGL(). The data never becomes coherent // with the system, even if we device sync or wait on a system scope event - hipHostMalloc(&sums, sizeof(T) * dot_block_cnt, hipHostMallocCoherent); + hipHostMalloc(&sums, sizeof(T) * block_cnt, hipHostMallocCoherent); // Check buffers fit on the device hipDeviceProp_t props; @@ -436,7 +429,7 @@ float HIPStream::triad() return kernel_time; } -template +template struct Reducer { template __device__ @@ -445,20 +438,29 @@ struct Reducer { { if (n == 1) return; +#if defined(__HIP_PLATFORM_NVCC__) + constexpr unsigned int warpSize = 32; +#endif constexpr bool is_same_warp{n <= warpSize * 2}; - if (block_sz >= n) + if (static_cast(threadIdx.x) < n / 2) { - if (threadIdx.x < n / 2) - { - it[threadIdx.x] += it[threadIdx.x + n / 2]; - } - is_same_warp ? __threadfence_block() : __syncthreads(); + it[threadIdx.x] += it[threadIdx.x + n / 2]; } + is_same_warp ? __threadfence_block() : __syncthreads(); - Reducer::reduce(it); + Reducer::reduce(it); } }; +template<> +struct Reducer<1u> { + template + __device__ + static + void reduce(I) noexcept + {} +}; + template __launch_bounds__(TBSIZE) __global__ @@ -482,13 +484,12 @@ void dot_kernel(const T * __restrict a, const T * __restrict b, __syncthreads(); - Reducer::reduce(tb_sum); + Reducer<>::reduce(tb_sum); if (threadIdx.x) { return; } - sum[blockIdx.x] = tb_sum[0]; } @@ -496,11 +497,11 @@ template T HIPStream::dot() { hipLaunchKernelSynchronous(dot_kernel, - dim3(dot_block_cnt), dim3(TBSIZE), nullptr, true, + dim3(block_cnt), dim3(TBSIZE), nullptr, true, d_a, d_b, sums); T sum{0}; - for (auto i = 0u; i != dot_block_cnt; ++i) + for (auto i = 0u; i != block_cnt; ++i) { sum += sums[i]; } diff --git a/HIPStream.h b/HIPStream.h index 8077fa11..ca616abb 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -25,6 +25,7 @@ class HIPStream : public Stream { #ifdef __HIP_PLATFORM_NVCC__ static constexpr unsigned int elts_per_lane{1}; + static constexpr unsigned int chunks_per_block{8}; #else static constexpr unsigned int best_size{sizeof(unsigned int) * 1}; static constexpr unsigned int elts_per_lane{ @@ -35,7 +36,6 @@ class HIPStream : public Stream // Size of arrays const unsigned int array_size; const unsigned int block_cnt; - unsigned int dot_block_cnt; const bool evt_timing; hipEvent_t start_ev; hipEvent_t stop_ev;