diff --git a/HIPStream.cpp b/HIPStream.cpp
index 750aa943..492eff5c 100644
--- a/HIPStream.cpp
+++ b/HIPStream.cpp
@@ -84,13 +84,6 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const bool event_timing,
     throw std::runtime_error(ss.str());
   }
 
-#ifdef __HIP_PLATFORM_NVCC__
-  dot_block_cnt = 256;
-#else
-  dot_block_cnt = block_cnt;
-#endif
-
-  std::cerr << "dot block count " << dot_block_cnt << std::endl;
   std::cerr << "block count " << block_cnt << std::endl;
 
   // Set device
@@ -110,7 +103,7 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const bool event_timing,
   // TODO would like to use hipHostMallocNonCoherent here, but it appears to
   // be broken with hipExtLaunchKernelGGL(). The data never becomes coherent
   // with the system, even if we device sync or wait on a system scope event
-  hipHostMalloc(&sums, sizeof(T) * dot_block_cnt, hipHostMallocCoherent);
+  hipHostMalloc(&sums, sizeof(T) * block_cnt, hipHostMallocCoherent);
 
   // Check buffers fit on the device
   hipDeviceProp_t props;
@@ -436,7 +429,7 @@ float HIPStream<T>::triad()
   return kernel_time;
 }
 
-template<unsigned int block_sz, unsigned int n = 1024>
+template<unsigned int n = TBSIZE>
 struct Reducer {
   template<typename I>
   __device__
@@ -445,20 +438,29 @@ struct Reducer {
   {
     if (n == 1) return;
 
+#if defined(__HIP_PLATFORM_NVCC__)
+    constexpr unsigned int warpSize = 32;
+#endif
     constexpr bool is_same_warp{n <= warpSize * 2};
-    if (block_sz >= n)
+    if (static_cast<int>(threadIdx.x) < n / 2)
     {
-      if (threadIdx.x < n / 2)
-      {
-        it[threadIdx.x] += it[threadIdx.x + n / 2];
-      }
-      is_same_warp ? __threadfence_block() : __syncthreads();
+      it[threadIdx.x] += it[threadIdx.x + n / 2];
     }
+    is_same_warp ? __threadfence_block() : __syncthreads();
 
-    Reducer<block_sz, n / 2>::reduce(it);
+    Reducer<n / 2>::reduce(it);
   }
 };
 
+template<>
+struct Reducer<1u> {
+  template<typename I>
+  __device__
+  static
+  void reduce(I) noexcept
+  {}
+};
+
 template <unsigned int elts_per_lane, unsigned int chunks_per_block, typename T>
 __launch_bounds__(TBSIZE)
 __global__
@@ -482,13 +484,12 @@ void dot_kernel(const T * __restrict a, const T * __restrict b,
 
   __syncthreads();
 
-  Reducer<TBSIZE>::reduce(tb_sum);
+  Reducer<>::reduce(tb_sum);
 
   if (threadIdx.x)
   {
     return;
   }
-
   sum[blockIdx.x] = tb_sum[0];
 }
 
@@ -496,11 +497,11 @@ template <class T>
 T HIPStream<T>::dot()
 {
   hipLaunchKernelSynchronous(dot_kernel<elts_per_lane, chunks_per_block, T>,
-                             dim3(dot_block_cnt), dim3(TBSIZE), nullptr, true,
+                             dim3(block_cnt), dim3(TBSIZE), nullptr, true,
                              d_a, d_b, sums);
 
   T sum{0};
-  for (auto i = 0u; i != dot_block_cnt; ++i)
+  for (auto i = 0u; i != block_cnt; ++i)
   {
     sum += sums[i];
   }
diff --git a/HIPStream.h b/HIPStream.h
index 8077fa11..ca616abb 100644
--- a/HIPStream.h
+++ b/HIPStream.h
@@ -25,6 +25,7 @@ class HIPStream : public Stream<T>
 {
 #ifdef __HIP_PLATFORM_NVCC__
   static constexpr unsigned int elts_per_lane{1};
+  static constexpr unsigned int chunks_per_block{8};
 #else
   static constexpr unsigned int best_size{sizeof(unsigned int) * 1};
   static constexpr unsigned int elts_per_lane{
@@ -35,7 +36,6 @@ class HIPStream : public Stream<T>
     // Size of arrays
     const unsigned int array_size;
     const unsigned int block_cnt;
-    unsigned int dot_block_cnt;
     const bool evt_timing;
     hipEvent_t start_ev;
     hipEvent_t stop_ev;