Commit

improve atomic usage to avoid casts
upsj committed Jan 16, 2025
1 parent 0debdc6 commit 7ff5f59
Showing 6 changed files with 291 additions and 38 deletions.
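
The change replaces call sites that had to cast through CUDA's built-in atomicCAS overloads with the typed atomic_cas_relaxed* wrappers added below. A minimal before/after sketch of that pattern, modeled on the elimination-forest hunk further down; the function names and the parents/old_rep/new_rep arguments are hypothetical, int64 is Ginkgo's integer alias, and only atomic_cas_relaxed itself comes from this commit:

// Old pattern: CUDA's atomicCAS has no signed 64-bit overload, so callers had
// to reinterpret the pointer and cast the operands back and forth.
__device__ int64 join_before(int64* parents, int64 old_rep, int64 new_rep)
{
    return static_cast<int64>(
        atomicCAS(reinterpret_cast<unsigned long long*>(parents + old_rep),
                  static_cast<unsigned long long>(old_rep),
                  static_cast<unsigned long long>(new_rep)));
}

// New pattern: the typed wrapper takes int64 directly and emits a relaxed CAS
// via inline PTX, so no casts are needed at the call site.
__device__ int64 join_after(int64* parents, int64 old_rep, int64 new_rep)
{
    return atomic_cas_relaxed(parents + old_rep, old_rep, new_rep);
}
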
238 changes: 238 additions & 0 deletions common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -153,6 +153,26 @@ __device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr,
}


__device__ __forceinline__ int32 atomic_cas_relaxed_shared(int32* ptr,
int32 old_val,
int32 new_val)
{
int32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
{
int64 result;
@@ -241,6 +261,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr,
}


__device__ __forceinline__ int64 atomic_cas_relaxed_shared(int64* ptr,
int64 old_val,
int64 new_val)
{
int64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ float load_relaxed_shared(const float* ptr)
{
float result;
@@ -467,6 +507,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr,
}


__device__ __forceinline__ uint32 atomic_cas_relaxed_shared(uint32* ptr,
uint32 old_val,
uint32 new_val)
{
uint32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr)
{
uint64 result;
@@ -593,6 +653,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr,
}


__device__ __forceinline__ uint64 atomic_cas_relaxed_shared(uint64* ptr,
uint64 old_val,
uint64 new_val)
{
uint64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
{
int32 result;
@@ -873,6 +953,26 @@ __device__ __forceinline__ int32 atomic_max_relaxed_local(int32* ptr,
}


__device__ __forceinline__ int32 atomic_cas_relaxed_local(int32* ptr,
int32 old_val,
int32 new_val)
{
int32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
{
int64 result;
@@ -961,6 +1061,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr,
}


__device__ __forceinline__ int64 atomic_cas_relaxed_local(int64* ptr,
int64 old_val,
int64 new_val)
{
int64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ float load_relaxed_local(const float* ptr)
{
float result;
@@ -1187,6 +1307,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr,
}


__device__ __forceinline__ uint32 atomic_cas_relaxed_local(uint32* ptr,
uint32 old_val,
uint32 new_val)
{
uint32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr)
{
uint64 result;
@@ -1313,6 +1453,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr,
}


__device__ __forceinline__ uint64 atomic_cas_relaxed_local(uint64* ptr,
uint64 old_val,
uint64 new_val)
{
uint64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
{
int32 result;
@@ -1590,6 +1750,25 @@ __device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value)
}


__device__ __forceinline__ int32 atomic_cas_relaxed(int32* ptr, int32 old_val,
int32 new_val)
{
int32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int64 load_relaxed(const int64* ptr)
{
int64 result;
@@ -1675,6 +1854,25 @@ __device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value)
}


__device__ __forceinline__ int64 atomic_cas_relaxed(int64* ptr, int64 old_val,
int64 new_val)
{
int64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ float load_relaxed(const float* ptr)
{
float result;
@@ -1894,6 +2092,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value)
}


__device__ __forceinline__ uint32 atomic_cas_relaxed(uint32* ptr,
uint32 old_val,
uint32 new_val)
{
uint32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ uint64 load_relaxed(const uint64* ptr)
{
uint64 result;
@@ -2015,6 +2233,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value)
}


__device__ __forceinline__ uint64 atomic_cas_relaxed(uint64* ptr,
uint64 old_val,
uint64 new_val)
{
uint64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int32 load_acquire(const int32* ptr)
{
int32 result;
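
The new _shared overloads above accept shared-memory pointers directly, so block-local data structures no longer need casts either. A usage sketch only, not part of the commit: a hypothetical kernel in which each thread claims a free slot of a dynamically sized shared array via atomic_cas_relaxed_shared; the slot layout, the -1 sentinel, and out_owner are assumptions.

__global__ void claim_slots_example(int32* out_owner, int num_slots)
{
    // dynamically sized shared array, one int32 per slot; -1 marks "free"
    extern __shared__ int32 slots[];
    for (int i = threadIdx.x; i < num_slots; i += blockDim.x) {
        slots[i] = -1;
    }
    __syncthreads();
    // linearly probe until a block-scoped CAS claims a slot for this thread
    for (int i = 0; i < num_slots; i++) {
        if (atomic_cas_relaxed_shared(slots + i, -1,
                                      static_cast<int32>(threadIdx.x)) == -1) {
            out_owner[threadIdx.x] = i;
            return;
        }
    }
    out_owner[threadIdx.x] = -1;  // no free slot left for this thread
}
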
6 changes: 1 addition & 5 deletions common/cuda_hip/factorization/elimination_forest_kernels.cpp
@@ -139,8 +139,6 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges(
IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets,
IndexType* __restrict__ out_counter)
{
using atomic_type = std::conditional_t<std::is_same_v<IndexType, int32>,
int32, unsigned long long>;
const auto i = thread::get_thread_id_flat<IndexType>();
if (i >= size) {
return;
@@ -159,9 +157,7 @@
do {
repeat = false;
auto old_parent =
atomicCAS(reinterpret_cast<atomic_type*>(parents + old_rep),
static_cast<atomic_type>(old_rep),
static_cast<atomic_type>(new_rep));
atomic_cas_relaxed(parents + old_rep, old_rep, new_rep);
// if this fails, the parent of old_rep changed recently, so we need
// to try again by updating the parent's parent (hopefully its rep)
if (old_parent != old_rep) {
6 changes: 4 additions & 2 deletions common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

@@ -35,6 +35,7 @@
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/format_conversion.hpp"
#include "common/cuda_hip/components/intrinsics.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/merging.hpp"
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/reduction.hpp"
@@ -1293,7 +1294,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
}
#else
if (i < row_len) {
while (atomicCAS(local_storage + hash, empty, i) != empty) {
while (atomic_cas_relaxed_local(local_storage + hash, empty,
static_cast<int32>(i)) != empty) {
hash++;
if (hash >= available_storage) {
hash = 0;
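
The probing loop in this hunk is the same pattern in miniature: insert a key into a linearly probed table by CAS-ing it over an `empty` sentinel, now with a typed int32 value instead of a cast. A standalone sketch under the same assumptions; all names except atomic_cas_relaxed_local are hypothetical.

// Linear-probing insert: keep trying slots until a CAS against `empty`
// succeeds, i.e. this thread owns the slot it wrote `key` into.
__device__ void hash_insert_example(int32* table, int32 table_size,
                                    int32 empty, int32 key, int32 hash)
{
    while (atomic_cas_relaxed_local(table + hash, empty, key) != empty) {
        hash++;
        if (hash >= table_size) {
            hash = 0;  // wrap around to the start of the table
        }
    }
}
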
