Commit

improve atomic usage to avoid casts
upsj committed Jan 16, 2025
1 parent 0debdc6 commit 7ff5f59
Showing 6 changed files with 291 additions and 38 deletions.
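
The change replaces call sites that had to cast through CUDA's built-in atomicCAS overloads with the typed atomic_cas_relaxed* wrappers added below. A minimal before/after sketch of that pattern, modeled on the elimination-forest hunk further down; the function names and the parents/old_rep/new_rep arguments are hypothetical, int64 is Ginkgo's integer alias, and only atomic_cas_relaxed itself comes from this commit:

// Old pattern: CUDA's atomicCAS has no signed 64-bit overload, so callers had
// to reinterpret the pointer and cast the operands back and forth.
__device__ int64 join_before(int64* parents, int64 old_rep, int64 new_rep)
{
    return static_cast<int64>(
        atomicCAS(reinterpret_cast<unsigned long long*>(parents + old_rep),
                  static_cast<unsigned long long>(old_rep),
                  static_cast<unsigned long long>(new_rep)));
}

// New pattern: the typed wrapper takes int64 directly and emits a relaxed CAS
// via inline PTX, so no casts are needed at the call site.
__device__ int64 join_after(int64* parents, int64 old_rep, int64 new_rep)
{
    return atomic_cas_relaxed(parents + old_rep, old_rep, new_rep);
}
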
238 changes: 238 additions & 0 deletions common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -153,6 +153,26 @@ __device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr,
}


__device__ __forceinline__ int32 atomic_cas_relaxed_shared(int32* ptr,
int32 old_val,
int32 new_val)
{
int32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
{
int64 result;
@@ -241,6 +261,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr,
}


__device__ __forceinline__ int64 atomic_cas_relaxed_shared(int64* ptr,
int64 old_val,
int64 new_val)
{
int64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ float load_relaxed_shared(const float* ptr)
{
float result;
@@ -467,6 +507,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr,
}


__device__ __forceinline__ uint32 atomic_cas_relaxed_shared(uint32* ptr,
uint32 old_val,
uint32 new_val)
{
uint32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr)
{
uint64 result;
@@ -593,6 +653,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr,
}


__device__ __forceinline__ uint64 atomic_cas_relaxed_shared(uint64* ptr,
uint64 old_val,
uint64 new_val)
{
uint64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.shared.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.shared.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
{
int32 result;
@@ -873,6 +953,26 @@ __device__ __forceinline__ int32 atomic_max_relaxed_local(int32* ptr,
}


__device__ __forceinline__ int32 atomic_cas_relaxed_local(int32* ptr,
int32 old_val,
int32 new_val)
{
int32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
{
int64 result;
@@ -961,6 +1061,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr,
}


__device__ __forceinline__ int64 atomic_cas_relaxed_local(int64* ptr,
int64 old_val,
int64 new_val)
{
int64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ float load_relaxed_local(const float* ptr)
{
float result;
@@ -1187,6 +1307,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr,
}


__device__ __forceinline__ uint32 atomic_cas_relaxed_local(uint32* ptr,
uint32 old_val,
uint32 new_val)
{
uint32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr)
{
uint64 result;
@@ -1313,6 +1453,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr,
}


__device__ __forceinline__ uint64 atomic_cas_relaxed_local(uint64* ptr,
uint64 old_val,
uint64 new_val)
{
uint64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.cta.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
{
int32 result;
@@ -1590,6 +1750,25 @@ __device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value)
}


__device__ __forceinline__ int32 atomic_cas_relaxed(int32* ptr, int32 old_val,
int32 new_val)
{
int32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int64 load_relaxed(const int64* ptr)
{
int64 result;
@@ -1675,6 +1854,25 @@ __device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value)
}


__device__ __forceinline__ int64 atomic_cas_relaxed(int64* ptr, int64 old_val,
int64 new_val)
{
int64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ float load_relaxed(const float* ptr)
{
float result;
@@ -1894,6 +2092,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value)
}


__device__ __forceinline__ uint32 atomic_cas_relaxed(uint32* ptr,
uint32 old_val,
uint32 new_val)
{
uint32 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b32 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b32 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;"
#endif
: "=r"(result)
: "l"(ptr), "r"(old_val), "r"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ uint64 load_relaxed(const uint64* ptr)
{
uint64 result;
@@ -2015,6 +2233,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value)
}


__device__ __forceinline__ uint64 atomic_cas_relaxed(uint64* ptr,
uint64 old_val,
uint64 new_val)
{
uint64 result;
asm volatile(
#if __CUDA_ARCH__ < 600
"atom.cas.b64 %0, [%1], %2, %3;"
#elif __CUDA_ARCH__ < 700
"atom.gpu.cas.b64 %0, [%1], %2, %3;"
#else
"atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;"
#endif
: "=l"(result)
: "l"(ptr), "l"(old_val), "l"(new_val)
: "memory");
return result;
}


__device__ __forceinline__ int32 load_acquire(const int32* ptr)
{
int32 result;
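
The new _shared overloads above accept shared-memory pointers directly, so block-local data structures no longer need casts either. A usage sketch only, not part of the commit: a hypothetical kernel in which each thread claims a free slot of a dynamically sized shared array via atomic_cas_relaxed_shared; the slot layout, the -1 sentinel, and out_owner are assumptions.

__global__ void claim_slots_example(int32* out_owner, int num_slots)
{
    // dynamically sized shared array, one int32 per slot; -1 marks "free"
    extern __shared__ int32 slots[];
    for (int i = threadIdx.x; i < num_slots; i += blockDim.x) {
        slots[i] = -1;
    }
    __syncthreads();
    // linearly probe until a block-scoped CAS claims a slot for this thread
    for (int i = 0; i < num_slots; i++) {
        if (atomic_cas_relaxed_shared(slots + i, -1,
                                      static_cast<int32>(threadIdx.x)) == -1) {
            out_owner[threadIdx.x] = i;
            return;
        }
    }
    out_owner[threadIdx.x] = -1;  // no free slot left for this thread
}
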
6 changes: 1 addition & 5 deletions common/cuda_hip/factorization/elimination_forest_kernels.cpp
@@ -139,8 +139,6 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges(
IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets,
IndexType* __restrict__ out_counter)
{
using atomic_type = std::conditional_t<std::is_same_v<IndexType, int32>,
int32, unsigned long long>;
const auto i = thread::get_thread_id_flat<IndexType>();
if (i >= size) {
return;
@@ -159,9 +157,7 @@
do {
repeat = false;
auto old_parent =
atomicCAS(reinterpret_cast<atomic_type*>(parents + old_rep),
static_cast<atomic_type>(old_rep),
static_cast<atomic_type>(new_rep));
atomic_cas_relaxed(parents + old_rep, old_rep, new_rep);
// if this fails, the parent of old_rep changed recently, so we need
// to try again by updating the parent's parent (hopefully its rep)
if (old_parent != old_rep) {
6 changes: 4 additions & 2 deletions common/cuda_hip/matrix/csr_kernels.template.cpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

@@ -35,6 +35,7 @@
#include "common/cuda_hip/components/cooperative_groups.hpp"
#include "common/cuda_hip/components/format_conversion.hpp"
#include "common/cuda_hip/components/intrinsics.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/merging.hpp"
#include "common/cuda_hip/components/prefix_sum.hpp"
#include "common/cuda_hip/components/reduction.hpp"
@@ -1293,7 +1294,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
}
#else
if (i < row_len) {
while (atomicCAS(local_storage + hash, empty, i) != empty) {
while (atomic_cas_relaxed_local(local_storage + hash, empty,
static_cast<int32>(i)) != empty) {
hash++;
if (hash >= available_storage) {
hash = 0;
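
The probing loop in this hunk is the same pattern in miniature: insert a key into a linearly probed table by CAS-ing it over an `empty` sentinel, now with a typed int32 value instead of a cast. A standalone sketch under the same assumptions; all names except atomic_cas_relaxed_local are hypothetical.

// Linear-probing insert: keep trying slots until a CAS against `empty`
// succeeds, i.e. this thread owns the slot it wrote `key` into.
__device__ void hash_insert_example(int32* table, int32 table_size,
                                    int32 empty, int32 key, int32 hash)
{
    while (atomic_cas_relaxed_local(table + hash, empty, key) != empty) {
        hash++;
        if (hash >= table_size) {
            hash = 0;  // wrap around to the start of the table
        }
    }
}
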
