[BUG] calling cast_smem_ptr_to_uint(device fn) from make_gmma_desc(host device fn) is not allowed #1997

Open
lygztq opened this issue Dec 18, 2024 · 2 comments
Labels: ? - Needs Triage, bug

Comments

@lygztq (Contributor) commented Dec 18, 2024

Describe the bug
nvcc reports an error like:

error: calling a __device__ function("cute::cast_smem_ptr_to_uint(const void *)") from a __host__ __device__ function("make_gmma_desc") is not allowed
    uint32_t start_address = cast_smem_ptr_to_uint(raw_pointer_cast(u128_tensor.data()));
                             ^
          detected during instantiation of "cute::GmmaDescriptor cute::GMMA::make_gmma_desc<MajorMode,TEngine,TLayout>(const cute::Tensor<TEngine, TLayout> &) [with MajorMode=cute::GMMA::Major::K, TEngine=cute::ViewEngine<cute::swizzle_ptr<cute::Swizzle<3, 4, 3>, cute::smem_ptr<cutlass::float_e4m3_t *>>>, TLayout=cute::Layout<cute::tuple<cute::C<64>, cute::C<32>>, cute::tuple<cute::_128, cute::_1>>]"

when I try to create a partition fragment from an smem tensor with a WGMMA tiled MMA.
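
For context, this is the general CUDA rule being diagnosed: a __host__ __device__ function is compiled for both the host and the device, and its host compilation path may not contain a call to a device-only function. A minimal standalone sketch that reproduces the same diagnostic when compiled with nvcc (the function names here are hypothetical, not from CUTLASS):

#include <cstdint>

// Device-only: __cvta_generic_to_shared has no host implementation.
__device__ uint32_t device_only(const void *p) {
  return static_cast<uint32_t>(__cvta_generic_to_shared(p));
}

__host__ __device__ uint32_t host_device_caller(const void *p) {
  return device_only(p);  // error: calling a __device__ function from a
                          //        __host__ __device__ function is not allowed
}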

Steps/Code to reproduce bug

#include "cutlass/float8.h"

#include "cute/layout.hpp"
#include "cute/pointer.hpp"
#include "cute/tensor.hpp"

#include "cute/swizzle_layout.hpp"
#include "cute/underscore.hpp"

#include "cute/pointer_flagged.hpp"

#include "cute/arch/copy.hpp"
#include "cute/arch/copy_sm90.hpp"
#include "cute/atom/copy_traits_sm90_tma.hpp"
#include "cute/atom/copy_traits_sm90_tma_swizzle.hpp"

#include "cute/arch/mma_sm90_gmma.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cute/atom/mma_traits_sm90_gmma.hpp"

#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/gemm.h"

#include "cute/arch/mma_sm90.hpp"

#include "cutlass/numeric_conversion.h"
#include <cstdint>

__global__ void test_kernel() {
  using namespace cute;
  constexpr int M = 256;
  constexpr int N = 16;
  constexpr int K = 128;

  using Element = cutlass::float_e4m3_t;
  using AccumElement = float;
  using TileShape_MNK = cute::Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>;
  using GmmaTileShape = cute::Layout<cute::Shape<cute::Int<M / 64>, cute::_1, cute::_1>>;
  using TiledGmma0 = decltype(cute::make_tiled_mma(
      cute::GMMA::ss_op_selector<Element, Element, AccumElement,
                                 cute::Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>>(),
      GmmaTileShape{}));
  using SmemLayoutAtomA =
      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
               cute::GMMA::Major::K, Element, decltype(cute::get<0>(TileShape_MNK{})),
               decltype(cute::get<2>(TileShape_MNK{}))>());
  using SmemLayoutA =
      decltype(cute::tile_to_shape(SmemLayoutAtomA{}, cute::select<0, 2>(TileShape_MNK{})));

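  // Backing storage for the A tile. Note: the array is sized in elements,
  // which matches the byte count here only because float_e4m3_t is 1 byte.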
  __shared__ uint8_t smem_a_bytes[size(select<0, 2>(TileShape_MNK{}))];

  auto tiled_mma0 = TiledGmma0{};
  auto thread_mma0 = tiled_mma0.get_thread_slice(threadIdx.x);

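  // Building the smem tensor and partitioning it; partition_fragment_A is
  // what instantiates cute::GMMA::make_gmma_desc in the error trace above.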
  auto sA = make_tensor(make_smem_ptr(reinterpret_cast<Element *>(smem_a_bytes)), SmemLayoutA{});
  auto tOrA = thread_mma0.partition_fragment_A(sA);
}

int main() {
  dim3 grid(1);
  dim3 block(4 * 32);
  test_kernel<<<grid, block>>>();
  cudaDeviceSynchronize();
  return 0;
}

Expected behavior
I understand this isn't a proper invocation (calling a __device__ function from a __host__ __device__ function), so it makes sense that nvcc rejects it. However, I've noticed that in some cases nvcc does accept the same invocation. Why?
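
One plausible explanation (an assumption on my part, not something confirmed in this thread): nvcc only hard-errors when a compilation pass actually has to emit the device-only call, and it appears to defer this check for function templates until they are instantiated for a given pass. Code that must compile from both sides therefore often fences the device-only call behind __CUDA_ARCH__, so the host pass never sees it. A hedged sketch of that pattern, with hypothetical names:

#include <cstdint>

__device__ uint32_t device_only(const void *p);  // device-only helper (hypothetical)

__host__ __device__ uint32_t guarded_caller(const void *p) {
#if defined(__CUDA_ARCH__)
  // Device pass: a call into a __device__ function, which is legal here.
  return device_only(p);
#else
  // Host pass: __CUDA_ARCH__ is undefined, the call is compiled out,
  // and nvcc has nothing to diagnose.
  return 0;
#endif
}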

Environment details
CUDA 12.4, CUTLASS 3.4

lygztq added the "? - Needs Triage" and "bug" labels on Dec 18, 2024
@thakkarV (Collaborator) commented

You're most likely using your own command line. Use the command line flags generated by our cmake.

@lygztq (Contributor, Author) commented Dec 18, 2024

Here is my compile command:

/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler  -I/soft/3rdparty/cutlass/include -I/soft/3rdparty/cutlass/tools/util/include -I/soft/3rdparty/cutlass/examples/common --generate-code=arch=compute_90a,code=[compute_90a,sm_90a] --use_fast_math --forward-unknown-to-host-compiler --expt-extended-lambda --expt-relaxed-constexpr --generate-line-info -Xcompiler=-fPIE -Xcompiler=-Wno-psabi -Xcompiler=-fno-strict-aliasing -O3 -std=c++17 -MD -MT case_study/CMakeFiles/foo.dir/foo.cu.o -MF CMakeFiles/foo.dir/foo.cu.o.d -x cu -c /soft/case_study/foo.cu -o CMakeFiles/foo.dir/foo.cu.o

If you mean the missing --expt-relaxed-constexpr flag for calling __host__ functions from __host__ __device__ functions, I already have it in my command line. Could you tell me which flag I should use to avoid this error?
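
For reference, the flags thakkarV refers to are the ones CUTLASS's own CMake generates. A build along these lines (directory names are assumptions for illustration; CUTLASS_NVCC_ARCHS is the documented architecture option) would produce a compile line to compare against:

# From a checkout of the CUTLASS source tree
mkdir build && cd build
cmake .. -DCUTLASS_NVCC_ARCHS=90a   # SM90a is required for WGMMA
make test_unit -j                   # compile lines can be inspected with VERBOSE=1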
