#include"cutlass/float8.h"
#include"cute/layout.hpp"
#include"cute/pointer.hpp"
#include"cute/tensor.hpp"
#include"cute/swizzle_layout.hpp"
#include"cute/underscore.hpp"
#include"cute/pointer_flagged.hpp"
#include"cute/arch/copy.hpp"
#include"cute/arch/copy_sm90.hpp"
#include"cute/atom/copy_traits_sm90_tma.hpp"
#include"cute/atom/copy_traits_sm90_tma_swizzle.hpp"
#include"cute/arch/mma_sm90_gmma.hpp"
#include"cute/atom/mma_atom.hpp"
#include"cute/atom/mma_traits_sm90_gmma.hpp"
#include"cutlass/gemm/collective/collective_builder.hpp"
#include"cutlass/gemm/gemm.h"
#include"cute/arch/mma_sm90.hpp"
#include"cutlass/numeric_conversion.h"
#include<cstdint>
__global__ voidtest_kernel() {
usingnamespacecute;constexprint M = 256;
constexprint N = 16;
constexprint K = 128;
using Element = cutlass::float_e4m3_t;
using AccumElement = float;
using TileShape_MNK = cute::Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>;
using GmmaTileShape = cute::Layout<cute::Shape<cute::Int<M / 64>, cute::_1, cute::_1>>;
using TiledGmma0 = decltype(cute::make_tiled_mma(
cute::GMMA::ss_op_selector<Element, Element, AccumElement,
cute::Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>>(),
GmmaTileShape{}));
using SmemLayoutAtomA =
decltype(cutlass::gemm::collective::detail::ss_smem_selector<
cute::GMMA::Major::K, Element, decltype(cute::get<0>(TileShape_MNK{})),
decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutA =
decltype(cute::tile_to_shape(SmemLayoutAtomA{}, cute::select<0, 2>(TileShape_MNK{})));
__shared__ uint8_t smem_a_bytes[size(select<0, 2>(TileShape_MNK{}))];
auto tiled_mma0 = TiledGmma0{};
auto thread_mma0 = tiled_mma0.get_thread_slice(threadIdx.x);
autosA = make_tensor(make_smem_ptr(reinterpret_cast<Element *>(smem_a_bytes)), SmemLayoutA{});
auto tOrA = thread_mma0.partition_fragment_A(sA);
}
intmain() {
dim3 grid(1);
dim3 block(4 * 32);
test_kernel<<<grid, block>>>();
cudaDeviceSynchronize();
return0;
}
Describe the bug
nvcc reports an error when I try to make a partition fragment on an smem tensor with a WGMMA tiled MMA.

Steps/Code to reproduce bug
Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve it more quickly.
The kernel above is the minimal reproducer; the failing call is thread_mma0.partition_fragment_A(sA).

Expected behavior
I don't think this is a proper invocation (a __device__ function called from a __host__ __device__ function), so nvcc should reject it. However, I notice that in some cases the same invocation is accepted by nvcc. Why?

Environment details (please complete the following information):
CUDA 12.4, CUTLASS 3.4

Follow-up comment:
If you mean the missing --expt-relaxed-constexpr when calling a __host__ function from a __host__ __device__ function, I already have it in my command. Could you tell me which flag I can use to avoid this error?
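
For reference, here is a minimal sketch (hypothetical file and names, not taken from this issue) of why the same kind of call can sometimes slip through. My understanding is that nvcc diagnoses a __device__ call inside a non-template __host__ __device__ function during the host compilation pass, while for function templates the cross-execution-space check is typically deferred until host code actually has to be generated for the instantiation. As I understand it, --expt-relaxed-constexpr only relaxes this check for constexpr functions, so it does not change the behavior for ordinary __host__ __device__ functions.

#include <cuda_runtime.h>

__device__ int device_only(int x) { return x + 1; }

// Template __host__ __device__ caller: the call to a __device__-only function is
// usually only diagnosed if this instantiation ends up needing host code. Turning
// this into a plain (non-template) __host__ __device__ function typically makes
// nvcc reject the call during host compilation.
template <class T>
__host__ __device__ T calls_device_template(T x) {
  return device_only(x);
}

__global__ void demo_kernel(int *out) {
  *out = calls_device_template(*out);  // instantiated and used only from device code
}

int main() {
  int *out = nullptr;
  cudaMalloc(&out, sizeof(int));
  demo_kernel<<<1, 1>>>(out);
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}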