Skip to content

Commit

Permalink
[runtime] modified OpKernel to support dynamic shape
Browse files Browse the repository at this point in the history
  • Loading branch information
XG-zheng committed Mar 25, 2024
1 parent c4eeef0 commit 0662cd8
Show file tree
Hide file tree
Showing 13 changed files with 427 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ using CurandOpKernelIfaceTraits =
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cudaStream_t);
 * optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CudaOpKernel<ConcreateOpImpl, Arguments...>;
*/
Expand All @@ -153,6 +154,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CudaOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cublasHandle_t, cudaStream_t);
 * optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CublasOpKernel<ConcreateOpImpl, Arguments...>;
*/
Expand All @@ -163,6 +165,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CublasOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cudnnHandle_t, cudaStream_t);
 * optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CudnnOpKernel<ConcreateOpImpl, Arguments...>;
*/
Expand All @@ -173,6 +176,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CudnnOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., void* workspace, cudaStream_t);
 * optional<void ProloguePerExecute(const OpAccessor&)>;
* size_t GetWorkspaceSize(const ExecutionContext &);
* };
* using ConcreteOp = CudaOpKernelWithWorkspace<ConcreateOpImpl, Arguments...>;
Expand Down
55 changes: 53 additions & 2 deletions runtime/include/brt/core/framework/op_kernel_impl_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "brt/core/context/work_queue.h"
#include "brt/core/framework/op_accessor.h"
#include "brt/core/framework/op_kernel.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"

namespace brt {

Expand Down Expand Up @@ -169,6 +170,10 @@ template <typename... Arguments> struct OpKernelIfaceTraitsBase {

template <typename Impl>
static inline common::Status Run(Impl *impl, const ExecutionContext &ctx) {
  // Let the kernel refresh per-execution state (e.g. dynamic shapes) before
  // the actual Execute call; bail out early if the prologue fails.
  const auto prologue_status = impl->ProloguePerExecute(ctx);
  if (!prologue_status.IsOK())
    return prologue_status;
  return impl->Execute(Arguments::Get(impl, ctx)...);
}

Expand All @@ -187,17 +192,63 @@ template <typename... Arguments> struct OpKernelIfaceTraitsBase {

template <typename... Arguments>
struct NaiveOpKernelIfaceTraits : public OpKernelIfaceTraitsBase<Arguments...> {

  // Wraps a type so the SFINAE probe below maps any well-formed expression
  // type to std::true_type.
  template <typename T> struct TrueHelper : std::true_type {};

  // Detection idiom: HasProloguePerExecuteTraits<C, Args...>::value is true
  // iff C declares a callable ProloguePerExecute(Args...).
  template <typename ClassType, typename... ArgType>
  struct HasProloguePerExecuteTraits {
    template <typename Impl, typename... Arg>
    static auto CheckProloguePerExecute(int)
        -> TrueHelper<decltype(std::declval<Impl>().ProloguePerExecute(
            std::declval<Arg>()...))>;

    template <typename Impl, typename... Arg>
    static auto CheckProloguePerExecute(...) -> std::false_type;

  public:
    enum {
      value = decltype(CheckProloguePerExecute<ClassType, ArgType...>(0))::value
    };
  };

  template <typename ImplBase> struct ImplMixin : public ImplBase {
  public:
    // Caches whether any input/output of this kernel has a dynamic shape so
    // the per-execute prologue can be skipped entirely for static shapes.
    explicit ImplMixin(const OpKernelInfo &info)
        : ImplBase(info), info_(info), io_contain_dynamic_shape(false) {
      OpAccessor accessor(info);
      const size_t num_args = accessor.GetNumArgs();
      // Results are indexed right after the arguments in the accessor, so a
      // single pass over [0, num_args + num_results) covers all I/O shapes.
      const size_t num_io = num_args + accessor.GetNumResults();
      for (size_t i = 0; i < num_io; ++i) {
        if (mlir::ShapedType::isDynamicShape(accessor.GetArgShape(i))) {
          io_contain_dynamic_shape = true;
          break; // one dynamic dimension is enough
        }
      }
    }

    // Forwards to ImplBase::ProloguePerExecute(OpAccessor) only when the impl
    // provides one AND some I/O shape is dynamic; otherwise a no-op.
    // Always returns OK (the impl's prologue returns void).
    common::Status ProloguePerExecute(const ExecutionContext &ctx) {
      if constexpr (HasProloguePerExecuteTraits<ImplBase, OpAccessor>::value) {
        if (io_contain_dynamic_shape) {
          ImplBase::ProloguePerExecute(GetOpAccessor(ctx));
        }
      }
      return Status::OK();
    }

    // Builds an accessor bound to the current execution frame, so shape
    // queries reflect this execution's (possibly dynamic) shapes.
    OpAccessor GetOpAccessor(const ExecutionContext &ctx) const {
      return OpAccessor(info_, ctx.exec_frame);
    }

  private:
    const OpKernelInfo &info_;
    bool io_contain_dynamic_shape;
  };
};

Expand Down
9 changes: 6 additions & 3 deletions runtime/include/brt/core/framework/op_kernel_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,16 @@ class OpKernelInfo {

// Utilities

// Get Tensor as uniuqe Index, from the ith argument of OpKernelInfo
// Get Tensor as unique Index, from the ith argument of OpKernelInfo
size_t GetTensorIndexFromOpArgIndex(const OpKernelInfo &, unsigned int i);

// Get Tensor as uniuqe Index, from MLIR Value
// Get Tensor as unique Index, from MLIR Value
size_t GetTensorIndexFromMLIRValue(const OpKernelInfo &, mlir::Value val);

// Get Scalar as uniuqe Index, from MLIR Value
// Get Scalar as unique Index, from the ith argument of OpKernelInfo
size_t GetScalarIndexFromOpArgIndex(const OpKernelInfo &, unsigned int i);

// Get Scalar as unique Index, from MLIR Value
size_t GetScalarIndexFromMLIRValue(const OpKernelInfo &, mlir::Value val);

// Get Rank of MLIR Value, of ith argument of OpKernelInfo
Expand Down
98 changes: 63 additions & 35 deletions runtime/lib/backends/cuda/providers/default/codegen/ptx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ using namespace mlir;
#define BLOCK_SIZE_Z_ATTR "BlockSize.z"
#define ARG_RANKS_ATTR "arg_ranks"
#define CALL_CONVENTION_ATTR "call_convention"
#define DYNAMIC_CONFIG "__byteir_dynamic_config__"
#define KERNEL_LAUNCH_CONFIG_NUM 6

namespace brt {
namespace cuda {
Expand Down Expand Up @@ -123,42 +125,50 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
impl_->call_convention = "all";
// static assignment for config
// TODO extend to support dynamic
if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no GridSize.x attr");
bool dynamic_config_flag = false;
if (info.GetOperation()->hasAttr(DYNAMIC_CONFIG)) {
dynamic_config_flag = true;
}
int gx, gy, gz, bx, by, bz;
gx = gy = gz = bx = by = bz = 1;
if (!dynamic_config_flag) {
if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no GridSize.x attr");
}

if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no BlockSize.x attr");
}
if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no BlockSize.x attr");
}

int gx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)
.getInt()),
gy = 1, gz = 1;
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)) {
gy = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)) {
gz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)
.getInt());
}
gx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)
.getInt()),
gy = 1, gz = 1;
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)) {
gy = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)) {
gz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)
.getInt());
}

int bx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)
.getInt()),
by = 1, bz = 1;
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)) {
by = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)) {
bz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)
.getInt());
bx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)
.getInt()),
by = 1, bz = 1;
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)) {
by = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)) {
bz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)
.getInt());
}
}

std::vector<int> ranks;
Expand All @@ -172,6 +182,10 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
}

auto num_arg = GetOpArgNum(info_);
// filter launch config in inputs
// TODO: make `shared_size` be a input operand in compiler.
if (dynamic_config_flag)
num_arg -= KERNEL_LAUNCH_CONFIG_NUM;
impl_->grid = dim3(gx, gy, gz);
impl_->block = dim3(bx, by, bz);
impl_->shared_size = 0;
Expand All @@ -198,20 +212,34 @@ common::Status PTXOpKernel::RunImpl(const ExecutionContext &ctx) {
std::vector<void *> args;
std::vector<MLIREngineMemRefDescriptor> descs;
args.reserve(impl_->arg_reserve_size);
bool dynamic_config_flag = false;
if (info_.GetOperation()->hasAttr(DYNAMIC_CONFIG)) {
dynamic_config_flag = true;
auto num_arg = GetOpArgNum(info_);
std::vector<int64_t> launch_config;
launch_config.reserve(KERNEL_LAUNCH_CONFIG_NUM);
for (size_t i = num_arg - KERNEL_LAUNCH_CONFIG_NUM; i < num_arg; ++i) {
size_t idx = GetScalarIndexFromOpArgIndex(info_, i);
launch_config.emplace_back(ctx.exec_frame->GetScalar<int64_t>(idx));
}
impl_->grid = dim3(launch_config[0], launch_config[1], launch_config[2]);
impl_->block = dim3(launch_config[3], launch_config[4], launch_config[5]);
}

args.push_back(&(impl_->grid));
args.push_back(&(impl_->block));
args.push_back(&(impl_->shared_size));

descs.reserve(impl_->tensor_ids.size());
for (size_t i = 0; i < impl_->tensor_ids.size(); ++i) {
descs.emplace_back(ctx.exec_frame->GetAsyncValueRef(impl_->tensor_ids[i]),
impl_->tensor_ranks[i]);
ctx.exec_frame->GetShapeRef(impl_->tensor_ids[i]));
if (impl_->call_convention == "bare_ptr")
args.push_back(&descs.back().data);
else
else {
InsertMemDescToArgs(descs.back(), args);
}
}

auto work_queue = static_cast<CUDAWorkQueue *>(ctx.work_queue);
auto cuda_env = work_queue->GetCudaEnv();
BRT_ENFORCE(cuda_env.IsPrimaryContext(),
Expand Down
18 changes: 18 additions & 0 deletions runtime/lib/backends/cuda/providers/default/math/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,24 @@ template <typename T> MatmulImpl<T>::MatmulImpl(const OpAccessor &accessor) {
}
}

template <typename T>
void MatmulImpl<T>::ProloguePerExecute(const OpAccessor &accessor) {
  // Refresh the GEMM dimensions from the runtime (possibly dynamic) shapes:
  // A supplies m and k, B supplies n; a transpose flag swaps the axes read.
  const auto shape_a = accessor.GetArgShape(0);
  const auto shape_b = accessor.GetArgShape(1);
  m = lhs_transpose ? shape_a[1] : shape_a[0];
  k = lhs_transpose ? shape_a[0] : shape_a[1];
  n = rhs_transpose ? shape_b[0] : shape_b[1];
}

template <>
void MatmulImpl<float>::Execute(const float *a_val, const float *b_val,
float *c_val, cublasHandle_t handle,
Expand Down
2 changes: 2 additions & 0 deletions runtime/lib/backends/cuda/providers/default/math/matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ template <typename T> class MatmulImpl {
public:
explicit MatmulImpl(const OpAccessor &accessor);

void ProloguePerExecute(const OpAccessor &);

void Execute(const T *a_val, const T *b_val, T *c_val, cublasHandle_t handle,
cudaStream_t stream);

Expand Down
5 changes: 5 additions & 0 deletions runtime/lib/core/context/execution_frame.cc
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ void BRTInferenceExecutionFrame::BindArg(size_t idx, const void *ptr) {
}

void *BRTInferenceExecutionFrame::GetArg(size_t idx) {
// this only for debug : get weight ptr
if (idx >= info_.graph_info.io_count) {
return ctx_.weights_and_ios[idx - info_.graph_info.io_count];
}

BRT_ENFORCE(idx < info_.graph_info.io_count);
int i = info_.weights.size() + idx;

Expand Down
31 changes: 22 additions & 9 deletions runtime/lib/core/framework/execution_plan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -337,16 +337,20 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
return WalkResult::interrupt();
}

auto maybeSpace = brt::ir::GetSpace(op_arg);
if (!maybeSpace.has_value()) {
status_internal = Status(BRT, FAIL, "non-memref Arg of Op " + key);
return WalkResult::interrupt();
}

auto space = maybeSpace.value();
IAllocator *cur_allocator = GetAllocator(allocators, space);
last_alloc = cur_allocator;
std::string space;
IAllocator *cur_allocator;
if (op_arg.getType().dyn_cast<MemRefType>()) {
auto maybeSpace = brt::ir::GetSpace(op_arg);
if (!maybeSpace.has_value()) {
status_internal =
Status(BRT, FAIL, "non-memref Arg of Op " + key);
return WalkResult::interrupt();
}

space = maybeSpace.value();
cur_allocator = GetAllocator(allocators, space);
last_alloc = cur_allocator;
}
// skip if visited
if (visited_ptrs.count(arg_ptr) != 0) {
continue;
Expand All @@ -366,6 +370,10 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
graph_info_.tensor_to_id.emplace(arg_ptr,
graph_info_.tensors.size());
graph_info_.tensors.push_back(arg_ptr);
} else if (op_arg.getType().isa<IndexType>()) {
int64_t scalar_index = graph_info_.scalars.size();
graph_info_.scalar_to_id.emplace(arg_ptr, scalar_index);
graph_info_.scalars.push_back(arg_ptr);
} else {
status_internal =
Status(BRT, FAIL, " non-supported Arg Type of Op " + key);
Expand Down Expand Up @@ -473,6 +481,11 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
return WalkResult::interrupt();
}

// PTXOp launch config?
if (op_arg.getType().isa<IndexType>()) {
continue;
}

auto found_arg = graph_info_.tensor_to_id.find(arg_ptr);
if (found_arg == graph_info_.tensor_to_id.end()) {
status_internal = Status(BRT, FAIL, "cannot find arg");
Expand Down
Loading

0 comments on commit 0662cd8

Please sign in to comment.