[draft][runtime] modified OpKernel to support dynamic shape #162

Draft · wants to merge 1 commit into base: main
@@ -143,6 +143,7 @@ using CurandOpKernelIfaceTraits =
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CudaOpKernel<ConcreateOpImpl, Arguments...>;
*/
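A hypothetical impl that follows this contract and opts into the optional hook might look like the sketch below (ScaleOpImpl and its members are invented for illustration; only GetArgShape is taken from the OpAccessor API used elsewhere in this PR):

struct ScaleOpImpl {
  explicit ScaleOpImpl(const OpAccessor &accessor) {
    n_ = accessor.GetArgShape(0)[0]; // may still be the dynamic sentinel here
  }
  // optional hook: refresh shape-dependent state before each run
  void ProloguePerExecute(const OpAccessor &accessor) {
    n_ = accessor.GetArgShape(0)[0]; // now the concrete runtime extent
  }
  void Execute(const float *in, float *out, cudaStream_t stream) {
    // launch an elementwise kernel over n_ elements on `stream` (body omitted)
  }
private:
  int64_t n_ = 0;
};
// using ScaleOp = CudaOpKernel<ScaleOpImpl, Arguments...>;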
@@ -153,6 +154,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CudaOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cublasHandle_t, cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CublasOpKernel<ConcreateOpImpl, Arguments...>;
*/
@@ -163,6 +165,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CublasOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cudnnHandle_t, cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CudnnOpKernel<ConcreateOpImpl, Arguments...>;
*/
@@ -173,6 +176,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CudnnOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., void* workspace, cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* size_t GetWorkspaceSize(const ExecutionContext &);
* };
* using ConcreteOp = CudaOpKernelWithWorkspace<ConcreateOpImpl, Arguments...>;
55 changes: 53 additions & 2 deletions runtime/include/brt/core/framework/op_kernel_impl_base.h
@@ -22,6 +22,7 @@
#include "brt/core/context/work_queue.h"
#include "brt/core/framework/op_accessor.h"
#include "brt/core/framework/op_kernel.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"

namespace brt {

@@ -169,6 +170,10 @@ template <typename... Arguments> struct OpKernelIfaceTraitsBase {

template <typename Impl>
common::Status static inline Run(Impl *impl, const ExecutionContext &ctx) {
auto status = impl->ProloguePerExecute(ctx);
if (!status.IsOK()) {
return status;
}
return impl->Execute(Arguments::Get(impl, ctx)...);
}

@@ -187,17 +192,63 @@

template <typename... Arguments>
struct NaiveOpKernelIfaceTraits : public OpKernelIfaceTraitsBase<Arguments...> {

template <typename T> struct TrueHelper : std::true_type {};

template <typename ClassType, typename... ArgType>
struct HasProloguePerExecuteTraits {
template <typename Impl, typename... Arg>
static auto CheckProloguePerExecute(int)
-> TrueHelper<decltype(std::declval<Impl>().ProloguePerExecute(
std::declval<Arg>()...))>;

template <typename Impl, typename... Arg>
static auto CheckProloguePerExecute(...) -> std::false_type;

public:
enum {
value =
decltype(CheckProloguePerExecute<ClassType, ArgType...>(0))::value
};
};
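This is the standard detection idiom: the `int` overload wins overload resolution, but it only survives substitution when `Impl::ProloguePerExecute(Arg...)` is well formed; otherwise the variadic fallback yields `std::false_type`. A minimal sketch of how it resolves (assuming the trait were visible at namespace scope):

struct WithHook { void ProloguePerExecute(const OpAccessor &); };
struct WithoutHook {};
static_assert(HasProloguePerExecuteTraits<WithHook, OpAccessor>::value,
              "hook found: the decltype in the int overload is well formed");
static_assert(!HasProloguePerExecuteTraits<WithoutHook, OpAccessor>::value,
              "no hook: substitution fails and the ... overload is chosen");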

template <typename ImplBase> struct ImplMixin : public ImplBase {
public:
explicit ImplMixin(const OpKernelInfo &info) : ImplBase(info), info_(info) {
// initialize `io_contain_dynamic_shape`
io_contain_dynamic_shape = false;
OpAccessor accessor(info);
size_t num_args = accessor.GetNumArgs();
for (size_t i = 0; i < num_args; ++i) {
auto shape = accessor.GetArgShape(i);
if (mlir::ShapedType::isDynamicShape(shape)) {
io_contain_dynamic_shape = true;
}
}
for (size_t i = 0; i < accessor.GetNumResults(); ++i) {
auto shape = accessor.GetArgShape(i + num_args);
if (mlir::ShapedType::isDynamicShape(shape)) {
io_contain_dynamic_shape = true;
}
}
}
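`mlir::ShapedType::isDynamicShape` reports whether any extent equals MLIR's dynamic-dimension sentinel; the scan above is equivalent to this sketch (needs "llvm/ADT/STLExtras.h" in addition to the BuiltinTypeInterfaces header already included):

// sketch: true iff any dimension of `shape` is dynamic
bool HasDynamicDim(llvm::ArrayRef<int64_t> shape) {
  return llvm::any_of(
      shape, [](int64_t d) { return mlir::ShapedType::isDynamic(d); });
}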

common::Status ProloguePerExecute(const ExecutionContext &ctx) {
if constexpr (HasProloguePerExecuteTraits<ImplBase, OpAccessor>::value) {
if (io_contain_dynamic_shape) {
ImplBase::ProloguePerExecute(GetOpAccessor(ctx));
}
}
return Status::OK();
}

OpAccessor GetOpAccessor(const ExecutionContext &ctx) const {
return OpAccessor(info_, ctx.exec_frame);
}

private:
const OpKernelInfo &info_;
bool io_contain_dynamic_shape;
};
};

9 changes: 6 additions & 3 deletions runtime/include/brt/core/framework/op_kernel_info.h
@@ -143,13 +143,16 @@ class OpKernelInfo {

// Utilities

// Get Tensor as unique Index, from the ith argument of OpKernelInfo
size_t GetTensorIndexFromOpArgIndex(const OpKernelInfo &, unsigned int i);

// Get Tensor as unique Index, from MLIR Value
size_t GetTensorIndexFromMLIRValue(const OpKernelInfo &, mlir::Value val);

// Get Scalar as unique Index, from the ith argument of OpKernelInfo
size_t GetScalarIndexFromOpArgIndex(const OpKernelInfo &, unsigned int i);

// Get Scalar as unique Index, from MLIR Value
size_t GetScalarIndexFromMLIRValue(const OpKernelInfo &, mlir::Value val);

// Get Rank of MLIR Value, of ith argument of OpKernelInfo
98 changes: 63 additions & 35 deletions runtime/lib/backends/cuda/providers/default/codegen/ptx.cc
@@ -45,6 +45,8 @@ using namespace mlir;
#define BLOCK_SIZE_Z_ATTR "BlockSize.z"
#define ARG_RANKS_ATTR "arg_ranks"
#define CALL_CONVENTION_ATTR "call_convention"
#define DYNAMIC_CONFIG "__byteir_dynamic_config__"
#define KERNEL_LAUNCH_CONFIG_NUM 6
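With `__byteir_dynamic_config__` set on the op, the compiler is expected to append six scalar operands carrying the launch configuration; RunImpl below consumes them in the order grid.x, grid.y, grid.z, block.x, block.y, block.z. A sketch of that unpacking convention (the ordering is an assumption made concrete by RunImpl in this file):

// sketch: cfg holds {gx, gy, gz, bx, by, bz} read from the trailing scalars
inline void UnpackLaunchConfig(const int64_t (&cfg)[KERNEL_LAUNCH_CONFIG_NUM],
                               dim3 &grid, dim3 &block) {
  grid = dim3(cfg[0], cfg[1], cfg[2]);
  block = dim3(cfg[3], cfg[4], cfg[5]);
}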

namespace brt {
namespace cuda {
@@ -123,42 +125,50 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
impl_->call_convention = "all";
// launch config comes from static attributes by default; ops tagged
// __byteir_dynamic_config__ supply it at execution time instead
bool dynamic_config_flag = info.GetOperation()->hasAttr(DYNAMIC_CONFIG);
int gx = 1, gy = 1, gz = 1, bx = 1, by = 1, bz = 1;
if (!dynamic_config_flag) {
if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no GridSize.x attr");
}

if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no BlockSize.x attr");
}

gx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)
.getInt());
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)) {
gy = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)) {
gz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)
.getInt());
}

bx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)
.getInt());
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)) {
by = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)) {
bz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)
.getInt());
}
}

std::vector<int> ranks;
@@ -172,6 +182,10 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
}

auto num_arg = GetOpArgNum(info_);
// filter the launch-config scalars out of the kernel inputs
// TODO: make `shared_size` an input operand in the compiler.
if (dynamic_config_flag)
num_arg -= KERNEL_LAUNCH_CONFIG_NUM;
impl_->grid = dim3(gx, gy, gz);
impl_->block = dim3(bx, by, bz);
impl_->shared_size = 0;
@@ -198,20 +212,34 @@ common::Status PTXOpKernel::RunImpl(const ExecutionContext &ctx) {
std::vector<void *> args;
std::vector<MLIREngineMemRefDescriptor> descs;
args.reserve(impl_->arg_reserve_size);
bool dynamic_config_flag = info_.GetOperation()->hasAttr(DYNAMIC_CONFIG);
if (dynamic_config_flag) {
auto num_arg = GetOpArgNum(info_);
std::vector<int64_t> launch_config;
launch_config.reserve(KERNEL_LAUNCH_CONFIG_NUM);
for (size_t i = num_arg - KERNEL_LAUNCH_CONFIG_NUM; i < num_arg; ++i) {
size_t idx = GetScalarIndexFromOpArgIndex(info_, i);
launch_config.emplace_back(ctx.exec_frame->GetScalar<int64_t>(idx));
}
impl_->grid = dim3(launch_config[0], launch_config[1], launch_config[2]);
impl_->block = dim3(launch_config[3], launch_config[4], launch_config[5]);
}

args.push_back(&(impl_->grid));
args.push_back(&(impl_->block));
args.push_back(&(impl_->shared_size));

descs.reserve(impl_->tensor_ids.size());
for (size_t i = 0; i < impl_->tensor_ids.size(); ++i) {
descs.emplace_back(ctx.exec_frame->GetAsyncValueRef(impl_->tensor_ids[i]),
ctx.exec_frame->GetShapeRef(impl_->tensor_ids[i]));
if (impl_->call_convention == "bare_ptr")
args.push_back(&descs.back().data);
else {
InsertMemDescToArgs(descs.back(), args);
}
}

auto work_queue = static_cast<CUDAWorkQueue *>(ctx.work_queue);
auto cuda_env = work_queue->GetCudaEnv();
BRT_ENFORCE(cuda_env.IsPrimaryContext(),
18 changes: 18 additions & 0 deletions runtime/lib/backends/cuda/providers/default/math/matmul.cc
@@ -68,6 +68,24 @@ template <typename T> MatmulImpl<T>::MatmulImpl(const OpAccessor &accessor) {
}
}

template <typename T>
void MatmulImpl<T>::ProloguePerExecute(const OpAccessor &accessor) {
auto shape_a = accessor.GetArgShape(0);
auto shape_b = accessor.GetArgShape(1);
if (!lhs_transpose) {
m = shape_a[0];
k = shape_a[1];
} else {
m = shape_a[1];
k = shape_a[0];
}
if (!rhs_transpose) {
n = shape_b[1];
} else {
n = shape_b[0];
}
}
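A quick worked example of the recomputation (hypothetical runtime shapes):

// a: [128, 64], b: [64, 32], lhs_transpose = rhs_transpose = false
// => m = 128, k = 64, n = 32, matching c: [128, 32]

so each execution sees the actual extents instead of the placeholder values captured when the kernel was constructed.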

template <>
void MatmulImpl<float>::Execute(const float *a_val, const float *b_val,
float *c_val, cublasHandle_t handle,
2 changes: 2 additions & 0 deletions runtime/lib/backends/cuda/providers/default/math/matmul.h
@@ -30,6 +30,8 @@ template <typename T> class MatmulImpl {
public:
explicit MatmulImpl(const OpAccessor &accessor);

void ProloguePerExecute(const OpAccessor &);

void Execute(const T *a_val, const T *b_val, T *c_val, cublasHandle_t handle,
cudaStream_t stream);

5 changes: 5 additions & 0 deletions runtime/lib/core/context/execution_frame.cc
@@ -186,6 +186,11 @@ void BRTInferenceExecutionFrame::BindArg(size_t idx, const void *ptr) {
}

void *BRTInferenceExecutionFrame::GetArg(size_t idx) {
// debug-only path: fetch a weight pointer directly
if (idx >= info_.graph_info.io_count) {
return ctx_.weights_and_ios[idx - info_.graph_info.io_count];
}

BRT_ENFORCE(idx < info_.graph_info.io_count);
int i = info_.weights.size() + idx;

31 changes: 22 additions & 9 deletions runtime/lib/core/framework/execution_plan.cc
@@ -337,16 +337,20 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
return WalkResult::interrupt();
}

std::string space;
IAllocator *cur_allocator;
if (op_arg.getType().isa<MemRefType>()) {
auto maybeSpace = brt::ir::GetSpace(op_arg);
if (!maybeSpace.has_value()) {
status_internal =
Status(BRT, FAIL, "non-memref Arg of Op " + key);
return WalkResult::interrupt();
}

space = maybeSpace.value();
cur_allocator = GetAllocator(allocators, space);
last_alloc = cur_allocator;
}
// skip if visited
if (visited_ptrs.count(arg_ptr) != 0) {
continue;
@@ -366,6 +370,10 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
graph_info_.tensor_to_id.emplace(arg_ptr,
graph_info_.tensors.size());
graph_info_.tensors.push_back(arg_ptr);
} else if (op_arg.getType().isa<IndexType>()) {
int64_t scalar_index = graph_info_.scalars.size();
graph_info_.scalar_to_id.emplace(arg_ptr, scalar_index);
graph_info_.scalars.push_back(arg_ptr);
} else {
status_internal =
Status(BRT, FAIL, " non-supported Arg Type of Op " + key);
@@ -473,6 +481,11 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
return WalkResult::interrupt();
}

// IndexType args carry the PTXOp launch config; they are scalars, not
// tensors, so skip the tensor lookup for them.
if (op_arg.getType().isa<IndexType>()) {
continue;
}

auto found_arg = graph_info_.tensor_to_id.find(arg_ptr);
if (found_arg == graph_info_.tensor_to_id.end()) {
status_internal = Status(BRT, FAIL, "cannot find arg");
Expand Down