[draft][runtime] modified OpKernel to support dynamic shape #162

Draft · wants to merge 1 commit into base: main
@@ -143,6 +143,7 @@ using CurandOpKernelIfaceTraits =
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CudaOpKernel<ConcreateOpImpl, Arguments...>;
*/
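A hypothetical impl that follows this contract and opts into the optional hook might look like the sketch below (ScaleOpImpl and its members are invented for illustration; only GetArgShape is taken from the OpAccessor API used elsewhere in this PR):

struct ScaleOpImpl {
  explicit ScaleOpImpl(const OpAccessor &accessor) {
    n_ = accessor.GetArgShape(0)[0]; // may still be the dynamic sentinel here
  }
  // optional hook: refresh shape-dependent state before each run
  void ProloguePerExecute(const OpAccessor &accessor) {
    n_ = accessor.GetArgShape(0)[0]; // now the concrete runtime extent
  }
  void Execute(const float *in, float *out, cudaStream_t stream) {
    // launch an elementwise kernel over n_ elements on `stream` (body omitted)
  }
private:
  int64_t n_ = 0;
};
// using ScaleOp = CudaOpKernel<ScaleOpImpl, Arguments...>;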
@@ -153,6 +154,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CudaOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cublasHandle_t, cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CublasOpKernel<ConcreateOpImpl, Arguments...>;
*/
@@ -163,6 +165,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CublasOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., cudnnHandle_t, cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* };
* using ConcreteOp = CudnnOpKernel<ConcreateOpImpl, Arguments...>;
*/
@@ -173,6 +176,7 @@ BRT_DEF_OP_KERNEL_WRPPER(CudnnOpKernel,
* struct ConcreateOpImpl {
* ConcreateOpImpl(const OpAccessor&);
* void Execute(args..., void* workspace, cudaStream_t);
* optional<void ProloguePerExecute(const OpAccessor&)>;
* size_t GetWorkspaceSize(const ExecutionContext &);
* };
* using ConcreteOp = CudaOpKernelWithWorkspace<ConcreateOpImpl, Arguments...>;
55 changes: 53 additions & 2 deletions runtime/include/brt/core/framework/op_kernel_impl_base.h
@@ -22,6 +22,7 @@
#include "brt/core/context/work_queue.h"
#include "brt/core/framework/op_accessor.h"
#include "brt/core/framework/op_kernel.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"

namespace brt {

@@ -169,6 +170,10 @@ template <typename... Arguments> struct OpKernelIfaceTraitsBase {

template <typename Impl>
common::Status static inline Run(Impl *impl, const ExecutionContext &ctx) {
auto status = impl->ProloguePerExecute(ctx);
if (!status.IsOK()) {
return status;
}
return impl->Execute(Arguments::Get(impl, ctx)...);
}

@@ -187,17 +192,63 @@

template <typename... Arguments>
struct NaiveOpKernelIfaceTraits : public OpKernelIfaceTraitsBase<Arguments...> {

template <typename T> struct TrueHelper : std::true_type {};

template <typename ClassType, typename... ArgType>
struct HasProloguePerExecuteTraits {
template <typename Impl, typename... Arg>
static auto CheckProloguePerExecute(int)
-> TrueHelper<decltype(std::declval<Impl>().ProloguePerExecute(
std::declval<Arg>()...))>;

template <typename Impl, typename... Arg>
static auto CheckProloguePerExecute(...) -> std::false_type;

public:
enum {
value =
decltype(CheckProloguePerExecute<ClassType, ArgType...>(0))::value
};
};
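This is the standard detection idiom: the `int` overload wins overload resolution, but it only survives substitution when `Impl::ProloguePerExecute(Arg...)` is well formed; otherwise the variadic fallback yields `std::false_type`. A minimal sketch of how it resolves (assuming the trait were visible at namespace scope):

struct WithHook { void ProloguePerExecute(const OpAccessor &); };
struct WithoutHook {};
static_assert(HasProloguePerExecuteTraits<WithHook, OpAccessor>::value,
              "hook found: the decltype in the int overload is well formed");
static_assert(!HasProloguePerExecuteTraits<WithoutHook, OpAccessor>::value,
              "no hook: substitution fails and the ... overload is chosen");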

template <typename ImplBase> struct ImplMixin : public ImplBase {
public:
explicit ImplMixin(const OpKernelInfo &info) : ImplBase(info), info_(info) {
// initialize `io_contain_dynamic_shape`
io_contain_dynamic_shape = false;
OpAccessor accessor(info);
size_t num_args = accessor.GetNumArgs();
for (size_t i = 0; i < num_args; ++i) {
auto shape = accessor.GetArgShape(i);
if (mlir::ShapedType::isDynamicShape(shape)) {
io_contain_dynamic_shape = true;
}
}
for (size_t i = 0; i < accessor.GetNumResults(); ++i) {
auto shape = accessor.GetArgShape(i + num_args);
if (mlir::ShapedType::isDynamicShape(shape)) {
io_contain_dynamic_shape = true;
}
}
}
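`mlir::ShapedType::isDynamicShape` reports whether any extent equals MLIR's dynamic-dimension sentinel; the scan above is equivalent to this sketch (needs "llvm/ADT/STLExtras.h" in addition to the BuiltinTypeInterfaces header already included):

// sketch: true iff any dimension of `shape` is dynamic
bool HasDynamicDim(llvm::ArrayRef<int64_t> shape) {
  return llvm::any_of(
      shape, [](int64_t d) { return mlir::ShapedType::isDynamic(d); });
}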

common::Status ProloguePerExecute(const ExecutionContext &ctx) {
if constexpr (HasProloguePerExecuteTraits<ImplBase, OpAccessor>::value) {
if (io_contain_dynamic_shape) {
ImplBase::ProloguePerExecute(GetOpAccessor(ctx));
}
}
return Status::OK();
}

OpAccessor GetOpAccessor(const ExecutionContext &ctx) const {
return OpAccessor(info_, ctx.exec_frame);
}

private:
const OpKernelInfo &info_;
bool io_contain_dynamic_shape;
};
};

9 changes: 6 additions & 3 deletions runtime/include/brt/core/framework/op_kernel_info.h
@@ -143,13 +143,16 @@ class OpKernelInfo {

// Utilities

// Get Tensor as unique Index, from the ith argument of OpKernelInfo
size_t GetTensorIndexFromOpArgIndex(const OpKernelInfo &, unsigned int i);

// Get Tensor as unique Index, from MLIR Value
size_t GetTensorIndexFromMLIRValue(const OpKernelInfo &, mlir::Value val);

// Get Scalar as unique Index, from the ith argument of OpKernelInfo
size_t GetScalarIndexFromOpArgIndex(const OpKernelInfo &, unsigned int i);

// Get Scalar as unique Index, from MLIR Value
size_t GetScalarIndexFromMLIRValue(const OpKernelInfo &, mlir::Value val);

// Get Rank of MLIR Value, of ith argument of OpKernelInfo
98 changes: 63 additions & 35 deletions runtime/lib/backends/cuda/providers/default/codegen/ptx.cc
@@ -45,6 +45,8 @@ using namespace mlir;
#define BLOCK_SIZE_Z_ATTR "BlockSize.z"
#define ARG_RANKS_ATTR "arg_ranks"
#define CALL_CONVENTION_ATTR "call_convention"
#define DYNAMIC_CONFIG "__byteir_dynamic_config__"
#define KERNEL_LAUNCH_CONFIG_NUM 6
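With `__byteir_dynamic_config__` set on the op, the compiler is expected to append six scalar operands carrying the launch configuration; RunImpl below consumes them in the order grid.x, grid.y, grid.z, block.x, block.y, block.z. A sketch of that unpacking convention (the ordering is an assumption made concrete by RunImpl in this file):

// sketch: cfg holds {gx, gy, gz, bx, by, bz} read from the trailing scalars
inline void UnpackLaunchConfig(const int64_t (&cfg)[KERNEL_LAUNCH_CONFIG_NUM],
                               dim3 &grid, dim3 &block) {
  grid = dim3(cfg[0], cfg[1], cfg[2]);
  block = dim3(cfg[3], cfg[4], cfg[5]);
}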

namespace brt {
namespace cuda {
@@ -123,42 +125,50 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
impl_->call_convention = "all";
// launch config comes from static attributes by default; ops tagged
// __byteir_dynamic_config__ supply it at execution time instead
bool dynamic_config_flag = info.GetOperation()->hasAttr(DYNAMIC_CONFIG);
int gx = 1, gy = 1, gz = 1, bx = 1, by = 1, bz = 1;
if (!dynamic_config_flag) {
if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no GridSize.x attr");
}

if (!info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)) {
BRT_THROW_EX(std::runtime_error, "no BlockSize.x attr");
}

gx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_X_ATTR)
.getInt());
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)) {
gy = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)) {
gz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(GRID_SIZE_Z_ATTR)
.getInt());
}

bx = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_X_ATTR)
.getInt());
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)) {
by = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Y_ATTR)
.getInt());
}
if (info.GetOperation()->hasAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)) {
bz = static_cast<int>(info.GetOperation()
->getAttrOfType<IntegerAttr>(BLOCK_SIZE_Z_ATTR)
.getInt());
}
}

std::vector<int> ranks;
@@ -172,6 +182,10 @@ PTXOpKernel::PTXOpKernel(const OpKernelInfo &info)
}

auto num_arg = GetOpArgNum(info_);
// filter the launch-config scalars out of the kernel inputs
// TODO: make `shared_size` an input operand in the compiler.
if (dynamic_config_flag)
num_arg -= KERNEL_LAUNCH_CONFIG_NUM;
impl_->grid = dim3(gx, gy, gz);
impl_->block = dim3(bx, by, bz);
impl_->shared_size = 0;
@@ -198,20 +212,34 @@ common::Status PTXOpKernel::RunImpl(const ExecutionContext &ctx) {
std::vector<void *> args;
std::vector<MLIREngineMemRefDescriptor> descs;
args.reserve(impl_->arg_reserve_size);
bool dynamic_config_flag = info_.GetOperation()->hasAttr(DYNAMIC_CONFIG);
if (dynamic_config_flag) {
auto num_arg = GetOpArgNum(info_);
std::vector<int64_t> launch_config;
launch_config.reserve(KERNEL_LAUNCH_CONFIG_NUM);
for (size_t i = num_arg - KERNEL_LAUNCH_CONFIG_NUM; i < num_arg; ++i) {
size_t idx = GetScalarIndexFromOpArgIndex(info_, i);
launch_config.emplace_back(ctx.exec_frame->GetScalar<int64_t>(idx));
}
impl_->grid = dim3(launch_config[0], launch_config[1], launch_config[2]);
impl_->block = dim3(launch_config[3], launch_config[4], launch_config[5]);
}

args.push_back(&(impl_->grid));
args.push_back(&(impl_->block));
args.push_back(&(impl_->shared_size));

descs.reserve(impl_->tensor_ids.size());
for (size_t i = 0; i < impl_->tensor_ids.size(); ++i) {
descs.emplace_back(ctx.exec_frame->GetAsyncValueRef(impl_->tensor_ids[i]),
ctx.exec_frame->GetShapeRef(impl_->tensor_ids[i]));
if (impl_->call_convention == "bare_ptr")
args.push_back(&descs.back().data);
else {
InsertMemDescToArgs(descs.back(), args);
}
}

auto work_queue = static_cast<CUDAWorkQueue *>(ctx.work_queue);
auto cuda_env = work_queue->GetCudaEnv();
BRT_ENFORCE(cuda_env.IsPrimaryContext(),
18 changes: 18 additions & 0 deletions runtime/lib/backends/cuda/providers/default/math/matmul.cc
@@ -68,6 +68,24 @@ template <typename T> MatmulImpl<T>::MatmulImpl(const OpAccessor &accessor) {
}
}

template <typename T>
void MatmulImpl<T>::ProloguePerExecute(const OpAccessor &accessor) {
auto shape_a = accessor.GetArgShape(0);
auto shape_b = accessor.GetArgShape(1);
if (!lhs_transpose) {
m = shape_a[0];
k = shape_a[1];
} else {
m = shape_a[1];
k = shape_a[0];
}
if (!rhs_transpose) {
n = shape_b[1];
} else {
n = shape_b[0];
}
}
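A quick worked example of the recomputation (hypothetical runtime shapes):

// a: [128, 64], b: [64, 32], lhs_transpose = rhs_transpose = false
// => m = 128, k = 64, n = 32, matching c: [128, 32]

so each execution sees the actual extents instead of the placeholder values captured when the kernel was constructed.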

template <>
void MatmulImpl<float>::Execute(const float *a_val, const float *b_val,
float *c_val, cublasHandle_t handle,
2 changes: 2 additions & 0 deletions runtime/lib/backends/cuda/providers/default/math/matmul.h
@@ -30,6 +30,8 @@ template <typename T> class MatmulImpl {
public:
explicit MatmulImpl(const OpAccessor &accessor);

void ProloguePerExecute(const OpAccessor &);

void Execute(const T *a_val, const T *b_val, T *c_val, cublasHandle_t handle,
cudaStream_t stream);

5 changes: 5 additions & 0 deletions runtime/lib/core/context/execution_frame.cc
@@ -186,6 +186,11 @@ void BRTInferenceExecutionFrame::BindArg(size_t idx, const void *ptr) {
}

void *BRTInferenceExecutionFrame::GetArg(size_t idx) {
// debug-only path: fetch a weight pointer directly
if (idx >= info_.graph_info.io_count) {
return ctx_.weights_and_ios[idx - info_.graph_info.io_count];
}

BRT_ENFORCE(idx < info_.graph_info.io_count);
int i = info_.weights.size() + idx;

31 changes: 22 additions & 9 deletions runtime/lib/core/framework/execution_plan.cc
@@ -337,16 +337,20 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
return WalkResult::interrupt();
}

std::string space;
IAllocator *cur_allocator;
if (op_arg.getType().isa<MemRefType>()) {
auto maybeSpace = brt::ir::GetSpace(op_arg);
if (!maybeSpace.has_value()) {
status_internal =
Status(BRT, FAIL, "non-memref Arg of Op " + key);
return WalkResult::interrupt();
}

space = maybeSpace.value();
cur_allocator = GetAllocator(allocators, space);
last_alloc = cur_allocator;
}
// skip if visited
if (visited_ptrs.count(arg_ptr) != 0) {
continue;
@@ -366,6 +370,10 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
graph_info_.tensor_to_id.emplace(arg_ptr,
graph_info_.tensors.size());
graph_info_.tensors.push_back(arg_ptr);
} else if (op_arg.getType().isa<IndexType>()) {
int64_t scalar_index = graph_info_.scalars.size();
graph_info_.scalar_to_id.emplace(arg_ptr, scalar_index);
graph_info_.scalars.push_back(arg_ptr);
} else {
status_internal =
Status(BRT, FAIL, " non-supported Arg Type of Op " + key);
@@ -473,6 +481,11 @@ common::Status StaticBRTExecutionPlan::ProloguePerSession(
return WalkResult::interrupt();
}

// IndexType args carry the PTXOp launch config; they are scalars, not
// tensors, so skip the tensor lookup for them.
if (op_arg.getType().isa<IndexType>()) {
continue;
}

auto found_arg = graph_info_.tensor_to_id.find(arg_ptr);
if (found_arg == graph_info_.tensor_to_id.end()) {
status_internal = Status(BRT, FAIL, "cannot find arg");
Expand Down