Merge branch 'inference' into cuda_graph
goliaro authored Jan 22, 2024
2 parents 4d79e0d + 57d1883 commit 1043f1d
Showing 97 changed files with 746 additions and 190 deletions.
2 changes: 1 addition & 1 deletion examples/python/keras/seq_cifar10_cnn.py
@@ -56,7 +56,7 @@ def top_level_task():


if __name__ == "__main__":
print("Sequantial model, cifar10 cnn")
print("Sequential model, cifar10 cnn")
configs = ff.get_configs()
ff.init_flexflow_runtime(configs)
top_level_task()
125 changes: 123 additions & 2 deletions include/flexflow/operator.h
@@ -9,6 +9,14 @@
#include "flexflow/utils/dot/record_formatter.h"
#include <vector>

#include <sys/stat.h>
#include <sys/types.h>
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include "flexflow/utils/cuda_helper.h"
#else
#include "flexflow/utils/hip_helper.h"
#endif

namespace FlexFlow {

extern LegionRuntime::Logger::Category log_measure;
@@ -227,13 +235,126 @@ class Op {
assert(false);
};
virtual void print_layer(FFModel const &model) = 0;
template <typename OpMetaType>
static std::string get_op_name_without_uid(OpMetaType *m) {
std::string op_name_without_uid = std::string(m->op_name);
size_t last_underscore = op_name_without_uid.length() - 1;
for (int i = op_name_without_uid.length() - 1; i > 0; i--) {
if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) {
break;
} else if (m->op_name[i] == '_') {
last_underscore = i;
}
}
op_name_without_uid.erase(last_underscore);
return op_name_without_uid;
}
template <typename OpMetaType>
static void save_inference_tensors_to_file(
OpMeta *m,
OpMetaType *m,
int shard_id,
BatchConfig const *bc,
std::vector<GenericTensorAccessorR> input_tensors,
std::vector<GenericTensorAccessorR> weight_tensors,
std::vector<GenericTensorAccessorW> output_tensors);
std::vector<GenericTensorAccessorR> output_tensors,
bool before_kernel = false) {
// Check if output directory exists, and create it if it does not
char const *folder_path = "./inference_tensors";
struct stat st = {0};
if (stat(folder_path, &st) == -1) {
// Directory does not exist, create it
mkdir(folder_path, 0700);
}
// output base filepath, shared by all tensors from the same operator
std::string op_name_without_uid = get_op_name_without_uid(m);
std::string base_filepath =
"./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) +
"_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" +
std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" +
op_name_without_uid + "_shard-id_" + std::to_string(shard_id);
if (before_kernel) {
base_filepath += "_pre";
}
// save batch config, if passed
if (bc != nullptr) {
bc->save_to_file(base_filepath + "_batch-config");
}
// save all inputs
for (int i = 0; i < input_tensors.size(); i++) {
std::string filename = base_filepath + "_input_" + std::to_string(i);
if (input_tensors[i].data_type == DT_FLOAT) {
save_tensor(input_tensors[i].get_float_ptr(),
input_tensors[i].domain.get_volume(),
filename.c_str());
} else if (input_tensors[i].data_type == DT_HALF) {
save_tensor(input_tensors[i].get_half_ptr(),
input_tensors[i].domain.get_volume(),
filename.c_str());
} else if (input_tensors[i].data_type == DT_INT32) {
save_tensor(input_tensors[i].get_int32_ptr(),
input_tensors[i].domain.get_volume(),
filename.c_str());
} else if (input_tensors[i].data_type == DT_INT64) {
save_tensor(input_tensors[i].get_int64_ptr(),
input_tensors[i].domain.get_volume(),
filename.c_str());
} else {
assert(false && "Tensor data type not supported");
}
}
// only dump the weights once
if (m->decoding_step == 0) {
for (int i = 0; i < weight_tensors.size(); i++) {
std::string filename = base_filepath + "_weight_" + std::to_string(i);
if (weight_tensors[i].data_type == DT_FLOAT) {
save_tensor(weight_tensors[i].get_float_ptr(),
weight_tensors[i].domain.get_volume(),
filename.c_str());
} else if (weight_tensors[i].data_type == DT_HALF) {
save_tensor(weight_tensors[i].get_half_ptr(),
weight_tensors[i].domain.get_volume(),
filename.c_str());
} else if (weight_tensors[i].data_type == DT_INT32) {
save_tensor(weight_tensors[i].get_int32_ptr(),
weight_tensors[i].domain.get_volume(),
filename.c_str());
} else if (weight_tensors[i].data_type == DT_INT64) {
save_tensor(weight_tensors[i].get_int64_ptr(),
weight_tensors[i].domain.get_volume(),
filename.c_str());
} else {
assert(false && "Tensor data type not supported");
}
}
}
// save all outputs
for (int i = 0; i < output_tensors.size(); i++) {
std::string filename = base_filepath + "_output_" + std::to_string(i);
if (output_tensors[i].data_type == DT_FLOAT) {
save_tensor(output_tensors[i].get_float_ptr(),
output_tensors[i].domain.get_volume(),
filename.c_str());
} else if (output_tensors[i].data_type == DT_HALF) {
save_tensor(output_tensors[i].get_half_ptr(),
output_tensors[i].domain.get_volume(),
filename.c_str());
} else if (output_tensors[i].data_type == DT_INT32) {
save_tensor(output_tensors[i].get_int32_ptr(),
output_tensors[i].domain.get_volume(),
filename.c_str());
} else if (output_tensors[i].data_type == DT_INT64) {
save_tensor(output_tensors[i].get_int64_ptr(),
output_tensors[i].domain.get_volume(),
filename.c_str());
} else {
assert(false && "Tensor data type not supported");
}
}
// increase count of decoding steps
if (!before_kernel) {
m->decoding_step++;
}
}
virtual bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const = 0;
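A minimal sketch of how an operator's inference task might invoke the new helper (not part of this diff; the wrapper function, its parameters, and the debug guard are illustrative assumptions):

#include "flexflow/operator.h"

using namespace FlexFlow;

// Hedged sketch: dump an operator's tensors after its forward kernel runs.
// Only Op::save_inference_tensors_to_file comes from the diff above; the
// wrapper name and its arguments are assumptions made for illustration.
void dump_debug_tensors(OpMeta *m,
                        BatchConfig const *bc,
                        int shard_id,
                        GenericTensorAccessorR const &input,
                        GenericTensorAccessorR const &weight,
                        GenericTensorAccessorR const &output) {
  // Files land under ./inference_tensors, keyed by model id, decoding step,
  // layer number/name, and shard id, as assembled in base_filepath above.
  Op::save_inference_tensors_to_file(
      m, shard_id, bc, {input}, {weight}, {output});
  // Passing before_kernel = true instead tags the filenames with "_pre" and
  // leaves m->decoding_step unchanged.
}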
1 change: 1 addition & 0 deletions include/flexflow/ops/add_bias_residual_layer_norm_params.h
@@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams {
bool elementwise_affine;
float eps;
bool use_bias;
char name[MAX_OPNAME];
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
};
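The same one-line change repeats across the params headers below: each operator's parameter struct gains a fixed-size name buffer so the user-assigned operator name can travel with its params. A minimal sketch of how such a field might be filled (the helper and its caller are hypothetical, not part of this diff):

#include <cstring>
#include "flexflow/ops/add_bias_residual_layer_norm_params.h"

// Hedged sketch: copy an op's user-visible name into the new params field.
// The helper is hypothetical; MAX_OPNAME is assumed to be the bound of the
// char name[MAX_OPNAME] buffer declared in the params struct.
void copy_op_name_to_params(char const *op_name,
                            FlexFlow::AddBiasResidualLayerNormParams &params) {
  std::strncpy(params.name, op_name, MAX_OPNAME - 1);
  params.name[MAX_OPNAME - 1] = '\0'; // ensure termination on truncation
}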
1 change: 1 addition & 0 deletions include/flexflow/ops/aggregate_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct AggregateParams {
int n;
float lambda_bal;
char name[MAX_OPNAME];
bool is_valid(std::vector<ParallelTensorShape> const &) const;
};
bool operator==(AggregateParams const &, AggregateParams const &);
1 change: 1 addition & 0 deletions include/flexflow/ops/aggregate_spec_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct AggregateSpecParams {
int n;
float lambda_bal;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(AggregateSpecParams const &, AggregateSpecParams const &);
1 change: 1 addition & 0 deletions include/flexflow/ops/arg_topk_params.h
@@ -12,6 +12,7 @@ struct ArgTopKParams {
int k;
bool sorted;
bool speculative_decoding;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(ArgTopKParams const &, ArgTopKParams const &);
1 change: 1 addition & 0 deletions include/flexflow/ops/argmax_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct ArgMaxParams {
bool beam_search;
bool is_valid(ParallelTensorShape const &) const;
char name[MAX_OPNAME];
};
bool operator==(ArgMaxParams const &, ArgMaxParams const &);

1 change: 1 addition & 0 deletions include/flexflow/ops/attention_params.h
@@ -11,6 +11,7 @@ struct MultiHeadAttentionParams {
int embed_dim, num_heads, kdim, vdim;
float dropout;
bool bias, add_bias_kv, add_zero_attn;
char name[MAX_OPNAME];

bool is_valid(std::tuple<ParallelTensorShape,
ParallelTensorShape,
1 change: 1 addition & 0 deletions include/flexflow/ops/batch_matmul_params.h
@@ -6,6 +6,7 @@ namespace FlexFlow {

struct BatchMatmulParams {
int a_seq_length_dim, b_seq_length_dim;
char name[MAX_OPNAME];
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/beam_topk_params.h
@@ -11,6 +11,7 @@ struct BeamTopKParams {
LayerID layer_guid;
bool sorted;
int max_beam_width;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(BeamTopKParams const &, BeamTopKParams const &);
1 change: 1 addition & 0 deletions include/flexflow/ops/cast_params.h
@@ -8,6 +8,7 @@ namespace FlexFlow {

struct CastParams {
DataType dtype;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(CastParams const &, CastParams const &);
2 changes: 1 addition & 1 deletion include/flexflow/ops/concat_params.h
@@ -7,7 +7,7 @@ namespace FlexFlow {

struct ConcatParams {
int axis;

char name[MAX_OPNAME];
bool is_valid(std::vector<ParallelTensorShape> const &) const;
};

1 change: 1 addition & 0 deletions include/flexflow/ops/conv_2d_params.h
@@ -13,6 +13,7 @@ struct Conv2DParams {
padding_w, groups;
ActiMode activation;
bool use_bias;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &input) const;
void solve_dims(ParallelTensorShape const &input,
1 change: 1 addition & 0 deletions include/flexflow/ops/dropout_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct DropoutParams {
float rate;
unsigned long long seed;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(DropoutParams const &, DropoutParams const &);
1 change: 1 addition & 0 deletions include/flexflow/ops/element_binary_params.h
@@ -11,6 +11,7 @@ struct ElementBinaryParams {
LayerID layer_guid;
OperatorType type;
bool inplace_a;
char name[MAX_OPNAME];

bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
1 change: 1 addition & 0 deletions include/flexflow/ops/element_unary_params.h
@@ -12,6 +12,7 @@ struct ElementUnaryParams {
bool inplace;
float scalar = 0.0;
LayerID layer_guid;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/embedding_params.h
@@ -12,6 +12,7 @@ struct EmbeddingParams {
LayerID layer_guid;
AggrMode aggr;
DataType data_type;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/experts_params.h
@@ -17,6 +17,7 @@ struct ExpertsParams {
int experts_internal_dim_size;
bool use_bias;
ActiMode activation;
char name[MAX_OPNAME];

bool is_valid(std::vector<ParallelTensorShape> const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/flat_params.h
@@ -7,6 +7,7 @@
namespace FlexFlow {

struct FlatParams {
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
void solve_dims(ParallelTensorShape const &input,
ParallelDim output_dims[MAX_TENSOR_DIM],
1 change: 1 addition & 0 deletions include/flexflow/ops/gather_params.h
@@ -10,6 +10,7 @@ namespace FlexFlow {
struct GatherParams {
int legion_dim;
LayerID layer_guid;
char name[MAX_OPNAME];
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &input) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/groupby_params.h
@@ -9,6 +9,7 @@ namespace FlexFlow {
struct Group_byParams {
int n;
float alpha;
char name[MAX_OPNAME];
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -16,6 +16,7 @@ struct IncMultiHeadSelfAttentionParams {
scaling_query, qk_prod_scaling, position_bias;
DataType quantization_type;
bool offload;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};

1 change: 1 addition & 0 deletions include/flexflow/ops/layer_norm_params.h
@@ -12,6 +12,7 @@ struct LayerNormParams {
bool elementwise_affine;
float eps;
bool use_bias;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};

1 change: 1 addition & 0 deletions include/flexflow/ops/linear_params.h
@@ -20,6 +20,7 @@ class LinearParams {
float kernel_reg_lambda;
DataType quantization_type;
bool offload;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &input_shape) const;
void solve_dims(const ParallelTensor input,
1 change: 1 addition & 0 deletions include/flexflow/ops/pool_2d_params.h
@@ -10,6 +10,7 @@ struct Pool2DParams {
int kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w;
PoolType pool_type;
ActiMode activation;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &input) const;
void solve_dims(ParallelTensorShape const &input,
1 change: 1 addition & 0 deletions include/flexflow/ops/reduce_params.h
@@ -10,6 +10,7 @@ struct ReduceParams {
std::vector<int> axes;
bool keepdims;
LayerID layer_guid;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/reshape_params.h
@@ -10,6 +10,7 @@ namespace FlexFlow {
struct ReshapeParams {
std::vector<int> shape;
LayerID layer_guid;
char name[MAX_OPNAME];

bool is_valid(ParallelTensorShape const &) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/residual_layer_norm_params.h
@@ -13,6 +13,7 @@ struct ResidualLayerNormParams {
float eps;
bool use_bias;
bool use_two_residuals;
char name[MAX_OPNAME];
bool is_valid(std::tuple<ParallelTensorShape,
ParallelTensorShape,
ParallelTensorShape> const &) const;
1 change: 1 addition & 0 deletions include/flexflow/ops/residual_rms_norm_params.h
@@ -11,6 +11,7 @@ struct ResidualRMSNormParams {
LayerID layer_guid;
float eps;
int dim;
char name[MAX_OPNAME];
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &input) const;
};
1 change: 1 addition & 0 deletions include/flexflow/ops/rms_norm_params.h
@@ -11,6 +11,7 @@ struct RMSNormParams {
LayerID layer_guid;
float eps;
int dim;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};

1 change: 1 addition & 0 deletions include/flexflow/ops/sampling_params.h
@@ -8,6 +8,7 @@ namespace FlexFlow {

struct SamplingParams {
float top_p;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
};
bool operator==(SamplingParams const &, SamplingParams const &);
1 change: 1 addition & 0 deletions include/flexflow/ops/sigmoid_silu_multi_params.h
@@ -8,6 +8,7 @@ namespace FlexFlow {

struct SigmoidSiluMultiParams {
LayerID layer_guid;
char name[MAX_OPNAME];
bool is_valid(
std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
};