This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Commit

Introduce manually optimized CUDA block_reduce function and use it to generate a single reduce kernel (#622) (#637)
SunNy820828449 authored Dec 10, 2021
1 parent de53fce commit 70b04f0
Showing 11 changed files with 821 additions and 351 deletions.
16 changes: 10 additions & 6 deletions cinn/frontend/cinn_builder.cc
@@ -216,17 +216,21 @@ Variable CinnBuilder::Reverse(const Variable& operand, const std::vector<int>& a
   return instr.GetOutput(0);
 }

-std::vector<Variable> CinnBuilder::BnMeanVarianceReduce(const Variable& x) {
-  Instruction instr("bn_mean_variance_reduce", {x});
+std::vector<Variable> CinnBuilder::BnMeanVariance(const Variable& x) {
+  Instruction instr("bn_mean_variance", {x});
+  // optimize bn forward reduce computation, set reduce dimension (NCHW support only, to be deprecated).
+  instr.SetAttr("dim", std::vector<int>{0, 2, 3});
+  instr.SetAttr("keep_dim", false);
   InferShape(instr);
   AppendInstruction(instr);
   return instr.GetOutputs();
 }

-std::vector<Variable> CinnBuilder::BnGradBiasScaleReduce(const Variable& x,
-                                                         const Variable& x_mean,
-                                                         const Variable& y_grad) {
-  Instruction instr("bn_grad_bias_scale_reduce", {x, x_mean, y_grad});
+std::vector<Variable> CinnBuilder::BnGradBiasScale(const Variable& x, const Variable& x_mean, const Variable& y_grad) {
+  Instruction instr("bn_grad_bias_scale", {x, x_mean, y_grad});
+  // optimize bn backward reduce computation, set reduce dimension (NCHW support only, to be deprecated).
+  instr.SetAttr("dim", std::vector<int>{0, 2, 3});
+  instr.SetAttr("keep_dim", false);
   InferShape(instr);
   AppendInstruction(instr);
   return instr.GetOutputs();
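For context on the two hand-built ops above: per the decomposer change below, 'bn_mean_variance' reduces an NCHW input over dims {0, 2, 3} and returns the per-channel sum and sum of squares in a single pass, and 'bn_grad_bias_scale' performs the analogous pair of reductions for the backward pass. The following host-side reference sketch shows that assumed forward semantics; the function name is illustrative and this is not the CINN implementation.

#include <vector>

// Assumed semantics of the fused 'bn_mean_variance' reduction: for a
// contiguous NCHW tensor, return {sum(x), sum(x*x)} per channel, i.e. a
// sum reduction over dims {0, 2, 3} with keep_dim = false.
std::vector<std::vector<float>> BnMeanVarianceReference(const float* x, int N, int C, int H, int W) {
  std::vector<float> sum_x(C, 0.0f), sum_x2(C, 0.0f);
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        const float v = x[(n * C + c) * H * W + hw];
        sum_x[c] += v;       // first output: per-channel sum
        sum_x2[c] += v * v;  // second output: per-channel sum of squares
      }
    }
  }
  return {sum_x, sum_x2};
}

Fusing both sums into one traversal of x is what lets the decomposer below drop its extra Reduce calls.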
4 changes: 2 additions & 2 deletions cinn/frontend/cinn_builder.h
@@ -179,9 +179,9 @@ class CinnBuilder : public BaseBuilder {

   Variable Reverse(const Variable& operand, const std::vector<int>& axis);

-  std::vector<Variable> BnMeanVarianceReduce(const Variable& x);
+  std::vector<Variable> BnMeanVariance(const Variable& x);

-  std::vector<Variable> BnGradBiasScaleReduce(const Variable& x, const Variable& x_mean, const Variable& y_grad);
+  std::vector<Variable> BnGradBiasScale(const Variable& x, const Variable& x_mean, const Variable& y_grad);

  private:
   Variable UnaryOp(const std::string& op_type, const Variable& operand);
16 changes: 6 additions & 10 deletions cinn/frontend/decomposer/batch_norm.cc
@@ -60,15 +60,13 @@ struct BatchNormHelper {
   std::vector<Variable> MeanAndVariance(Variable x) {
 #ifdef CINN_WITH_CUDA
     // To optimize the bn forward pass by merging the reduce computations of mean and variance,
-    // build a fusion op 'BnMeanVarianceReduce' by hand, as the fusion pass does not support this yet.
+    // build a fusion op 'BnMeanVariance' by hand, as the fusion pass does not support this yet.
     // When the fusion pass is rebuilt, this op will be removed.
-    auto vars = builder->BnMeanVarianceReduce(x);
+    auto vars = builder->BnMeanVariance(x);
     auto element_count_1d_0 = GetTensorFromScalar<float>(element_count, "element_count", param_shape);
     auto element_count_1d_1 = GetTensorFromScalar<float>(element_count, "element_count", param_shape);
-    auto mean = builder->Div(builder->Reduce(vars[0], ReduceKind::kSum, std::vector<int>(1, vars[0]->shape.size() - 1)),
-                             element_count_1d_0);
-    auto mean_squre = builder->Div(
-        builder->Reduce(vars[1], ReduceKind::kSum, std::vector<int>(1, vars[1]->shape.size() - 1)), element_count_1d_1);
+    auto mean = builder->Div(vars[0], element_count_1d_0);
+    auto mean_squre = builder->Div(vars[1], element_count_1d_1);

     auto variance = builder->Sub(mean_squre, builder->Mul(mean, builder->Identity(mean)));
 #else
@@ -82,11 +80,9 @@

   std::vector<Variable> GradBiasAndScale(Variable x, Variable x_mean, Variable y_grad) {
 #ifdef CINN_WITH_CUDA
-    // Using fusion op "BnGradBiasScaleReduce" for the same reason as "BnMeanVarianceReduce".
+    // Using fusion op "BnGradBiasScale" for the same reason as "BnMeanVariance".
     // It will also be removed.
-    auto vars = builder->BnGradBiasScaleReduce(x, x_mean, y_grad);
-    return {builder->Reduce(vars[0], ReduceKind::kSum, std::vector<int>(1, vars[0]->shape.size() - 1)),
-            builder->Reduce(vars[1], ReduceKind::kSum, std::vector<int>(1, vars[1]->shape.size() - 1))};
+    return builder->BnGradBiasScale(x, x_mean, y_grad);
 #else
     auto mean_4d = builder->BroadcastTo(x_mean, x->shape, {channel_dim});
     auto x_mean_diff = builder->Sub(x, mean_4d);
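Given those fused outputs, the new MeanAndVariance path only needs to divide by the element count and apply Var[x] = E[x^2] - (E[x])^2, and GradBiasAndScale now returns its two per-channel reductions directly. A minimal sketch of the forward recovery step under those assumptions, with illustrative names (element_count = N * H * W, the number of values reduced per channel):

struct MeanVar {
  float mean;
  float variance;
};

// Recover per-channel statistics from the two fused sums (assumed layout:
// sum_x = sum over N, H, W of x; sum_x2 = sum over N, H, W of x * x).
MeanVar FromFusedSums(float sum_x, float sum_x2, float element_count) {
  const float mean = sum_x / element_count;          // E[x]
  const float mean_square = sum_x2 / element_count;  // E[x^2]
  return {mean, mean_square - mean * mean};          // Var[x] = E[x^2] - (E[x])^2
}

Because both sums come from one fused kernel, mean and variance no longer need separate reduce launches.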
442 changes: 287 additions & 155 deletions cinn/hlir/op/reduction.cc

Large diffs are not rendered by default.
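The bulk of this commit is in cinn/hlir/op/reduction.cc, which introduces the manually optimized CUDA block_reduce and uses it to emit a single reduce kernel. Since that diff is not rendered here, the sketch below shows the standard warp-shuffle block reduction idiom the commit title refers to; it is a generic illustration with assumed names (warp_reduce_sum, block_reduce_sum, reduce_sum_rows) and is not the code added to reduction.cc.

#include <cuda_runtime.h>

// Reduce a value across the 32 lanes of a warp with register shuffles.
__device__ float warp_reduce_sum(float val) {
  for (int offset = 16; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}

// Reduce a value across a whole thread block; result is valid in thread 0.
// Assumes blockDim.x is a multiple of 32 and at most 1024.
__device__ float block_reduce_sum(float val) {
  __shared__ float partial[32];        // one slot per warp
  const int lane = threadIdx.x & 31;
  const int warp = threadIdx.x >> 5;

  val = warp_reduce_sum(val);          // step 1: reduce within each warp
  if (lane == 0) partial[warp] = val;  // step 2: lane 0 publishes the warp sum
  __syncthreads();

  const int num_warps = blockDim.x >> 5;
  val = (threadIdx.x < num_warps) ? partial[lane] : 0.0f;
  if (warp == 0) val = warp_reduce_sum(val);  // step 3: warp 0 combines the partial sums
  return val;
}

// One block per output element: each block reduces cols values to one sum,
// so the whole reduction finishes in a single kernel launch, e.g.
// reduce_sum_rows<<<rows, 256>>>(in, out, cols).
__global__ void reduce_sum_rows(const float* in, float* out, int cols) {
  float acc = 0.0f;
  for (int i = threadIdx.x; i < cols; i += blockDim.x) {
    acc += in[blockIdx.x * cols + i];  // thread-strided pass over the row
  }
  acc = block_reduce_sum(acc);
  if (threadIdx.x == 0) out[blockIdx.x] = acc;
}

Assigning one block per reduced output and combining partial sums in registers and shared memory is the usual way a reduction that would otherwise need several kernel launches can be generated as a single one.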

