From 68fa73f06a594f5b7bd1fe0846bfe27c4acbd1e6 Mon Sep 17 00:00:00 2001 From: LianLiguang Date: Thu, 10 Dec 2020 22:26:44 +0800 Subject: [PATCH] change cast and weight format --- .../ascend/ascend_backend_optimization.cc | 33 +++---- .../ascend/format_type/convert_cast_format.cc | 69 ++++++++++++++ .../ascend/format_type/convert_cast_format.h | 35 ++++++++ .../ccsrc/backend/session/ascend_session.cc | 2 +- .../device/ascend/kernel_select_ascend.cc | 90 ++++++++++++------- 5 files changed, 175 insertions(+), 54 deletions(-) create mode 100644 mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.cc create mode 100644 mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.h diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc index 04869ff153..5fbe4e62bf 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc +++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc @@ -72,6 +72,7 @@ #include "backend/optimizer/ascend/format_type/chang_axis_of_reduce_kernel.h" #include "backend/optimizer/ascend/format_type/split_unsupported_transdata.h" #include "backend/optimizer/ascend/format_type/insert_reshape_for_extract_image_patches_op.h" +#include "backend/optimizer/ascend/format_type/convert_cast_format.h" #include "backend/optimizer/pass/getitem_tuple.h" #include "backend/optimizer/pass/optimize_dependence.h" #include "backend/optimizer/pass/erase_visit_attr.h" @@ -188,27 +189,6 @@ void AddAscendIRFusionPass(PassManager *ir_fusion_pm) { ir_fusion_pm->AddPass(std::make_shared()); } } // namespace - -void RunOpAscendDataLayout(const std::shared_ptr &kernel_graph) { - MS_EXCEPTION_IF_NULL(kernel_graph); - auto optimizer = std::make_shared(); - auto data_layout_pm = std::make_shared("pynative_transop_pm"); - data_layout_pm->AddPass(std::make_shared()); - 
data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); - optimizer->AddPassManager(data_layout_pm); - (void)optimizer->Optimize(kernel_graph); - kernel_graph->SetExecOrderByDefault(); -} - void AscendGraphKernelCommonProcess(const std::shared_ptr &kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto optimizer = std::make_shared(); @@ -228,8 +208,17 @@ void AscendDataLayout(const std::shared_ptr &kernel_graph) auto data_layout_pm = std::make_shared("transop_pm"); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); + data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); - data_layout_pm->AddPass(std::make_shared()); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->get_param(MS_CTX_EXECUTION_MODE) == kPynativeMode) { + data_layout_pm->AddPass(std::make_shared()); + } else { + data_layout_pm->AddPass(std::make_shared()); + data_layout_pm->AddPass(std::make_shared()); + data_layout_pm->AddPass(std::make_shared()); + } data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.cc new file mode 100644 index 0000000000..763f0da805 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "backend/optimizer/ascend/format_type/convert_cast_format.h" + +#include + +#include "backend/session/anf_runtime_algorithm.h" +namespace mindspore { +namespace opt { +const BaseRef ConvertCastFormat::DefinePattern() const { + VarPtr X = std::make_shared(); + VarPtr Xs = std::make_shared(); + return VectorRef({X, Xs}); +} + +const AnfNodePtr ConvertCastFormat::Process(const mindspore::FuncGraphPtr &, const mindspore::AnfNodePtr &node, + const mindspore::EquivPtr &) const { + if (node == nullptr || !node->isa() || !AnfAlgo::IsRealCNodeKernel(node)) { + return nullptr; + } + auto node_name = AnfAlgo::GetCNodeName(node); + if (node_name == prim::kPrimCast->name()) { + return nullptr; + } + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(cnode); ++input_index) { + auto input_node = AnfAlgo::VisitKernelWithReturnType(AnfAlgo::GetInputNode(cnode, input_index), 0).first; + MS_EXCEPTION_IF_NULL(input_node); + if (!input_node->isa()) { + continue; + } + auto cast_node = input_node->cast(); + MS_EXCEPTION_IF_NULL(cast_node); + auto input_node_name = AnfAlgo::GetCNodeName(cast_node); + if (input_node_name != prim::kPrimCast->name()) { + continue; + } + auto format = AnfAlgo::GetInputFormat(node, input_index); + auto cast_input_node = AnfAlgo::VisitKernelWithReturnType(AnfAlgo::GetInputNode(cast_node, 0), 0).first; + auto 
cast_input_format = AnfAlgo::GetOutputFormat(cast_input_node, 0); + // change the cast format to default, which can be faster when casting between other hw formats + if (cast_input_format != format) { + if (cast_input_format == kOpFormat_DEFAULT || format == kOpFormat_DEFAULT) { + auto info_builder = std::make_shared( + AnfAlgo::GetSelectKernelBuildInfo(cast_node)); + info_builder->SetInputsFormat({kOpFormat_DEFAULT}); + info_builder->SetOutputsFormat({kOpFormat_DEFAULT}); + AnfAlgo::SetSelectKernelBuildInfo(info_builder->Build(), cast_node.get()); + } + } + } + return nullptr; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.h b/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.h new file mode 100644 index 0000000000..5c7f5a2a0d --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/convert_cast_format.h @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_CONVERT_CAST_FORMAT_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_CONVERT_CAST_FORMAT_H_ + +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ConvertCastFormat : public PatternProcessPass { + public: + explicit ConvertCastFormat(bool multigraph = true) : PatternProcessPass("convert_cast_format", multigraph) {} + ~ConvertCastFormat() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + bool NeedChangeCastFormat(); +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_FORMAT_TYPE_CONVERT_CAST_FORMAT_H_ diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 5881c5f68a..16abae939e 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -669,7 +669,7 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector &kernel_graph) const { MS_LOG(INFO) << "Start"; // data layout optimization - opt::RunOpAscendDataLayout(kernel_graph); + opt::AscendDataLayout(kernel_graph); // mixed precision optimization opt::AscendMixPrecision(kernel_graph); MS_LOG(INFO) << "Finish"; diff --git a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc index a134a6165c..b33957c10a 100644 --- a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc +++ b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc @@ -39,8 +39,9 @@ namespace mindspore { namespace device { namespace ascend { namespace { -const float kWegihtBaseScore = 1; -const float kFeatureMapBaseScore = 10; +const int kWeightUnInitScore = 1; +const int kWeightInitScore = 2; +const int kFeatureMapBaseScore = 10; 
constexpr auto kPriChoosenFormat = "pri_format"; enum MatchCountPriority : int { MATCH_COUNT_PRIORITY_BEGIN = 0, @@ -140,18 +141,17 @@ void UpdateCurMatchCounts(const kernel::KernelBuildInfo &kernel_build_info, cons MS_EXCEPTION_IF_NULL(kernel_node); MS_EXCEPTION_IF_NULL(cur_kernelinfo_match_counts); if (cur_kernelinfo_match_counts->size() < MATCH_COUNT_PRIORITY_END) { - MS_LOG(EXCEPTION) << "Out of range cur_kernelinfo_match_counts " << MATCH_COUNT_PRIORITY_END; + MS_LOG(EXCEPTION) << "Out of range cur_kernelinfo_match_counts " << MATCH_COUNT_PRIORITY_END; } auto pri_match_format = GetPriorityMatchFormat(kernel_node); for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { - auto input_anf_node = kernel_node->input(input_index + 1); + auto input_anf_node = AnfAlgo::VisitKernelWithReturnType(AnfAlgo::GetInputNode(kernel_node, input_index), 0).first; + MS_EXCEPTION_IF_NULL(input_anf_node); // we do not take ValueNode into consideration in graph kernel. - if (kernel_build_info.kernel_type() == KernelType::AKG_KERNEL) { - if (input_anf_node->isa() && AnfAlgo::GetOutputDeviceDataType(input_anf_node, 0) == kTypeUnknown) { - continue; - } + auto base_score = AnfAlgo::IsFeatureMapInput(kernel_node, input_index) ? kFeatureMapBaseScore : kWeightInitScore; + if (AnfAlgo::GetOutputDeviceDataType(input_anf_node, 0) == kTypeUnknown) { + base_score = kWeightUnInitScore; } - auto base_score = AnfAlgo::IsFeatureMapInput(kernel_node, input_index) ? 
kFeatureMapBaseScore : kWegihtBaseScore; if (kernel_build_info.GetInputFormat(input_index) == AnfAlgo::GetPrevNodeOutputFormat(kernel_node, input_index)) { (*cur_kernelinfo_match_counts)[MATCH_FORMAT_COUNT] += base_score; } @@ -356,6 +356,54 @@ void SetCastAndWeightFormat(const CNodePtr &kernel_node) { info_builder->SetOutputsFormat({format}); AnfAlgo::SetSelectKernelBuildInfo(info_builder->Build(), kernel_node.get()); } + +void SetWeightFormat(const AnfNodePtr &real_input_node, const std::vector &output_format, + const CNodePtr &kernel_node, size_t input_index) { + auto builder = std::make_shared(); + // we set special device info of a input tensor. + bool is_ref = false; + auto op_info = kernel::tbe::TbeDynamicShapeUtil::FindOp(AnfAlgo::GetCNodeName(kernel_node), kernel_node); + if (op_info != nullptr) { + is_ref = op_info->is_ref(); + } + auto selected_kernel_info = AnfAlgo::GetSelectKernelBuildInfo(kernel_node); + if (IsValueNode(real_input_node) && + AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown) { + builder->SetOutputsFormat(output_format); + std::vector output_type = {selected_kernel_info->GetInputDeviceType(input_index)}; + builder->SetOutputsDeviceType(output_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); + return; + } + if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { + builder->SetOutputsFormat(output_format); + std::vector output_type = {selected_kernel_info->GetInputDeviceType(input_index)}; + builder->SetOutputsDeviceType(output_type); + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); + } +} + +bool RefreshCastAndParamWeightFormat(const AnfNodePtr &input_node, const string &format) { + MS_EXCEPTION_IF_NULL(input_node); + if (!input_node->isa()) { + return false; + } + auto cast_node = input_node->cast(); + if (AnfAlgo::GetCNodeName(cast_node) != prim::kPrimCast->name()) { + return true; + } + if 
(AnfAlgo::IsFeatureMapOutput(cast_node)) { + return true; + } + auto info_builder = + std::make_shared(AnfAlgo::GetSelectKernelBuildInfo(input_node)); + info_builder->SetInputsFormat({format}); + info_builder->SetOutputsFormat({format}); + AnfAlgo::SetSelectKernelBuildInfo(info_builder->Build(), cast_node.get()); + auto cast_input_node = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(cast_node, 0), 0); + SetWeightFormat(cast_input_node.first, {format}, cast_node, 0); + return true; +} } // namespace void SetTensorDeviceInfo(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); @@ -367,22 +415,15 @@ void SetTensorDeviceInfo(const CNodePtr &kernel_node) { auto input_with_index = AnfAlgo::VisitKernel(input_kernel_node, 0); MS_EXCEPTION_IF_NULL(input_with_index.first); auto real_input_node = input_with_index.first; - if (real_input_node->isa()) { + if (RefreshCastAndParamWeightFormat(real_input_node, selected_kernel_info->GetInputFormat(input_index))) { continue; } if (real_input_node->isa() && !AnfAlgo::IsParameterWeight(real_input_node->cast())) { continue; } - // we set special device info of a input tensor. 
- bool is_ref = false; - auto op_info = kernel::tbe::TbeDynamicShapeUtil::FindOp(AnfAlgo::GetCNodeName(kernel_node), kernel_node); - if (op_info != nullptr) { - is_ref = op_info->is_ref(); - } if (AnfAlgo::OutputAddrExist(real_input_node, 0)) { continue; } - auto builder = std::make_shared(); auto refresh_format = selected_kernel_info->GetInputFormat(input_index); std::vector output_format = {refresh_format}; // if not find in host convert format map means the host has not registered the convert function of this format @@ -390,20 +431,7 @@ void SetTensorDeviceInfo(const CNodePtr &kernel_node) { refresh_format != kOpFormat_DEFAULT) { output_format = {AnfAlgo::GetOutputFormat(real_input_node, 0)}; } - if (IsValueNode(input_kernel_node) && - AnfAlgo::GetOutputDeviceDataType(input_kernel_node, 0) == kTypeUnknown) { - builder->SetOutputsFormat(output_format); - std::vector output_type = {selected_kernel_info->GetInputDeviceType(input_index)}; - builder->SetOutputsDeviceType(output_type); - AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), input_kernel_node.get()); - continue; - } - if (AnfAlgo::GetOutputDeviceDataType(real_input_node, 0) == kTypeUnknown || is_ref) { - builder->SetOutputsFormat(output_format); - std::vector output_type = {selected_kernel_info->GetInputDeviceType(input_index)}; - builder->SetOutputsDeviceType(output_type); - AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), real_input_node.get()); - } + SetWeightFormat(real_input_node, output_format, kernel_node, input_index); } }