@@ -62,7 +62,7 @@ class TransposeGpuFwdKernel : public GpuKernel {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output.";
      return false;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    shape_size_ = input_shape.size();
    if (shape_size_ > TRANSPOSE_MAX_DIMENSION) {
      MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION
@@ -52,6 +52,8 @@ TypeId KernelBuildInfo::GetOutputDeviceType(size_t output_index) const {
  return outputs_device_type_[output_index];
}

const std::string &KernelBuildInfo::GetOriginDataFormat() const { return origin_data_format_; }

const std::vector<std::string> &KernelBuildInfo::GetAllInputFormats() const { return inputs_format_; }

const std::vector<std::string> &KernelBuildInfo::GetAllOutputFormats() const { return outputs_format_; }
@@ -132,6 +134,11 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetKernelType(const KernelType &ke
  kernel_build_info_->kernel_type_ = kernel_type;
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetOriginDataFormat(const std::string &origin_data_format) {
  MS_EXCEPTION_IF_NULL(kernel_build_info_);
  kernel_build_info_->origin_data_format_ = origin_data_format;
}

void KernelBuildInfo::KernelBuildInfoBuilder::SetInputsFormat(const std::vector<std::string> &inputs_format) {
  MS_EXCEPTION_IF_NULL(kernel_build_info_);
  kernel_build_info_->inputs_format_ = inputs_format;
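// Illustrative sketch (not part of this change): how the new origin-format field
// is meant to be used together with the per-tensor formats. Values are examples.
//   kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
//   builder.SetOriginDataFormat(kOpFormat_NCHW);  // layout the model was written in
//   builder.SetInputsFormat({kOpFormat_NHWC});    // layout selected for the device kernel
//   builder.SetOutputsFormat({kOpFormat_NHWC});
//   auto info = builder.Build();
//   // A pass can then compare info->GetOriginDataFormat() with
//   // info->GetAllInputFormats() to decide whether a transpose is required.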
@@ -38,6 +38,7 @@ class KernelBuildInfo {
    op_pattern_ = kCommonPattern;
    input_reshape_type_ = {};
    output_reshape_type_ = {};
    origin_data_format_ = kOpFormat_DEFAULT;
    inputs_format_ = {};
    outputs_format_ = {};
    inputs_device_type_ = {};
@@ -64,6 +65,8 @@ class KernelBuildInfo {
  std::vector<Axis> GetOutputReshapeType(size_t input_index) const;
  const std::string &GetOriginDataFormat() const;
  const std::vector<std::string> &GetAllInputFormats() const;
  const std::vector<std::string> &GetAllOutputFormats() const;
@@ -97,6 +100,7 @@ class KernelBuildInfo {
 private:
  KernelType kernel_type_;
  std::string origin_data_format_;
  std::vector<std::string> inputs_format_;
  OpPattern op_pattern_;
  std::vector<std::string> outputs_format_;
@@ -135,6 +139,8 @@ class KernelBuildInfo::KernelBuildInfoBuilder {
  void SetKernelType(const KernelType &kernel_type);
  void SetOriginDataFormat(const std::string &origin_data_format);
  void SetInputsFormat(const std::vector<std::string> &inputs_format);
  void SetOutputsFormat(const std::vector<std::string> &outputs_format);
@@ -506,6 +506,45 @@ std::shared_ptr<std::vector<std::pair<AnfNodePtr, int>>> GetRealNodeUsedList(con
  return output_node_list;
}

std::shared_ptr<std::vector<std::pair<AnfNodePtr, int>>> GetRealNodeUsedListByOutputIdx(const FuncGraphPtr &graph,
                                                                                        const AnfNodePtr &node,
                                                                                        size_t output_index) {
  auto output_node_list = std::make_shared<std::vector<std::pair<AnfNodePtr, int>>>();
  MS_EXCEPTION_IF_NULL(graph);
  auto manager = graph->manager();
  MS_EXCEPTION_IF_NULL(manager);
  auto iter = manager->node_users().find(node);
  if (iter == manager->node_users().end()) {
    MS_LOG(EXCEPTION) << "node has no output in manager";
  }
  auto output_info_list = iter->second;
  for (const auto &output_info : output_info_list) {
    if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimControlDepend->name()) {
      continue;
    }
    if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimDepend->name() &&
        output_info.second == kDependAttachNodeIndex) {
      continue;
    }
    size_t used_output_index;
    if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimTupleGetItem->name()) {
      used_output_index = AnfAlgo::GetTupleGetItemOutIndex(utils::cast<CNodePtr>(output_info.first));
    } else if (AnfAlgo::GetCNodeName(node) == prim::kPrimTupleGetItem->name()) {
      used_output_index = output_index;
    } else {
      auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(output_info.first, output_info.second - 1);
      if (kernel_with_index.first.get() != node.get()) {
        MS_LOG(EXCEPTION) << "Get used node failed for op[" << AnfAlgo::GetCNodeName(node) << "]";
      }
      used_output_index = kernel_with_index.second;
    }
    if (used_output_index == output_index) {
      output_node_list->push_back(output_info);
    }
  }
  return output_node_list;
}
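
// Illustrative example (not from the diff): if conv = Conv2D(x, w) feeds both
// relu = ReLU(conv) and cast = Cast(conv), then
// GetRealNodeUsedListByOutputIdx(graph, conv, 0) returns {(relu, 1), (cast, 1)},
// where each pair is (user node, 1-based input position within that user).
// Depend/ControlDepend edges are skipped, and TupleGetItem users are matched
// through their getitem index instead of their input edge.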
bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(node);
@@ -172,6 +172,10 @@ bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node);
std::shared_ptr<std::vector<std::pair<AnfNodePtr, int>>> GetRealNodeUsedList(const FuncGraphPtr &graph,
                                                                             const AnfNodePtr &node);
std::shared_ptr<std::vector<std::pair<AnfNodePtr, int>>> GetRealNodeUsedListByOutputIdx(const FuncGraphPtr &graph,
                                                                                        const AnfNodePtr &node,
                                                                                        size_t output_index);
void ConstInputToAttr(const CNodePtr &cnode, const std::unordered_set<size_t> &input_attrs);
bool AnfEqual(const BaseRef &a, const BaseRef &b);
@@ -0,0 +1,151 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/optimizer/gpu/insert_format_transform_op.h"

#include <memory>
#include <vector>
#include <string>

#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
#include "runtime/device/gpu/kernel_info_setter.h"

namespace mindspore {
namespace opt {
namespace {
std::vector<int> TransposeAxis(const std::string &src_format, const std::string &dst_format) {
  if ((src_format == kOpFormat_NCHW) && (dst_format == kOpFormat_NHWC)) {
    return {0, 2, 3, 1};
  } else if ((src_format == kOpFormat_NHWC) && (dst_format == kOpFormat_NCHW)) {
    return {0, 3, 1, 2};
  } else {
    MS_LOG(EXCEPTION) << "Invalid format transform, from " << src_format << " to " << dst_format;
  }
}
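
// Illustrative (not from the diff): the returned vector is a permutation of
// source axes, i.e. dst_shape[i] = src_shape[perm[i]]. Applying {0, 2, 3, 1}
// to an NCHW shape {N, C, H, W} yields {N, H, W, C}, and {0, 3, 1, 2} is its
// inverse, mapping NHWC back to NCHW.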

void SetTransposeOpBuildInfo(const std::string &input_format, const std::string &output_format,
                             const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  auto input_type = AnfAlgo::GetPrevNodeOutputInferDataType(node, 0);
  auto output_type = AnfAlgo::GetOutputInferDataType(node, 0);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
  builder.SetInputsFormat({input_format});
  builder.SetInputsDeviceType({input_type});
  builder.SetOutputsFormat({output_format});
  builder.SetOutputsDeviceType({output_type});
  builder.SetKernelType(UNKNOWN_KERNEL_TYPE);
  builder.SetProcessor(kernel::Processor::CUDA);
  AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), node.get());
}

// Insert a transpose op between node and used_node; used_node_index is the input position of used_node.
CNodePtr InsertTransposeOp(const FuncGraphPtr &graph, const AnfNodePtr &node, const AnfNodePtr &used_node,
                           int used_node_index, const std::vector<int> &transpose_perm) {
  MS_EXCEPTION_IF_NULL(graph);
  // 1. Create a transpose node.
  auto transpose_prim = std::make_shared<Primitive>(prim::kPrimTranspose->name());
  MS_EXCEPTION_IF_NULL(transpose_prim);
  // 2. Set the input of transpose.
  std::vector<AnfNodePtr> transpose_input = {NewValueNode(transpose_prim), node};
  auto transpose_op = graph->NewCNode(transpose_input);
  // 3. Set the output info of transpose.
  std::vector<TypeId> transpose_type = {AnfAlgo::GetPrevNodeOutputInferDataType(used_node, used_node_index)};
  std::vector<std::vector<size_t>> transpose_shape = {AnfAlgo::GetPrevNodeOutputInferShape(used_node, used_node_index)};
  AnfAlgo::SetOutputInferTypeAndShape(transpose_type, transpose_shape, transpose_op.get());
  AnfAlgo::SetNodeAttr(kAttrPerm, MakeValue(transpose_perm), transpose_op);
  // 4. Set the input of used_node.
  MS_LOG(DEBUG) << "Node: " << node->fullname_with_scope() << ", used node: " << used_node->fullname_with_scope()
                << ", index: " << used_node_index;
  AnfAlgo::SetNodeInput(utils::cast<CNodePtr>(used_node), transpose_op, used_node_index);
  // 5. Update the manager info of the transpose op.
  FuncGraphManagerPtr manager = graph->manager();
  MS_EXCEPTION_IF_NULL(manager);
  manager->Clear();
  manager->AddFuncGraph(graph);
  return transpose_op;
}
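
// Illustrative (not from the diff): for y = ReLU(x) where ReLU selected NHWC
// for input 0 while the graph is NCHW, the call
//   InsertTransposeOp(graph, x, relu, 0, TransposeAxis(kOpFormat_NCHW, kOpFormat_NHWC));
// rewires the graph to x -> Transpose(perm={0, 2, 3, 1}) -> ReLU, and
// SetTransposeOpBuildInfo then marks the transpose as DEFAULT in / NHWC out.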
}  // namespace

const AnfNodePtr InsertFormatTransformOp::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
                                                  const EquivPtr &equiv) const {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(equiv);
  if (!AnfAlgo::IsRealCNodeKernel(node)) {
    return nullptr;
  }
  auto iter = device::gpu::kKernelFormatPositionMap.find(AnfAlgo::GetCNodeName(node));
  if (iter == device::gpu::kKernelFormatPositionMap.end()) {
    return nullptr;
  }
  auto origin_data_format = AnfAlgo::GetOriginDataFormat(node);
  if (origin_data_format == kOpFormat_DEFAULT) {
    origin_data_format = kOpFormat_NCHW;
  }
  MS_LOG(DEBUG) << "Process node: " << node->fullname_with_scope();
  // Insert input transpose from origin_data_format to input_format.
  auto inputs_format = AnfAlgo::GetAllInputFormats(node);
  for (size_t i = 0; i < inputs_format.size(); i++) {
    if ((inputs_format[i] != kOpFormat_DEFAULT) && (inputs_format[i] != origin_data_format)) {
      auto input_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), i);
      MS_EXCEPTION_IF_NULL(input_node);
      auto transpose_perm = TransposeAxis(origin_data_format, inputs_format[i]);
      auto transpose_op = InsertTransposeOp(graph, input_node, node, i, transpose_perm);
      SetTransposeOpBuildInfo(kOpFormat_DEFAULT, inputs_format[i], transpose_op);
    }
  }
  // Insert output transpose from output_format to origin_data_format.
  auto outputs_format = AnfAlgo::GetAllOutputFormats(node);
  for (size_t i = 0; i < outputs_format.size(); i++) {
    if ((outputs_format[i] != kOpFormat_DEFAULT) && (outputs_format[i] != origin_data_format)) {
      // Find all nodes connected to this output of node, and redirect their inputs to the transpose.
      auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, i);
      for (size_t j = 0; j < used_node_list->size(); j++) {
        auto used_node = used_node_list->at(j).first;
        auto used_node_index = used_node_list->at(j).second - 1;
        auto transpose_perm = TransposeAxis(outputs_format[i], origin_data_format);
        if (AnfAlgo::GetCNodeName(used_node) == prim::kPrimTupleGetItem->name()) {
          MS_LOG(DEBUG) << "The used node of [" << node->fullname_with_scope() << "] is a tuple getitem.";
          // The tuple getitem's own users have to be looked up again.
          ProcessForTupleItem(graph, used_node, used_node_index, transpose_perm, outputs_format[i]);
          continue;
        }
        auto transpose_op = InsertTransposeOp(graph, node, used_node, used_node_index, transpose_perm);
        SetTransposeOpBuildInfo(outputs_format[i], kOpFormat_DEFAULT, transpose_op);
      }
    }
  }
  return node;
}

void InsertFormatTransformOp::ProcessForTupleItem(const FuncGraphPtr &graph, const AnfNodePtr &node, int node_index,
                                                  const std::vector<int> &transpose_perm,
                                                  const std::string &transpose_format) const {
  auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, node_index);
  for (size_t i = 0; i < used_node_list->size(); i++) {
    auto used_node = used_node_list->at(i).first;
    auto used_node_index = used_node_list->at(i).second - 1;
    if (AnfAlgo::GetCNodeName(used_node) == prim::kPrimTupleGetItem->name()) {
      MS_LOG(EXCEPTION) << "The used node of a tuple getitem can't be another tuple getitem.";
    }
    auto transpose_op = InsertTransposeOp(graph, node, used_node, used_node_index, transpose_perm);
    SetTransposeOpBuildInfo(transpose_format, kOpFormat_DEFAULT, transpose_op);
  }
}
}  // namespace opt
}  // namespace mindspore
@@ -0,0 +1,39 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_INSERT_FORMAT_TRANSFORM_OP_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_INSERT_FORMAT_TRANSFORM_OP_H_

#include <memory>
#include <string>
#include <vector>
#include "backend/optimizer/common/optimizer.h"

namespace mindspore {
namespace opt {
class InsertFormatTransformOp : public PatternProcessPass {
 public:
  explicit InsertFormatTransformOp(bool multigraph = true)
      : PatternProcessPass("insert_format_transform_op", multigraph) {}
  ~InsertFormatTransformOp() override = default;
  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

 private:
  void ProcessForTupleItem(const FuncGraphPtr &graph, const AnfNodePtr &node, int node_index,
                           const std::vector<int> &transpose_perm, const std::string &transpose_format) const;
};
}  // namespace opt
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_INSERT_FORMAT_TRANSFORM_OP_H_
@@ -0,0 +1,65 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "backend/optimizer/gpu/remove_format_transform_pair.h"

#include <memory>
#include <vector>
#include <string>

#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"

namespace mindspore {
namespace opt {
const BaseRef RemoveFormatTransformPair::DefinePattern() const {
  VarPtr X = std::make_shared<Var>();
  MS_EXCEPTION_IF_NULL(X);
  VectorRef transpose1 = VectorRef({prim::kPrimTranspose, X});
  VectorRef transpose2 = VectorRef({prim::kPrimTranspose, transpose1});
  return transpose2;
}

const AnfNodePtr RemoveFormatTransformPair::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
                                                    const EquivPtr &equiv) const {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(equiv);
  MS_LOG(DEBUG) << "Process node:" << node->fullname_with_scope();
  auto input_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
  MS_EXCEPTION_IF_NULL(input_node);
  if (AnfAlgo::GetCNodeName(node) != prim::kPrimTranspose->name() ||
      AnfAlgo::GetCNodeName(input_node) != prim::kPrimTranspose->name()) {
    MS_LOG(EXCEPTION) << "The pattern is not a transpose pair, "
                      << "node:" << AnfAlgo::GetCNodeName(node) << " node input:" << AnfAlgo::GetCNodeName(input_node);
  }
  // If the transpose operator is used by more than one other operator, it cannot be deleted directly.
  if (IsUsedByOthers(graph, input_node)) {
    MS_LOG(DEBUG) << "The transpose node [" << input_node->fullname_with_scope()
                  << "] is used by more than one other operator.";
    return nullptr;
  }
  auto transpose1_input_shape = AnfAlgo::GetInputDeviceShape(input_node, 0);
  auto transpose2_output_shape = AnfAlgo::GetOutputDeviceShape(node, 0);
  if (transpose2_output_shape == transpose1_input_shape) {
    auto transpose1_input_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(input_node), 0);
    MS_EXCEPTION_IF_NULL(transpose1_input_node);
    return transpose1_input_node;
  }
  return nullptr;
}
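
// Illustrative (not from the diff): this pass folds
//   x -> Transpose(NCHW->NHWC) -> Transpose(NHWC->NCHW) -> y
// into x -> y, which is exactly the pattern left behind when
// InsertFormatTransformOp places an output transpose of one kernel directly
// in front of an input transpose of the next one.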
}  // namespace opt
}  // namespace mindspore
@@ -0,0 +1,34 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REMOVE_FORMAT_TRANSFORM_PAIR_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REMOVE_FORMAT_TRANSFORM_PAIR_H_

#include <memory>
#include "backend/optimizer/common/optimizer.h"

namespace mindspore {
namespace opt {
class RemoveFormatTransformPair : public PatternProcessPass {
 public:
  explicit RemoveFormatTransformPair(bool multigraph = true)
      : PatternProcessPass("remove_format_transform_pair", multigraph) {}
  ~RemoveFormatTransformPair() override = default;
  const BaseRef DefinePattern() const override;
  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
}  // namespace opt
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REMOVE_FORMAT_TRANSFORM_PAIR_H_
@@ -353,6 +353,48 @@ size_t AnfRuntimeAlgorithm::GetOutputTensorNum(const AnfNodePtr &node) {
  }
}

std::vector<std::string> AnfRuntimeAlgorithm::GetAllOutputFormats(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  if (!AnfAlgo::IsRealKernel(node)) {
    MS_LOG(EXCEPTION) << "Not real kernel:"
                      << "#node [" << node->DebugString() << "]";
  }
  auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  auto build_info = kernel_info->select_kernel_build_info();
  MS_EXCEPTION_IF_NULL(build_info);
  auto format = build_info->GetAllOutputFormats();
  return format;
}

std::vector<std::string> AnfRuntimeAlgorithm::GetAllInputFormats(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  if (!AnfAlgo::IsRealKernel(node)) {
    MS_LOG(EXCEPTION) << "Not real kernel:"
                      << "#node [" << node->DebugString() << "]";
  }
  auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  auto build_info = kernel_info->select_kernel_build_info();
  MS_EXCEPTION_IF_NULL(build_info);
  auto format = build_info->GetAllInputFormats();
  return format;
}

std::string AnfRuntimeAlgorithm::GetOriginDataFormat(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  if (!AnfAlgo::IsRealKernel(node)) {
    MS_LOG(EXCEPTION) << "Not real kernel:"
                      << "#node [" << node->DebugString() << "]";
  }
  auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  auto build_info = kernel_info->select_kernel_build_info();
  MS_EXCEPTION_IF_NULL(build_info);
  auto format = build_info->GetOriginDataFormat();
  return format;
}

std::string AnfRuntimeAlgorithm::GetOutputFormat(const AnfNodePtr &node, size_t output_idx) {
  MS_EXCEPTION_IF_NULL(node);
  if (output_idx > GetOutputTensorNum(node)) {
@@ -829,7 +871,7 @@ void AnfRuntimeAlgorithm::SetKernelMod(const KernelModPtr &kernel_mod, AnfNode *
bool AnfRuntimeAlgorithm::IsRealKernel(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  // parameter and value node is not a real kernel too
  // parameter and value node is a real kernel too
  if (!node->isa<CNode>()) {
    return true;
  }
@@ -101,6 +101,12 @@ class AnfRuntimeAlgorithm {
  static size_t GetInputTensorNum(const AnfNodePtr &node);
  // get the number of outputs of the real_kernel (which can be built and run on device)
  static size_t GetOutputTensorNum(const AnfNodePtr &node);
  // get the formats selected for all outputs of the anf node
  static std::vector<std::string> GetAllOutputFormats(const AnfNodePtr &node);
  // get the formats selected for all inputs of the anf node
  static std::vector<std::string> GetAllInputFormats(const AnfNodePtr &node);
  // get the origin data format selected for the anf node
  static std::string GetOriginDataFormat(const AnfNodePtr &node);
  // get the format selected for one output of the anf node
  static std::string GetOutputFormat(const AnfNodePtr &node, size_t output_idx);
  // get the format selected for one input of the anf node
@@ -30,6 +30,8 @@
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include "backend/optimizer/gpu/insert_format_transform_op.h"
#include "backend/optimizer/gpu/remove_format_transform_pair.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/ms_utils.h"
#include "common/trans.h"
@@ -76,6 +78,8 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::InsertFormatTransformOp>());
  pm->AddPass(std::make_shared<opt::RemoveFormatTransformPair>());
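  // Note (reviewer, illustrative): the order matters -- InsertFormatTransformOp runs
  // first so that RemoveFormatTransformPair can fold any back-to-back transpose
  // pairs the insertion pass produced between adjacent NHWC kernels.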
  pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  pm->AddPass(std::make_shared<opt::GetitemTuple>());
  optimizer->AddPassManager(pm);
@@ -203,7 +207,7 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
  }
  // Assign CUDA streams
  AssignStream(graph);
  // Hide NoOp from execution graph
  // Hide NopOp from execution graph
  opt::HideNopNode(graph.get());
  // Build kernel if node is cnode
  BuildKernel(graph);
@@ -213,7 +217,7 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
  graph->set_execution_order(execution_order);
  // Get summary nodes.
  SetSummaryNodes(graph.get());
  // Remove NoOp from execution graph
  // Remove NopOp from execution graph
  opt::RemoveNopNode(graph.get());
  // Set graph manager.
  MS_EXCEPTION_IF_NULL(context_);
@@ -272,7 +276,7 @@ void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_in
  MS_EXCEPTION_IF_NULL(kernel_graph);
  SelectKernel(kernel_graph);
  StartKernelRT();
  // Hide NoOp from execution graph
  // Hide NopOp from execution graph
  opt::HideNopNode(kernel_graph.get());
  BuildKernel(kernel_graph);
  run_op_graphs_[graph_info] = kernel_graph;
@@ -282,7 +286,7 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
                            const std::vector<tensor::TensorPtr> &input_tensors) {
  auto kernel_graph = run_op_graphs_[graph_info];
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // Remove NoOp from execution graph
  // Remove NopOp from execution graph
  opt::RemoveNopNode(kernel_graph.get());
  RunOpAllocateMemory(op_run_info.value, input_tensors, kernel_graph.get());
  // Execute the computation
@@ -252,7 +252,7 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, con
      return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "invalid input Data, please check it.");
    } else {
      if (!stop_send_) {
        MS_LOG(WARNING) << "Retry pushing data...";
        MS_LOG(DEBUG) << "Retry pushing data...";
        continue;
      }
      break;
@@ -96,10 +96,10 @@ BlockQueueStatus_T BlockingQueue::Create(void *addr, const std::vector<size_t> &
void BlockingQueue::RegisterRelease(const std::function<void(void *)> &func) { queue_->RegisterRelease(func); }

BlockQueueStatus_T BlockingQueue::Push(const std::vector<DataItemGpu> &data, unsigned int timeout_in_sec) {
BlockQueueStatus_T BlockingQueue::Push(const std::vector<DataItemGpu> &data, unsigned int) {
  std::unique_lock<std::mutex> locker(mutex_);
  if (queue_->IsFull()) {
    if (not_full_cond_.wait_for(locker, std::chrono::seconds(timeout_in_sec)) == std::cv_status::timeout) {
    if (not_full_cond_.wait_for(locker, std::chrono::microseconds(100)) == std::cv_status::timeout) {
      return TIMEOUT;
    }
  }
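// Reviewer note (inference, not stated in the diff): Push now ignores the caller's
// timeout and waits at most 100 microseconds before returning TIMEOUT, turning the
// blocking push into a short poll; RetryPushGPUData simply retries on TIMEOUT,
// which is presumably why its retry message was demoted from WARNING to DEBUG above.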
@@ -19,6 +19,7 @@
#include <memory>
#include "backend/kernel_compiler/kernel.h"
#include "utils/utils.h"
#include "utils/ms_context.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/kernel_build_info.h"
#include "backend/session/anf_runtime_algorithm.h"
@@ -157,25 +158,87 @@ void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, co
    }
  }
}

bool IsNeedProcessFormatInfo(const CNodePtr &kernel_node, const std::vector<TypeId> &inputs_type) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (ms_context->execution_mode() == kPynativeMode) {
    return false;
  }
  if (!AnfAlgo::IsRealCNodeKernel(kernel_node)) {
    return false;
  }
  auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  auto iter = kKernelFormatPositionMap.find(kernel_name);
  if (iter == kKernelFormatPositionMap.end()) {
    return false;
  }
  if (inputs_type.size() == 0) {
    return false;
  }
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  if (input_shape.size() != 4) {
    return false;
  }
  return true;
}
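
// Illustrative (not from the diff): with the predicate above, a 4-D float16
// Conv2D in graph mode passes every check, so UpdateKernelFormatInfo below
// rewrites positions {0, 1} of its inputs and {0} of its outputs to NHWC
// (the layout cuDNN generally prefers for half-precision convolutions),
// while float32 kernels keep NCHW.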

void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector<TypeId> &inputs_type,
                            std::vector<std::string> *inputs_format, std::vector<std::string> *outputs_format,
                            std::string *origin_data_format) {
  auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
  auto iter = kKernelFormatPositionMap.find(kernel_name);
  if (iter == kKernelFormatPositionMap.end()) {
    return;
  }
  auto cal_format = (inputs_type[0] == kNumberTypeFloat16) ? kOpFormat_NHWC : kOpFormat_NCHW;
  MS_LOG(DEBUG) << "Kernel node: " << kernel_node->fullname_with_scope() << ", format: " << cal_format;
  auto inputs_format_position = iter->second.first;
  for (const auto &input_format_position : inputs_format_position) {
    if (input_format_position >= inputs_format->size()) {
      MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size ["
                        << inputs_format->size() << "] #kernel_node [" << kernel_node->fullname_with_scope() << "]";
    }
    (*inputs_format)[input_format_position] = cal_format;
  }
  auto outputs_format_position = iter->second.second;
  for (const auto &output_format_position : outputs_format_position) {
    if (output_format_position >= outputs_format->size()) {
      MS_LOG(EXCEPTION) << "The position [" << output_format_position << "] is out of range of the output size ["
                        << outputs_format->size() << "] #kernel_node [" << kernel_node->fullname_with_scope() << "]";
    }
    (*outputs_format)[output_format_position] = cal_format;
  }
  auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
  MS_EXCEPTION_IF_NULL(prim);
  if (prim->HasAttr("data_format")) {
    *origin_data_format = AnfAlgo::GetNodeAttr<std::string>(kernel_node, "data_format");
  }
}
}  // namespace

void SetKernelInfo(const CNodePtr &kernel_node) {
  std::vector<std::string> inputs_format;
  std::vector<TypeId> inputs_type;
  std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> builder =
    std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>();
  for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) {
    inputs_format.emplace_back(kOpFormat_DEFAULT);
    inputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index));
  }
  builder->SetInputsFormat(inputs_format);
  builder->SetInputsDeviceType(inputs_type);
  std::vector<std::string> outputs_format;
  std::vector<TypeId> outputs_type;
  for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_index) {
    outputs_format.emplace_back(kOpFormat_DEFAULT);
    outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index));
  }
  std::string origin_data_format = kOpFormat_DEFAULT;
  if (IsNeedProcessFormatInfo(kernel_node, inputs_type)) {
    UpdateKernelFormatInfo(kernel_node, inputs_type, &inputs_format, &outputs_format, &origin_data_format);
  }
  std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> builder =
    std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>();
  builder->SetOriginDataFormat(origin_data_format);
  builder->SetInputsFormat(inputs_format);
  builder->SetInputsDeviceType(inputs_type);
  builder->SetOutputsFormat(outputs_format);
  builder->SetOutputsDeviceType(outputs_type);
@@ -20,13 +20,35 @@
#include <utility>
#include <string>
#include <vector>
#include <map>
#include "ir/anf.h"
#include "ir/dtype.h"
#include "utils/utils.h"
#include "frontend/operator/ops.h"

namespace mindspore {
namespace device {
namespace gpu {
// map<opName, (inputFormatPosition, outputFormatPosition)>: records which input and output
// positions of each op take the computed format, i.e. where a format transform may need to be inserted.
static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>> kKernelFormatPositionMap = {
  {prim::kPrimConv2D->name(), {{0, 1}, {0}}},
  {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}},
  {prim::kPrimConv2DBackpropFilter->name(), {{0, 1}, {0}}},
  {prim::kPrimRelu->name(), {{0}, {0}}},
  {prim::kPrimReluGrad->name(), {{0, 1}, {0}}},
  {prim::kPrimMaxPool->name(), {{0}, {0}}},
  {prim::kPrimMaxPoolGrad->name(), {{0, 1, 2}, {0}}},
  {kAvgPoolOpName, {{0}, {0}}},
  {kAvgPoolGradGpuOpName, {{0, 1, 2}, {0}}},
  {kTensorAddOpName, {{0, 1}, {0}}},
  {kFusedBatchNormEx, {{0}, {0}}},
  {kFusedBatchNormExWithActivation, {{0}, {0}}},
  {kFusedBatchNormExWithAddAndActivation, {{0, 5}, {0}}},
  {kFusedBatchNormGradEx, {{0, 1}, {0}}},
  {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}},
  {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}},
};
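
// Example (illustrative, not from the diff): reading an entry of the map.
//   auto it = kKernelFormatPositionMap.find(prim::kPrimConv2D->name());
//   const auto &input_positions = it->second.first;    // {0, 1}: data and weight
//   const auto &output_positions = it->second.second;  // {0}: the conv output
// Positions not listed (e.g. the 1-D scale/bias/mean/variance inputs of
// FusedBatchNormEx) keep the default format.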
void SetKernelInfo(const CNodePtr &apply_kernel_ptr);

class KernelAttr {
@@ -189,6 +189,9 @@ constexpr auto kPullOpName = "Pull";
constexpr auto kEmbeddingLookupOpName = "EmbeddingLookup";
constexpr auto kEmbeddingLookupProxyOpName = "EmbeddingLookupProxy";
constexpr auto kPaddingOpName = "Padding";
constexpr auto kAvgPoolOpName = "AvgPool";
constexpr auto kAvgPoolGradGpuOpName = "AvgPoolGradGpu";
constexpr auto kTensorAddOpName = "TensorAdd";

// attr key name
constexpr auto kAttrInputNames = "input_names";