Remove the GE dependency from the CPU build

5 years ago · b18f634912
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,11 +42,13 @@ else()
    include(${CMAKE_SOURCE_DIR}/cmake/dependency_graphengine.cmake)
 endif()

 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/inc)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/inc/external)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/inc/framework)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc/toolchain)
 # Add the graphengine header directories (inc, inc/external, inc/framework and the
 # fwkacllib third-party headers) when ENABLE_GE, ENABLE_D, or ENABLE_TESTCASES is set.
 if (ENABLE_GE OR ENABLE_D OR ENABLE_TESTCASES)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/inc)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/inc/external)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/inc/framework)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/graphengine/third_party/fwkacllib/inc/toolchain)
 endif()

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
 add_subdirectory(mindspore/ccsrc)
--- a/cmake/mind_expression.cmake
+++ b/cmake/mind_expression.cmake
@@ -40,7 +40,7 @@ if (ENABLE_GE)
    include_directories(${CMAKE_SOURCE_DIR}/third_party/ge/include)
    include_directories(${CMAKE_SOURCE_DIR}/third_party/ge/include/external)
    include_directories(${CMAKE_SOURCE_DIR}/third_party/ge/include/external/graph)
 else()
 elseif(ENABLE_D OR ENABLE_TESTCASES)
    include_directories(${CMAKE_SOURCE_DIR}/graphengine/inc)
    include_directories(${CMAKE_SOURCE_DIR}/graphengine/inc/ops)
    include_directories(${CMAKE_SOURCE_DIR}/graphengine/inc/external)
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -34,6 +34,8 @@ if(ENABLE_GPU)
            "device/gpu/*.cu"
            "kernel/gpu/*.cu"
            "kernel/akg/gpu/*.cc"
            "kernel/akg/akgkernelbuild.cc"
            "kernel/akg/akg_kernel_attrs_process.cc"
            )
    file(GLOB_RECURSE GPU_KERNEL_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
            "kernel/gpu/*.cc"
@@ -100,14 +102,14 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "debug/*.cc"
        "onnx/onnx_exporter.cc"
        "operator/*.cc"
        "transform/*.cc"
        "session/kernel_graph.cc"
        "utils/node_utils.cc"
        "session/session_basic.cc"
        "session/session_factory.cc"
        "session/anf_runtime_algorithm.cc"
        "vm/*.cc"
        "pynative/*.cc"
        "pynative/base.cc"
        "pynative/pynative_execute.cc"
        "pybind_api/*.cc"
        "device/common/*.cc"
        "kernel/kernel_query.cc"
@@ -117,7 +119,6 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "device/kernel_runtime.cc"
        "device/kernel_runtime_manager.cc"
        "device/convert_tensor_utils.cc"
        "pre_activate/ascend/*.cc"
        "pre_activate/common/*.cc"
        "pre_activate/pass/*.cc"
        "pre_activate/gpu/*.cc"
@@ -168,6 +169,15 @@ if(ENABLE_DUMP_PROTO)
    add_compile_definitions(ENABLE_DUMP_PROTO)
 endif()

 # When ENABLE_GE is set, glob the sources matched by the patterns below (paths are made
 # relative to CMAKE_CURRENT_SOURCE_DIR) into GE_SRC_LIST and append them to MINDSPORE_SRC_LIST.
 if(ENABLE_GE)
    file(GLOB_RECURSE GE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
            "transform/*.cc"
            "pynative/pynative_execute_ge.cc"
            "pipeline/pipeline_ge.cc"
            )
    list(APPEND MINDSPORE_SRC_LIST ${GE_SRC_LIST})
 endif()

 if(ENABLE_D)
    include_directories("${CMAKE_BINARY_DIR}/kernel/aicpu")
    file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
@@ -188,6 +198,9 @@ if(ENABLE_D)
            "device/kernel_adjust.cc"
            "kernel/kernel_fusion.cc"
            "kernel/tbe/*.cc"
            "pre_activate/ascend/*.cc"
            "transform/*.cc"
            "pipeline/pipeline_ge.cc"
            )
    list(APPEND MINDSPORE_SRC_LIST ${D_SRC_LIST})
    list(APPEND MINDSPORE_PROTO_AICPU_LIST ${PROTOSRCS})
@@ -246,9 +259,11 @@ if (ENABLE_GE)
        target_link_libraries(mindspore graph ge_client)
    endif()
    target_link_libraries(mindspore tsdclient)
 else()
 elseif(ENABLE_D)
    add_compile_definitions(NO_GE_CLIENT)
    target_link_libraries(mindspore graph)
 else()
    add_compile_definitions(NO_GE_CLIENT)
 endif()

 if(ENABLE_D)
@@ -288,8 +303,6 @@ endif()
 set(PYTHON_MODULE_SOURCE
        pipeline/init.cc
        kernel/oplib/oplib.cc
        kernel/akg/akgkernelbuild.cc
        kernel/akg/akg_kernel_attrs_process.cc
    ${MS_STEPS_SRC_LIST} ${MS_CCE_SRC_LIST} ${MS_AICPU_SRC_LIST} ${MS_TASKINFO_LIST} ${MS_RT_SRC_LIST}
    ${GPU_NCCL_LIST} ${MS_HCCL_SRC_LIST} ${MS_PREDICT_SRC_LIST} ${CPU_SRC_LIST} ${MEM_REUSE_SRC_LIST} ${GPU_KERNEL_SRC_LIST})

@@ -350,6 +363,7 @@ if(ENABLE_GPU)
    assign_source_group("Include" ${GROUP_INCLUDE})

    file(GLOB COMPILER_SRCS
        "pre_activate/gpu/*.cc"
        ${TVM_DIR}/src/api/*.cc
        ${TVM_DIR}/src/arithmetic/*.cc
        ${TVM_DIR}/src/autotvm/*.cc
--- a/mindspore/ccsrc/debug/e2e_dump.cc
+++ b/mindspore/ccsrc/debug/e2e_dump.cc
@@ -49,7 +49,7 @@ bool Dump::IsKernelNeedDump(const std::string& kernel_name) {
  return false;
 }

 bool Dump::ParseDumpConfig(const string& dump_config_file) {
 bool Dump::ParseDumpConfig(const std::string& dump_config_file) {
  std::ifstream jsonFile(dump_config_file);
  if (!jsonFile.is_open()) {
    MS_LOG(ERROR) << dump_config_file << " open failed.";
--- a/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc
+++ b/mindspore/ccsrc/device/ascend/kernel_build_ascend.cc
@@ -94,7 +94,7 @@ static bool KernelBuildParallelCompile(const mindspore::session::KernelGraph *ke
  return ret;
 }

 static vector<int> CalCleanZerosSize(const CNodePtr &pre_node) {
 static std::vector<int> CalCleanZerosSize(const CNodePtr &pre_node) {
  MS_EXCEPTION_IF_NULL(pre_node);
  std::vector<int> clean_size_list;
  // clean output
--- a/mindspore/ccsrc/device/ascend/profiling/profiling_manager.cc
+++ b/mindspore/ccsrc/device/ascend/profiling/profiling_manager.cc
@@ -27,6 +27,7 @@
 #include "utils/log_adapter.h"
 #include "utils/context/ms_context.h"
 #include "common/utils.h"
 #include "utils/convert_utils.h"

 using std::vector;
 using Json = nlohmann::json;
--- a/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc
+++ b/mindspore/ccsrc/device/ascend/tasksink/task_generator.cc
@@ -121,8 +121,8 @@ bool TaskGenerator::LaunchKernel(const CNodePtr &anf_node_ptr, uint32_t stream_i
    LaunchAddrCleanKernel(anf_node_ptr, &kernel_inputs);
  }

  std::vector<TaskInfoPtr> task_info_ptrs =
    kernel_mod->GenTask(kernel_inputs, kernel_workspaces, kernel_outputs, stream_id);
  std::vector<TaskInfoPtr> task_info_ptrs = dynamic_cast<kernel::AscendKernelMod *>(kernel_mod)
                                              ->GenTask(kernel_inputs, kernel_workspaces, kernel_outputs, stream_id);
  task_info_list->insert(task_info_list->end(), task_info_ptrs.begin(), task_info_ptrs.end());
  return true;
 }
--- a/mindspore/ccsrc/device/ascend/tasksink/task_generator.h
+++ b/mindspore/ccsrc/device/ascend/tasksink/task_generator.h
@@ -24,7 +24,7 @@
 #include <vector>
 #include "device/kernel_runtime.h"
 #include "ir/anf.h"
 #include "kernel/kernel.h"
 #include "kernel/ascend_kernel_mod.h"
 #include "framework/ge_runtime/task_info.h"

 namespace mindspore {
--- a/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc
+++ b/mindspore/ccsrc/device/gpu/gpu_kernel_build.cc
@@ -21,7 +21,6 @@
 #include "kernel/gpu/gpu_kernel_factory.h"
 #include "operator/ops.h"
 #include "pybind11/stl.h"
 #include "transform/convert.h"
 #include "session/anf_runtime_algorithm.h"
 namespace mindspore {
 namespace device {
--- a/mindspore/ccsrc/device/gpu/kernel_info_setter.cc
+++ b/mindspore/ccsrc/device/gpu/kernel_info_setter.cc
@@ -91,7 +91,7 @@ std::string SupportedTypeList(const CNodePtr& kernel_node) {
  return supported_type_lists;
 }

 bool SelectAkgKernel(const CNodePtr& kernel_node, const shared_ptr<KernelBuildInfo>& selected_kernel_info) {
 bool SelectAkgKernel(const CNodePtr& kernel_node, const std::shared_ptr<KernelBuildInfo>& selected_kernel_info) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(selected_kernel_info);
  std::vector<std::shared_ptr<KernelBuildInfo>> kernel_info_list;
--- a/mindspore/ccsrc/device/kernel_adjust.cc
+++ b/mindspore/ccsrc/device/kernel_adjust.cc
@@ -32,6 +32,7 @@
 #include "device/ascend/profiling/profiling_manager.h"
 #include "device/ascend/kernel_select_ascend.h"
 #include "device/kernel_info.h"
 #include "runtime/base.h"

 constexpr auto kLoopCountParamName = "loop_count";
 constexpr auto kIterLoopParamName = "iter_loop";
--- a/mindspore/ccsrc/ir/anf.cc
+++ b/mindspore/ccsrc/ir/anf.cc
@@ -197,6 +197,23 @@ PrimitivePtr GetCNodePrimitive(const AnfNodePtr& node) {
  return nullptr;
 }

 std::string GetCNodeFuncName(const CNodePtr cnode) {
  if (cnode->inputs().empty()) {
    return "";
  }

  AnfNodePtr valuenode = cnode->input(0);
  if (valuenode->isa<ValueNode>()) {
    auto value = GetValueNode(valuenode);
    // check whether the valuenode is primitive
    if (value->isa<Primitive>()) {
      return value->cast<PrimitivePtr>()->name();
    }
    return value->ToString();
  }
  return "";
 }

 bool IsPrimitive(const AnfNodePtr& node, const PrimitivePtr& value) {
  if (IsValueNode<Primitive>(node)) {
    PrimitivePtr fn_value = GetValueNode<PrimitivePtr>(node);
--- a/mindspore/ccsrc/ir/anf.h
+++ b/mindspore/ccsrc/ir/anf.h
@@ -384,6 +384,8 @@ static S GetValue(const ValuePtr &value) {
  return v;
 }

 std::string GetCNodeFuncName(CNodePtr cnode);

 // used to check whether an AnfNode is a cnode with a kind of Primitive as first input
 bool IsPrimitiveCNode(const AnfNodePtr &node, const PrimitivePtr &value);

--- a/mindspore/ccsrc/ir/meta_tensor.cc
+++ b/mindspore/ccsrc/ir/meta_tensor.cc
@@ -25,7 +25,6 @@
 #include "device/device_address.h"
 #include "pybind_api/api_register.h"
 #include "pybind_api/export_flags.h"
 #include "pynative/pynative_execute.h"
 #include "pipeline/static_analysis/abstract_value.h"

 namespace mindspore {
--- a/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.h
+++ b/mindspore/ccsrc/kernel/aicpu/aicpu_kernel_mod.h
@@ -18,11 +18,11 @@
 #include <vector>
 #include <memory>
 #include <string>
 #include "kernel/kernel.h"
 #include "kernel/ascend_kernel_mod.h"
 #include "kernel/aicpu/aicpu_util.h"
 namespace mindspore {
 namespace kernel {
 class AicpuOpKernelMod : public KernelMod {
 class AicpuOpKernelMod : public AscendKernelMod {
 public:
  AicpuOpKernelMod();
  ~AicpuOpKernelMod() override;
--- a/mindspore/ccsrc/kernel/akg/akgkernelbuild.cc
+++ b/mindspore/ccsrc/kernel/akg/akgkernelbuild.cc
@@ -35,7 +35,6 @@
 #include "utils/convert_utils.h"
 #include "utils/any.h"
 #include "utils/utils.h"
 #include "transform/convert.h"
 #include "session/anf_runtime_algorithm.h"
 #include "kernel/akg/akg_kernel_attrs_process.h"

@@ -240,8 +239,8 @@ bool AkgKernelBuild::CreateOutputDescJson(const AnfNodePtr &anf_node, nlohmann::
  return true;
 }

 void GetJson(const AnfNodePtr &anf_node, const vector<int> &dyn_input_sizes, const shared_ptr<OpAttr> &op_attr,
             nlohmann::json *const attr_json, const ValuePtr &attr_value) {
 void GetJson(const AnfNodePtr &anf_node, const std::vector<int> &dyn_input_sizes,
             const std::shared_ptr<OpAttr> &op_attr, nlohmann::json *const attr_json, const ValuePtr &attr_value) {
  MS_EXCEPTION_IF_NULL(anf_node);
  MS_EXCEPTION_IF_NULL(op_attr);
  MS_EXCEPTION_IF_NULL(attr_json);
--- a/mindspore/ccsrc/kernel/ascend_kernel_mod.h
+++ b/mindspore/ccsrc/kernel/ascend_kernel_mod.h
@@ -0,0 +1,36 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_ASCEND_KERNEL_MOD_H_
 #define MINDSPORE_CCSRC_KERNEL_ASCEND_KERNEL_MOD_H_

 #include <vector>
 #include <memory>
 #include "framework/ge_runtime/task_info.h"
 #include "kernel/kernel.h"

 using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
 namespace mindspore {
 namespace kernel {
 // Abstract extension of KernelMod that adds a task-generation entry point.
 // Concrete subclasses must implement GenTask.
 class AscendKernelMod : public KernelMod {
 public:
  // Builds the list of task-info objects for this kernel.
  //   inputs    - addresses of the kernel inputs
  //   workspace - addresses of the kernel workspaces
  //   outputs   - addresses of the kernel outputs
  //   stream_id - identifier of the stream the tasks are generated for
  // Returns the generated TaskInfoPtr objects.
  virtual std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &inputs,
                                           const std::vector<AddressPtr> &workspace,
                                           const std::vector<AddressPtr> &outputs, uint32_t stream_id) = 0;
 };
 }  // namespace kernel
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_KERNEL_ASCEND_KERNEL_MOD_H_
--- a/mindspore/ccsrc/kernel/common_utils.cc
+++ b/mindspore/ccsrc/kernel/common_utils.cc
@@ -19,7 +19,6 @@
 #include <map>
 #include <iostream>
 #include <fstream>
 #include "runtime/rt.h"
 #include "nlohmann/json.hpp"
 #include "session/anf_runtime_algorithm.h"
 #include "common/utils.h"
@@ -490,7 +489,7 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info) {
  if (!filewrite.is_open()) {
    return;
  }
  filewrite << info << endl;
  filewrite << info << std::endl;
  filewrite.close();
  if (nullptr == realpath(path.c_str(), real_path)) {
    MS_LOG(DEBUG) << "dir " << path << " does not exit.";
--- a/mindspore/ccsrc/kernel/gpu/nn/lstm_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/lstm_gpu_kernel.h
@@ -226,12 +226,12 @@ class LstmGpuKernel : public GpuKernel {
  size_t reserved_size_;

  // input desc
  unique_ptr<cudnnTensorDescriptor_t[]> x_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> x_desc_;
  cudnnTensorDescriptor_t hx_desc_;
  cudnnTensorDescriptor_t cx_desc_;
  cudnnFilterDescriptor_t w_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;
  unique_ptr<cudnnTensorDescriptor_t[]> y_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> y_desc_;
  cudnnTensorDescriptor_t hy_desc_;
  cudnnTensorDescriptor_t cy_desc_;
  cudnnRNNDescriptor_t rnn_desc_;
--- a/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_data_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_data_gpu_kernel.h
@@ -258,8 +258,8 @@ class LstmGradDataGpuKernel : public GpuKernel {
  cudnnRNNDescriptor_t rnn_desc_;

  // input desc
  unique_ptr<cudnnTensorDescriptor_t[]> y_desc_;
  unique_ptr<cudnnTensorDescriptor_t[]> dy_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> y_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> dy_desc_;
  cudnnTensorDescriptor_t dhy_desc_;
  cudnnTensorDescriptor_t dcy_desc_;
  cudnnFilterDescriptor_t w_desc_;
@@ -269,7 +269,7 @@ class LstmGradDataGpuKernel : public GpuKernel {
  cudnnDropoutDescriptor_t dropout_desc_;

  // output desc
  unique_ptr<cudnnTensorDescriptor_t[]> dx_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> dx_desc_;
  cudnnTensorDescriptor_t dhx_desc_;
  cudnnTensorDescriptor_t dcx_desc_;

--- a/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_weight_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/lstm_grad_weight_gpu_kernel.h
@@ -214,9 +214,9 @@ class LstmGradWeightGpuKernel : public GpuKernel {
  cudnnDropoutDescriptor_t dropout_desc_;

  // input desc
  unique_ptr<cudnnTensorDescriptor_t[]> x_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> x_desc_;
  cudnnTensorDescriptor_t hx_desc_;
  unique_ptr<cudnnTensorDescriptor_t[]> y_desc_;
  std::unique_ptr<cudnnTensorDescriptor_t[]> y_desc_;

  // output desc
  cudnnFilterDescriptor_t dw_desc_;
--- a/mindspore/ccsrc/kernel/hccl/hccl_kernel.h
+++ b/mindspore/ccsrc/kernel/hccl/hccl_kernel.h
@@ -23,14 +23,14 @@
 #include <vector>
 #include <algorithm>
 #include <utility>
 #include "kernel/kernel.h"
 #include "kernel/ascend_kernel_mod.h"
 #include "kernel/hccl/hcom_util.h"
 #include "hccl/hcom.h"
 #include "common/utils.h"

 namespace mindspore {
 namespace kernel {
 class HcclKernel : public KernelMod {
 class HcclKernel : public AscendKernelMod {
 public:
  HcclKernel();
  ~HcclKernel() override;
--- a/mindspore/ccsrc/kernel/kernel.h
+++ b/mindspore/ccsrc/kernel/kernel.h
@@ -25,7 +25,6 @@
 #include "ir/meta_tensor.h"
 #include "pipeline/static_analysis/dshape.h"
 #include "utils/log_adapter.h"
 #include "framework/ge_runtime/task_info.h"

 namespace mindspore {
 enum KernelType : int { UNKNOWN_KERNEL_TYPE = 0, AUTO_DIFF_KERNEL, AICPU_KERNEL, RT_KERNEL, HCCL_KERNEL, TBE_KERNEL };
@@ -111,7 +110,6 @@ struct Address {
  size_t size;
 };
 using AddressPtr = std::shared_ptr<Address>;
 using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;

 class KernelMod {
 public:
@@ -120,10 +118,6 @@ class KernelMod {
  virtual const std::vector<size_t> &GetWorkspaceSizeList() const = 0;
  virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                      const std::vector<AddressPtr> &outputs, uintptr_t stream_ptr) = 0;
  virtual std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                           const std::vector<AddressPtr> &, uint32_t) {
    return {};
  }
  virtual std::vector<size_t> GenParameters() { return {}; }

  virtual ~KernelMod() = default;
--- a/mindspore/ccsrc/kernel/mng/rt_kernel.h
+++ b/mindspore/ccsrc/kernel/mng/rt_kernel.h
@@ -22,12 +22,12 @@
 #include <memory>
 #include <map>
 #include <string>
 #include "kernel/kernel.h"
 #include "kernel/ascend_kernel_mod.h"
 #include "kernel/task_stream.h"

 namespace mindspore {
 namespace kernel {
 class RtKernel : public KernelMod {
 class RtKernel : public AscendKernelMod {
 public:
  RtKernel();
  ~RtKernel() override;
--- a/mindspore/ccsrc/kernel/oplib/oplib.cc
+++ b/mindspore/ccsrc/kernel/oplib/oplib.cc
@@ -19,7 +19,7 @@
 #include <unordered_map>
 #include <memory>
 #include "utils/log_adapter.h"
 #include "kernel/oplib/opinfo.h"
 #include "utils/overload.h"
 #include "utils/context/ms_context.h"

 namespace mindspore {
@@ -50,7 +50,7 @@ constexpr auto kNeedCompile = "need_compile";
 constexpr auto kShape = "shape";
 std::vector<std::shared_ptr<OpInfo>> OpLib::op_info_;

 string ImplTypeToStr(OpImplyType impl_type) {
 std::string ImplTypeToStr(OpImplyType impl_type) {
  switch (impl_type) {
    case kTBE:
      return kTbe;
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.h
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_build.h
@@ -48,7 +48,7 @@ class TbeKernelBuild {
 private:
  TbeKernelBuild() = default;
  ~TbeKernelBuild() = default;
  static bool GenFusionDataInputJson(const shared_ptr<mindspore::AnfNode> &data_input, nlohmann::json *data_str,
  static bool GenFusionDataInputJson(const std::shared_ptr<mindspore::AnfNode> &data_input, nlohmann::json *data_str,
                                     size_t *index);
  static bool GenFusionComputeJson(const mindspore::AnfNodePtr &compute_node,
                                   std::vector<std::vector<mindspore::AnfNodePtr>>::iterator *layer_iter,
@@ -56,12 +56,13 @@ class TbeKernelBuild {
  static bool GenFusionComputeInputeJson(const mindspore::CNodePtr &cnode,
                                         std::vector<std::vector<mindspore::AnfNodePtr>>::iterator *layer_iter,
                                         std::vector<nlohmann::json> *input_desc_list, size_t *index);
  static void GenDescJson(const shared_ptr<mindspore::AnfNode> &anf_node, size_t out_idx, nlohmann::json *output_desc);
  static void GenReusedOutputDesc(const shared_ptr<mindspore::AnfNode> &anf_node, size_t index, size_t output_index,
                                  nlohmann::json *output_desc);
  static void GenDescJson(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t out_idx,
                          nlohmann::json *output_desc);
  static void GenReusedOutputDesc(const std::shared_ptr<mindspore::AnfNode> &anf_node, size_t index,
                                  size_t output_index, nlohmann::json *output_desc);
  static size_t GetIOSizeImpl(const nlohmann::json &desc);
  static bool GetInputLayers(const vector<mindspore::AnfNodePtr> &input_nodes,
                             const vector<mindspore::AnfNodePtr> &compute_nodes,
  static bool GetInputLayers(const std::vector<mindspore::AnfNodePtr> &input_nodes,
                             const std::vector<mindspore::AnfNodePtr> &compute_nodes,
                             std::vector<std::vector<mindspore::AnfNodePtr>> *input_layers);
  static bool IsDynamicInput(const CNodePtr &cnode);
  static size_t GetOptionalInput(const CNodePtr &cnode, bool is_dynamic_input);
@@ -82,15 +83,17 @@ class TbeKernelJsonCreator {
  bool GenTbeAttrJson(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<OpInfo> &op_info,
                      nlohmann::json *attrs_json);
  void ParseAttrValue(const std::string &type, const ValuePtr &value, nlohmann::json *attr_obj);
  bool GenInputDescJson(const shared_ptr<AnfNode> &anf_node, size_t real_input_index, bool value,
                        const shared_ptr<OpIOInfo> &input_ptr, const string &op_input_name, size_t input_i,
                        vector<nlohmann::json> *input_list);
  bool GenOutputDescJson(const shared_ptr<AnfNode> &anf_node, const vector<std::shared_ptr<OpIOInfo>> &outputs_ptr,
                         nlohmann::json *outputs_json);
  bool GenInputList(const shared_ptr<AnfNode> &anf_node, size_t input_tensor_num, const shared_ptr<OpIOInfo> &input_ptr,
                    size_t *real_input_index, string *op_input_name, vector<nlohmann::json> *input_list);
  void GenOutputList(const shared_ptr<AnfNode> &anf_node, const size_t &output_obj_num,
                     const shared_ptr<OpIOInfo> &output_ptr, size_t *output_idx, vector<nlohmann::json> *output_list);
  bool GenInputDescJson(const std::shared_ptr<AnfNode> &anf_node, size_t real_input_index, bool value,
                        const std::shared_ptr<OpIOInfo> &input_ptr, const string &op_input_name, size_t input_i,
                        std::vector<nlohmann::json> *input_list);
  bool GenOutputDescJson(const std::shared_ptr<AnfNode> &anf_node,
                         const std::vector<std::shared_ptr<OpIOInfo>> &outputs_ptr, nlohmann::json *outputs_json);
  bool GenInputList(const std::shared_ptr<AnfNode> &anf_node, size_t input_tensor_num,
                    const std::shared_ptr<OpIOInfo> &input_ptr, size_t *real_input_index, string *op_input_name,
                    std::vector<nlohmann::json> *input_list);
  void GenOutputList(const std::shared_ptr<AnfNode> &anf_node, const size_t &output_obj_num,
                     const std::shared_ptr<OpIOInfo> &output_ptr, size_t *output_idx,
                     std::vector<nlohmann::json> *output_list);
  kCreaterType creater_type_;
  std::string json_name_;
  std::string json_info_;
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_mod.h
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_mod.h
@@ -21,12 +21,12 @@
 #include <string>
 #include <vector>
 #include <utility>
 #include "kernel/kernel.h"
 #include "kernel/ascend_kernel_mod.h"
 #include "kernel/tbe/tbe_utils.h"

 namespace mindspore {
 namespace kernel {
 class TbeKernelMod : public KernelMod {
 class TbeKernelMod : public AscendKernelMod {
 public:
  explicit TbeKernelMod(KernelPackPtr kernel_pack) : kernel_pack_(std::move(kernel_pack)) {}
  ~TbeKernelMod() override = default;
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.h
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_parallel_build.h
@@ -55,8 +55,9 @@ class ParallelBuildManager {
  bool WaitOne(int *task_id, char **task_result) const;
  bool IsAllTaskFinish() const;
  std::pair<int32_t, KernelModPtr> TaskFinishProcess(int32_t task_id, bool set_kernel_mod = true);
  KernelModPtr GenKernelMod(const string &json_name, const string &processor, const vector<size_t> &input_size_list,
                            const vector<size_t> &output_size_list, const KernelPackPtr &kernel_pack) const;
  KernelModPtr GenKernelMod(const string &json_name, const string &processor,
                            const std::vector<size_t> &input_size_list, const std::vector<size_t> &output_size_list,
                            const KernelPackPtr &kernel_pack) const;

 private:
  PyObject *tbe_parallel_compiler_;
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc
@@ -168,7 +168,7 @@ bool ParseDynamicFormatJson(const std::string &jsonStr, std::vector<std::shared_
  return true;
 }

 std::string OpSelectFormat(const shared_ptr<AnfNode> &anf_node) {
 std::string OpSelectFormat(const std::shared_ptr<AnfNode> &anf_node) {
  nlohmann::json kernel_json;
  std::string res_json_str;
  TbeKernelJsonCreator creator(OP_SELECT_FORMAT);
@@ -182,7 +182,7 @@ std::string OpSelectFormat(const shared_ptr<AnfNode> &anf_node) {
  return res_json_str;
 }

 void SetTidyInputsInfo(const shared_ptr<AnfNode> &anf_node,
 void SetTidyInputsInfo(const std::shared_ptr<AnfNode> &anf_node,
                       const std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> &builder,
                       const std::vector<std::shared_ptr<OpIOInfo>> &inputs) {
  std::vector<TypeId> inputs_type;
@@ -231,7 +231,7 @@ void SetTidyInputsInfo(const shared_ptr<AnfNode> &anf_node,
  builder->SetInputsFormat(inputs_format);
 }

 void SetTidyOutputsInfo(const shared_ptr<AnfNode> &anf_node,
 void SetTidyOutputsInfo(const std::shared_ptr<AnfNode> &anf_node,
                        const std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> &builder,
                        const std::vector<std::shared_ptr<OpIOInfo>> &outputs) {
  std::vector<TypeId> outputs_type;
@@ -268,7 +268,8 @@ void SetTidyOutputsInfo(const shared_ptr<AnfNode> &anf_node,
  builder->SetOutputsFormat(outputs_format);
 }

 void GenTidyKernelBuildInfo(const shared_ptr<AnfNode> &anf_node, const std::vector<std::shared_ptr<OpIOInfo>> &inputs,
 void GenTidyKernelBuildInfo(const std::shared_ptr<AnfNode> &anf_node,
                            const std::vector<std::shared_ptr<OpIOInfo>> &inputs,
                            const std::vector<std::shared_ptr<OpIOInfo>> &outputs) {
  auto builder_tmp = std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>();
  builder_tmp->SetKernelType(TBE_KERNEL);
--- a/mindspore/ccsrc/kernel/tbe/tbe_utils.cc
+++ b/mindspore/ccsrc/kernel/tbe/tbe_utils.cc
@@ -26,6 +26,7 @@
 #include <iostream>
 #include <fstream>

 #include "runtime/kernel.h"
 #include "kernel/oplib/oplib.h"
 #include "utils/utils.h"
 #include "session/anf_runtime_algorithm.h"
--- a/mindspore/ccsrc/pipeline/base.h
+++ b/mindspore/ccsrc/pipeline/base.h
@@ -0,0 +1,64 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PIPELINE_BASE_H_
 #define MINDSPORE_CCSRC_PIPELINE_BASE_H_

 #include <mutex>
 #include <memory>
 #include <string>
 #include <sstream>

 #include "ir/anf.h"
 #include "pipeline/resource.h"
 #include "utils/context/ms_context.h"

 namespace mindspore {
 namespace pipeline {

 // Groups a function graph, its associated resource, and an argument-list size.
 // NOTE(review): how the fields are populated is not visible in this header — confirm against users.
 struct ExecutorInfo {
  FuncGraphPtr func_graph;
  ResourcePtr resource;
  // NOTE(review): has no default initializer here; presumably set by the code that fills the struct — confirm.
  std::size_t arg_list_size;
 };

 using ExecutorInfoPtr = std::shared_ptr<ExecutorInfo>;

 // Returns the part of `phase` that precedes its first '.'.
 // When `phase` contains no '.', the problem is reported through MS_LOG(EXCEPTION)
 // (presumably this throws and does not return — confirm against the logging macro).
 inline std::string GetPhasePrefix(const std::string& phase) {
  auto pos = phase.find('.');
  if (pos == std::string::npos) {
    // Keep a separator between the fixed text and the offending value so the message stays readable.
    MS_LOG(EXCEPTION) << "phase has no . for prefix: " << phase;
  }
  return phase.substr(0, pos);
 }

 // Builds "<save_graphs_path>/<file_name>" using the save-graphs path read from
 // MsContext::GetInstance(). An empty configured path is replaced by ".".
 // A null context is reported through MS_LOG(EXCEPTION).
 inline std::string GetFilePathName(const std::string& file_name) {
  auto context = MsContext::GetInstance();
  if (context == nullptr) {
    MS_LOG(EXCEPTION) << "ms_context is nullptr";
  }

  // Fall back to the current directory when no path has been configured.
  auto base_dir = context->save_graphs_path();
  if (base_dir.empty()) {
    base_dir = ".";
  }

  std::ostringstream path_stream;
  path_stream << base_dir << "/" << file_name;
  return path_stream.str();
 }
 }  // namespace pipeline
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_PIPELINE_BASE_H_
--- a/mindspore/ccsrc/pipeline/init.cc
+++ b/mindspore/ccsrc/pipeline/init.cc
@@ -73,7 +73,7 @@ PYBIND11_MODULE(_c_expression, m) {
         "Get CNode Strategy Dictionary.")
    .def("get_allreduce_fusion", &ExecutorPy::GetAllreduceFusion, py::arg("phase") = py::str("train"),
         "Get Allreduce Fusion Dictionary.")
    .def("build_data_graph", &ExecutorPy::BuildDFGraph, py::arg("build_params"), py::arg("phase") = py::str("train"),
    .def("build_data_graph", &ExecutorPy::BuildGraph, py::arg("build_params"), py::arg("phase") = py::str("train"),
         py::arg("broadcast_params") = py::dict(), "Build data graph.")
    .def("has_compiled", &ExecutorPy::HasCompiled, py::arg("phase") = py::str(""), "get if cell compiled.")
    .def("run_init_graph", &ExecutorPy::RunInitGraph, "Run init Graph.");
@@ -86,19 +86,17 @@ PYBIND11_MODULE(_c_expression, m) {

  (void)m.def("generate_key", &mindspore::pipeline::GenerateKey, "Generate the function graph key.");
  (void)m.def("real_run_op", &mindspore::pynative::RunOp, "Run op pynatively.");
  (void)m.def("initialize_distribute", &mindspore::pipeline::InitDistribute, "Initialize for Distribute.")
    .def("init_ge", &mindspore::pipeline::InitGe, "Init GE");
  (void)m.def("reset_op_id", &mindspore::pipeline::ResetOpId, "Reset Operator Id");
  (void)m.def("init_hccl", &mindspore::pipeline::InitHccl, "Init Hccl");
  (void)m.def("finalize_ge", &mindspore::pipeline::FinalizeGe, "Finalize Ge");
  (void)m.def("finalize_hccl", &mindspore::pipeline::FinalizeHccl, "Finalize Hccl");
  (void)m.def("set_ge_option", &mindspore::pipeline::SetGeOption, "API for set ge option.");
  (void)m.def("verify_inputs_signature", &mindspore::pipeline::VerifyInputSignature, "Verify input signature.");
  (void)m.def("init_exec_dataset", &mindspore::pipeline::InitExecDataset, py::arg("queue_name"), py::arg("size"),
              py::arg("batch_size"), py::arg("types"), py::arg("shapes"), py::arg("input_indexs"),
              py::arg("phase") = py::str("dataset"), "Init and exec dataset.");
  (void)m.def("_set_dataset_mode_config", &mindspore::ConfigManager::SetDatasetModeConfig, "API for set dataset mode.");
  (void)m.def("export_graph", &mindspore::pipeline::ExportDFGraph, "Export Graph.");
  (void)m.def("init_ge", &mindspore::pipeline::InitGe, "Init GE");

  (void)m.def("export_graph", &mindspore::pipeline::ExportGraph, "Export Graph.");

  (void)py::class_<mindspore::MsContext, std::shared_ptr<mindspore::MsContext>>(m, "MSContext")
    .def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.")
--- a/mindspore/ccsrc/pipeline/parse/python_adapter.cc
+++ b/mindspore/ccsrc/pipeline/parse/python_adapter.cc
@@ -27,6 +27,7 @@ static std::shared_ptr<py::scoped_interpreter> scoped_ = nullptr;
 //  true: start process from python, false: start process from c++
 static bool python_env_ = false;
 static bool use_signature_in_resolve_ = true;
 void ResetPythonScope() { scoped_ = nullptr; }
 void set_use_signature_in_resolve(bool use_signature) noexcept { use_signature_in_resolve_ = use_signature; }
 bool UseSignatureInResolve() { return use_signature_in_resolve_; }
 void set_python_env_flag(bool python_env) noexcept { python_env_ = python_env; }
--- a/mindspore/ccsrc/pipeline/parse/python_adapter.h
+++ b/mindspore/ccsrc/pipeline/parse/python_adapter.h
@@ -55,6 +55,7 @@ void set_use_signature_in_resolve(bool use_signature) noexcept;
 bool UseSignatureInResolve();

 std::shared_ptr<py::scoped_interpreter> set_python_scoped();
 void ResetPythonScope();
 bool IsPythonEnv();
 void SetPythonPath(const std::string& path);
 void set_python_env_flag(bool python_env) noexcept;
--- a/mindspore/ccsrc/pipeline/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/pipeline.cc
@@ -27,11 +27,6 @@
 #include "pipeline/pass.h"
 #include "pipeline/parse/data_converter.h"
 #include "optimizer/ad/dfunctor.h"
 #include "ir/meta_tensor.h"
 #include "transform/convert.h"
 #include "transform/df_graph_manager.h"
 #include "transform/graph_builder.h"
 #include "transform/graph_runner.h"
 #include "debug/anf_ir_dump.h"
 #include "debug/anf_ir_utils.h"
 #include "utils/config_manager.h"
@@ -44,6 +39,12 @@
 #include "device/kernel_runtime_manager.h"
 #include "debug/trace.h"

 #if (ENABLE_GE || ENABLE_D)
 #include "pipeline/pipeline_ge.h"
 #include "transform/convert.h"
 #include "transform/df_graph_manager.h"
 #endif

 namespace mindspore {
 // namespace to support intermediate representation definition
 namespace pipeline {
@@ -54,12 +55,6 @@ using mindspore::abstract::AbstractTensor;
 using mindspore::abstract::AbstractTensorPtr;
 using mindspore::abstract::AbstractTuple;
 using mindspore::abstract::AbstractTuplePtr;
 using mindspore::transform::DfGraphConvertor;
 using mindspore::transform::DfGraphManager;
 using mindspore::transform::GeTensorPtr;
 using mindspore::transform::MeTensorPtr;
 using mindspore::transform::Status;
 using mindspore::transform::TransformUtil;

 const char IR_TYPE_ANF[] = "anf_ir";
 const char IR_TYPE_ONNX[] = "onnx_ir";
@@ -85,65 +80,8 @@ std::string GetBaseNameForIR(int stage_idx, const std::string& action_name) {
  oss << save_graphs_path << "/" << stage_idx << "_" << action_name;
  return oss.str();
 }

 // Builds "<save_graphs_path>/<file_name>". The directory is read from the MsContext singleton
 // and falls back to "." when the configured path is empty.
 std::string GetFilePathName(const std::string& file_name) {
  auto context = MsContext::GetInstance();
  if (context == nullptr) {
    MS_LOG(EXCEPTION) << "ms_context is nullptr";
  }

  auto graphs_dir = context->save_graphs_path();
  if (graphs_dir.empty()) {
    // No directory configured: write next to the current working directory.
    graphs_dir = ".";
  }

  std::ostringstream path_stream;
  path_stream << graphs_dir << "/" << file_name;
  return path_stream.str();
 }
 }  // namespace

 // We will not execute graph when output is constant or just input itself.
 // Short-circuits execution when the graph output needs no computation.
 //
 // Returns true and stores the result in *ret_val when `output` is a ValueNode (its value is
 // converted with ValuePtrToPyData) or a Parameter (the element of `args` at the parameter's
 // position in the owning graph's parameter list). Returns false for any other node kind and
 // leaves *ret_val untouched.
 //
 // NOTE(review): `output` and `ret_val` are dereferenced without a null check; callers
 // presumably guarantee both are non-null - confirm.
 static bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr& output, const py::tuple& args,
                                               const std::shared_ptr<py::object>& ret_val) {
  if (output->isa<ValueNode>()) {
    MS_LOG(INFO) << "Graph's output is a constant. No need to execute.";
    ValuePtr value = GetValueNode(output);
    *ret_val = ValuePtrToPyData(value);
    return true;
  }

  // Adapter will transform values in __init__() and construct() to parameters, this could cause
  // inputs (a.k.a args in current function) size less than parameters'.
  if (output->isa<Parameter>()) {
    MS_LOG(INFO) << "Graph's output is a parameter. If all params are inputs, no need to execute.";
    // NOTE(review): the messages below say "let graph to be executed", yet they are emitted at
    // EXCEPTION level instead of returning false - confirm which behaviour is intended.
    if (args.empty()) {
      MS_LOG(EXCEPTION) << "Inputs size is 0, let graph to be executed.";
    }
    // Find the right parameter as ret_val.
    auto func_graph = output->func_graph();
    MS_EXCEPTION_IF_NULL(func_graph);
    auto params = func_graph->parameters();
    if (params.empty()) {
      MS_EXCEPTION(UnknownError) << "Graph's parameters size is 0";
    }
    if (args.size() != params.size()) {
      MS_LOG(EXCEPTION) << "Input size " << args.size() << " not equal to params size " << params.size()
                        << ", let graph to be executed.";
    }

    // Locate `output` among the graph parameters; its position selects the matching argument.
    auto it = std::find(params.begin(), params.end(), output);
    if (it == params.end()) {
      MS_EXCEPTION(UnknownError) << "When graph output is Parameter,  it should be found in graph parameters";
    }
    size_t index = it - params.cbegin();
    // With args.size() == params.size() checked above and `it` found, this bound check looks
    // purely defensive.
    if (index >= args.size()) {
      MS_EXCEPTION(UnknownError) << "Index " << index << " equal or larger than args size " << args.size() << ".";
    }
    *ret_val = args[index];
    return true;
  }
  return false;
 }

 py::tuple GenerateKey(const std::string& name, const std::unordered_map<std::string, py::object>& defaults) {
  MS_LOG(DEBUG) << "GenerateKey args size:" << defaults.size();
  abstract::AbstractBasePtrList args_spec;
@@ -207,11 +145,7 @@ py::bool_ VerifyInputSignature(const py::list input_signature, const py::tuple i
  return true;
 }

 ExecutorPy::ExecutorPy() {
  // because Ge only support one Session exist at the same time ,so we delete the old one
  DfGraphManager::GetInstance().DeleteGraphRunner();
  DfGraphManager::GetInstance().DeleteGeSession();
 }
 ExecutorPy::ExecutorPy() {}

 ResourcePtr ExecutorPy::GetResource(const std::string& phase) {
  MS_LOG(DEBUG) << "phase size:" << info_.size();
@@ -221,14 +155,6 @@ ResourcePtr ExecutorPy::GetResource(const std::string& phase) {
  return info_[phase]->resource;
 }

 // Returns the part of `phase` that precedes its first '.'. A phase without any '.' is
 // reported through MS_LOG(EXCEPTION).
 std::string GetPhasePrefix(const std::string& phase) {
  const std::string::size_type dot_index = phase.find('.');
  if (dot_index == std::string::npos) {
    MS_LOG(EXCEPTION) << "phase has no . for prefix" << phase;
  }
  // Copy the first dot_index characters, i.e. everything before the separator.
  return std::string(phase, 0, dot_index);
 }

 FuncGraphPtr ExecutorPy::GetFuncGraph(const std::string& phase) {
  if (info_.count(phase) == 0) {
    MS_LOG(EXCEPTION) << "no phase in executor:" << GetPhasePrefix(phase);
@@ -323,11 +249,15 @@ void ExecutorPy::DelNetRes(const std::string& id) {
      }
    }

    MS_LOG(INFO) << "Delete flag:" << flag;
 #ifdef ENABLE_GE
    if (flag && info_.size() == 0) {
      DfGraphManager::GetInstance().DeleteGraphRunner();
      DfGraphManager::GetInstance().EraseAnfGraph();
      DfGraphManager::GetInstance().DeleteGeSession();
      // because Ge only support one Session exist at the same time ,so we delete the old one
      transform::DfGraphManager::GetInstance().DeleteGraphRunner();
      transform::DfGraphManager::GetInstance().EraseAnfGraph();
      transform::DfGraphManager::GetInstance().DeleteGeSession();
    }
 #endif
  }
 }

@@ -405,7 +335,8 @@ bool ExecutorPy::CompileInner(const py::object& obj, const py::tuple& args, cons

  use_vm = ChangeExportGeirUseVmFlag(use_vm, phase_s);

  if (use_vm) {
  std::string backend = MsContext::GetInstance()->backend_policy();
  if (use_vm && backend != "ge") {
    // Create backend and session
    resource->results()[kBackend] = compile::CreateBackend();
    p_actions = VmPipeline();
@@ -497,30 +428,6 @@ bool ExecutorPy::Compile(const py::object& obj, const py::tuple& args, const py:
  return ret_value;
 }

 // Stores `options` as the GE initialize options held by the ConfigManager singleton.
 // Nothing is initialized here; the map is only recorded for later use.
 void SetGeOption(const std::map<std::string, std::string>& options) {
  ConfigManager::GetInstance().set_ge_initialize_options(options);
 }

 // Switches the ConfigManager to the DISTRIBUTION parallel strategy, records `options` as the
 // GE initialize options and, when built with ENABLE_GE, calls ge::GEInitialize with them.
 //
 // Returns false only when ge::GEInitialize reports a failure; returns true otherwise,
 // including builds without ENABLE_GE where GE is never touched.
 bool InitDistribute(const std::map<std::string, std::string>& options) {
  ConfigManager::GetInstance().set_parallel_strategy(ParallelStrategy::DISTRIBUTION);
  MS_LOG(INFO) << "ME run in DISTRIBUTION strategy mode";

  SetGeOption(options);
 #ifdef ENABLE_GE
  // Read the options back from the ConfigManager instead of using `options` directly.
  auto ge_options = ConfigManager::GetInstance().ge_initialize_options();
  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    if (ge::GEInitialize(ge_options) != ge::GRAPH_SUCCESS) {
      MS_LOG(ERROR) << "Initialize GE failed!";
      return false;
    }
  }
 #endif
  // NOTE(review): this message is also logged when ENABLE_GE is not defined, i.e. when no GE
  // initialization was attempted.
  MS_LOG(DEBUG) << "Initialize Ge success";
  return true;
 }

 #ifdef ENABLE_LOAD_ANF_IR
 // get MindSpore Intermediate Representation File
 std::string GetMsIrFile(void) {
@@ -704,9 +611,25 @@ py::object ExecutorPy::Run(const py::tuple& args, const py::object& phase) {
  }
  auto phase_s = py::cast<std::string>(phase);
  std::string backend = MsContext::GetInstance()->backend_policy();
 #ifdef ENABLE_GE
  if (backend == "ge") {
    return ExecDFGraph(info_, args, phase_s);
  }
 #else
  MS_LOG(WARNING) << "In ut test " << size << phase_s;
  if (backend == "ge") {
    return ExecDFGraph(args, phase_s);
    std::shared_ptr<py::object> ret_val = std::make_shared<py::object>();
    if (info_.count(phase_s) != 0 && info_[phase_s]->func_graph != nullptr) {
      if (IsGraphOutputValueNodeOrParameter(info_[phase_s]->func_graph->output(), args, ret_val)) {
        return *ret_val;
      }
    }
    if (args.size() > 0) {
      return args[0];
    }
    return args;
  }
 #endif
  std::size_t full_arg_size = ArgListSize(phase_s);
  if (size > full_arg_size) {
    MS_LOG(WARNING) << "The arg num : size = " << size << ". full_arg_size = " << full_arg_size;
@@ -719,435 +642,25 @@ py::object ExecutorPy::Run(const py::tuple& args, const py::object& phase) {
    MS_LOG(EXCEPTION) << "Can't find run graph func for " << phase_s;
  }

  MS_LOG(DEBUG) << "eval run";
  MS_LOG(DEBUG) << "eval run" << backend;
  BaseRef value = (*run)(arg_list);
  MS_LOG(DEBUG) << "run end";
  return BaseRefToPyData(value);
 }

 // Recursively rebuilds the Python value described by `cnode_data` from the flat output
 // tuple `data`.
 //
 // `count` is the index of the next unconsumed element of `data` and is advanced by one for
 // every tensor taken:
 //   * AbstractTensor: consumes one element, whose shape must equal the shape built from the
 //     abstract;
 //   * AbstractTuple: yields a py::tuple filled element by element through recursion.
 // Any other abstract kind, a shape mismatch, or running out of elements is reported through
 // MS_LOG(EXCEPTION).
 py::object ExtractGeneralCnodeRet(const AbstractBasePtr& cnode_data, const py::tuple& data, size_t* count) {
  MS_EXCEPTION_IF_NULL(cnode_data);
  MS_EXCEPTION_IF_NULL(count);
  if (*count >= data.size()) {
    MS_LOG(EXCEPTION) << "The number of elements in the outputs : " << data.size()
                      << " less than the number of elements required. ";
  }

  if (cnode_data->isa<AbstractTensor>()) {
    BaseShapePtr shape = cnode_data->BuildShape();
    MS_EXCEPTION_IF_NULL(shape);
    // Guard the cast result before dereferencing it: the base shape is not guaranteed to be an
    // abstract::Shape (presumably cast<> yields nullptr on a type mismatch).
    auto tensor_shape = shape->cast<abstract::ShapePtr>();
    MS_EXCEPTION_IF_NULL(tensor_shape);
    auto shape_act = tensor_shape->shape();
    Tensor tensor_exp = py::cast<Tensor>(data[*count]);
    if (shape_act != tensor_exp.shape()) {
      MS_LOG(EXCEPTION) << "The shape of the tensor returned from GE is not the same as "
                           "the shape of the tensor derived from ME.";
    }
    return data[(*count)++];
  }

  if (!cnode_data->isa<AbstractTuple>()) {
    MS_LOG(EXCEPTION) << "The output of operator in the final anf graph could "
                      << "only be a tensor or a tuple of tensor, but got " << cnode_data->BuildValue()->ToString()
                      << ".";
  }
  auto data_tp = cnode_data->cast<AbstractTuplePtr>();
  auto elements = data_tp->elements();
  size_t size = data_tp->size();
  py::tuple tp = py::tuple(size);
  for (size_t i = 0; i < size; i++) {
    tp[i] = ExtractGeneralCnodeRet(elements[i], data, count);
  }
  return std::move(tp);
 }

 // Rebuilds the (possibly nested) Python output that corresponds to `output_node` from the
 // flat tuple `data`.
 //
 // `count` is a cursor into `data`; it is advanced for every element consumed, either here
 // (Parameter outputs) or inside ExtractGeneralCnodeRet. Handling per node kind:
 //   * ValueNode: returns the constant itself and consumes nothing from `data`;
 //   * Parameter: consumes exactly one element;
 //   * MakeTuple CNode: returns a py::tuple built from inputs 1..size-1;
 //   * Depend CNode: only input(1) contributes to the result;
 //   * any other CNode: delegated to ExtractGeneralCnodeRet using the node's abstract.
 // Nodes that are none of ValueNode / Parameter / CNode are reported via MS_LOG(EXCEPTION).
 py::object StructureOutput(const AnfNodePtr& output_node, const py::tuple& data, size_t* count) {
  MS_EXCEPTION_IF_NULL(output_node);

  if (output_node->isa<ValueNode>()) {
    return ValuePtrToPyData(GetValueNode(output_node));
  }

  // Every remaining case may read data[*count], so make sure an element is still available.
  if (*count >= data.size()) {
    MS_LOG(EXCEPTION) << "The number of elements in the outputs : " << data.size()
                      << " less than the number of elements required. ";
  }
  if (output_node->isa<Parameter>()) {
    return data[(*count)++];
  }

  auto output_c = output_node->cast<CNodePtr>();
  if (output_c == nullptr) {
    MS_LOG(EXCEPTION) << "The final anf graph could only have constant, parameter, and operator, but got "
                      << output_node->ToString();
  }

  if (output_c->IsApply(prim::kPrimMakeTuple)) {
    auto input_list = output_c->inputs();
    size_t size = input_list.size();
    // Input 0 is skipped (presumably the MakeTuple primitive itself), hence size - 1 slots.
    // NOTE(review): size - 1 would wrap around if the input list were ever empty.
    py::tuple tp = py::tuple(size - 1);
    for (size_t i = 1; i < size; i++) {
      tp[i - 1] = StructureOutput(input_list[i], data, count);
    }
    return std::move(tp);
  }
  if (output_c->IsApply(prim::kPrimDepend)) {
    return StructureOutput(output_c->input(1), data, count);
  }

  return ExtractGeneralCnodeRet(output_c->abstract(), data, count);
 }

 std::shared_ptr<py::object> DoExecGraph(const FuncGraphPtr& graph, const std::vector<MeTensorPtr>& inputs,
                                        const std::string& phase) {
  std::vector<GeTensorPtr> ge_tensors = TransformUtil::ConvertInputTensors(inputs, kOpFormat_NCHW);
  if (ge_tensors.size() != inputs.size()) {
    MS_LOG(ERROR) << "args convert to ge tensor error";
    return nullptr;
  }

  std::vector<GeTensorPtr> ge_outputs;
  transform::RunOptions run_options;

  run_options.name = phase;

  auto graph_runner = DfGraphManager::GetInstance().GetGraphRunner();

  if (graph_runner == nullptr) {
    MS_LOG(ERROR) << "Can not found GraphRunner";
    return nullptr;
  }

  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    MS_LOG(DEBUG) << "Run graph begin, inputs size is: " << inputs.size();
    Status ret = graph_runner->RunGraph(run_options, ge_tensors, &ge_outputs);
    MS_LOG(DEBUG) << "Run graph finish, outputs size is: " << ge_outputs.size();
    if (ret != Status::SUCCESS) {
      MS_LOG(ERROR) << "Exec graph failed";
      return nullptr;
    }
  }

  std::vector<MeTensorPtr> me_outputs = TransformUtil::ConvertGeTensors(ge_outputs);
  if (me_outputs.size() != ge_outputs.size()) {
    MS_LOG(ERROR) << "Convert output Ge tensor to Me tensor failed";
  }

  py::tuple outputs(me_outputs.size());
  for (std::size_t i = 0; i < outputs.size(); i++) {
    outputs[i] = *me_outputs[i];
  }

  std::shared_ptr<py::object> ret = nullptr;

 #ifdef ENABLE_GE
  AnfNodePtr output_node = graph->get_return()->input(1);
  MS_EXCEPTION_IF_NULL(output_node);
  size_t count = 0;
  py::object oj = StructureOutput(output_node, outputs, &count);
  ret = std::make_shared<py::object>(oj);
 FuncGraphPtr ExecutorPy::BuildGraph(const py::dict& init_params, const std::string& phase,
                                    const py::object& broadcast_params) {
 #if (ENABLE_GE || ENABLE_D)
  return BuildDFGraph(info_, init_params, phase, broadcast_params);
 #else
  if (outputs.size() == 1) {
    ret = std::make_shared<py::object>(outputs[0]);
  } else {
    ret = std::make_shared<py::object>(outputs);
  }
 #endif

  return ret;
 }

 void DoExecNonInputGraph(const std::string& phase) {
  std::vector<GeTensorPtr> ge_tensors;
  std::vector<GeTensorPtr> ge_outputs;
  transform::RunOptions run_options;
  run_options.name = phase;
  auto graph_runner = DfGraphManager::GetInstance().GetGraphRunner();

  if (graph_runner == nullptr) {
    MS_LOG(ERROR) << "Can not found GraphRunner";
    return;
  }
  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    Status ret = graph_runner->RunGraph(run_options, ge_tensors, &ge_outputs);
    if (ret != Status::SUCCESS) {
      MS_LOG(ERROR) << "Exec graph:" << run_options.name << " failed";
      return;
    }
  }
 }

 // Validates the number of call arguments for `phase` and, in dataset feed mode only, converts
 // every argument to a tensor and appends it to *inputs.
 //
 // An argument-count mismatch, a failed conversion, or a non-tensor argument is reported
 // through MS_LOG(EXCEPTION). Outside DS_FEED_MODE *inputs is left untouched.
 void ExecutorPy::ProcessGeArg(const py::tuple& args, const std::string& phase, std::vector<tensor::TensorPtr>* inputs) {
  const std::size_t arg_count = args.size();
  if (arg_count != ArgListSize(phase)) {
    MS_LOG(EXCEPTION) << "The real arg num : size = " << arg_count << ". graph_arg_size = " << ArgListSize(phase);
  }

  // only in Dataset Feed Mode, fp_bp graph need input tensors
  if (ConfigManager::GetInstance().dataset_mode() != DS_FEED_MODE) {
    return;
  }

  for (std::size_t idx = 0; idx < arg_count; ++idx) {
    ValuePtr value = nullptr;
    if (!parse::ConvertData(args[idx], &value)) {
      MS_LOG(EXCEPTION) << "args convert error";
    }
    if (!value->isa<tensor::Tensor>()) {
      MS_LOG(EXCEPTION) << "args, " << value->ToString() << " is not tensor";
    } else {
      inputs->push_back(value->cast<tensor::TensorPtr>());
    }
  }
 }

 // Executes the graph compiled for `phase` and returns its result as a Python object.
 //
 // Flow:
 //   1. A phase whose prefix is "save" runs without inputs via DoExecNonInputGraph and
 //      returns None.
 //   2. An unknown phase is reported through MS_LOG(EXCEPTION).
 //   3. When ENABLE_GE is not defined, or ENABLE_INFER is defined, the function returns from
 //      inside the #if block and the code after it is never reached.
 //   4. Otherwise a constant or parameter output is returned directly, and any other graph is
 //      run through DoExecGraph.
 // ConfigManager::ResetConfig() is called before every return inside this function except the
 // `return nullptr` in step 3.
 py::object ExecutorPy::ExecDFGraph(const py::tuple& args, const std::string& phase) {
  std::string phase_prefix = GetPhasePrefix(phase);

  if (phase_prefix == "save") {
    DoExecNonInputGraph(phase);
    ConfigManager::GetInstance().ResetConfig();
    return py::none();
  }

  if (info_.count(phase) == 0) {
    MS_LOG(EXCEPTION) << "has no phase:" << phase;
  }

 #if (!defined ENABLE_GE) || (defined ENABLE_INFER)
  // The graph is not executed in this configuration because GE execution has no effect here.
  MS_EXCEPTION_IF_NULL(info_[phase]->func_graph);
  if (ENABLE_TRAIN != info_[phase]->func_graph->flags()["training"]) {
    MS_LOG(ERROR) << "Graph training mode mismatch mode of libraries";
    ConfigManager::GetInstance().ResetConfig();
    return py::none();
  }
  // NOTE(review): returns a py::object built from nullptr and skips ResetConfig() - confirm
  // both are intended.
  return nullptr;
 #endif

  // NOTE(review): on this path func_graph is dereferenced without the null check that the
  // #if branch above performs.
  std::shared_ptr<py::object> ret_val = std::make_shared<py::object>();
  if (IsGraphOutputValueNodeOrParameter(info_[phase]->func_graph->output(), args, ret_val)) {
    ConfigManager::GetInstance().ResetConfig();
    return *ret_val;
  }

  std::vector<tensor::TensorPtr> inputs;
  ProcessGeArg(args, phase, &inputs);

  // DoExecGraph signals failure with a null pointer.
  std::shared_ptr<py::object> ret = DoExecGraph(GetFuncGraph(phase), inputs, phase);
  ConfigManager::GetInstance().ResetConfig();
  if (ret != nullptr) {
    return *ret;
  } else {
    MS_LOG(EXCEPTION) << "exec graph failed";
  }
 }

 // Runs the graph registered under `phase` with the tensors derived from `init_params` and,
 // in DISTRIBUTION mode, the broadcast graph afterwards when one is registered.
 //
 // `init_params` is converted with ConvertObjectToTensors; the tensors are fed to the graph
 // in the iteration order of the resulting TensorOrderMap. Returns silently when the tensor
 // conversion to GE tensors is incomplete or when no graph named `phase` exists; a missing
 // GraphRunner or a failed run is reported through MS_LOG(EXCEPTION).
 void ExecutorPy::RunInitGraph(const py::dict& init_params, const std::string& phase) {
  MS_LOG(DEBUG) << "ExecInitGraph start.";
  TensorOrderMap inputs_with_name{};
  ConvertObjectToTensors(init_params, &inputs_with_name);
  // Drop the names and keep only the tensors, preserving map order.
  std::vector<tensor::TensorPtr> inputs;
  (void)std::transform(inputs_with_name.begin(), inputs_with_name.end(), std::back_inserter(inputs),
                       [](const std::pair<std::string, tensor::TensorPtr>& item) { return item.second; });

  std::vector<GeTensorPtr> ge_tensors = TransformUtil::ConvertInputTensors(inputs, kOpFormat_NCHW);
  // A size mismatch means at least one tensor could not be converted.
  if (ge_tensors.size() != inputs.size()) {
    MS_LOG(ERROR) << "Args convert to ge tensor error.";
    return;
  }
  MS_LOG(DEBUG) << "Run graph begin, inputs size is: " << inputs.size() << ".";

  std::vector<GeTensorPtr> ge_outputs;
  transform::RunOptions run_options;

  run_options.name = phase;
  if (DfGraphManager::GetInstance().GetGraphByName(phase) == nullptr) {
    MS_LOG(WARNING) << "Can not find " << phase << " sub graph, don't need data init subgraph in INFER mode.";
    return;
  }
  auto graph_runner = DfGraphManager::GetInstance().GetGraphRunner();
  if (graph_runner == nullptr) {
    MS_LOG(EXCEPTION) << "Can not found GraphRunner.";
  }
  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    Status ret = graph_runner->RunGraph(run_options, ge_tensors, &ge_outputs);
    if (ret != Status::SUCCESS) {
      MS_LOG(EXCEPTION) << "Exec " << phase << " graph failed.";
    }

    MS_LOG(INFO) << "Exec " << phase << " graph success.";

    // The broadcast graph reuses the same input tensors and the same output vector.
    if ((ConfigManager::GetInstance().parallel_strategy() == ParallelStrategy::DISTRIBUTION) &&
        (DfGraphManager::GetInstance().GetGraphByName(BROADCAST_GRAPH_NAME) != nullptr)) {
      run_options.name = BROADCAST_GRAPH_NAME;
      ret = graph_runner->RunGraph(run_options, ge_tensors, &ge_outputs);
      if (ret != Status::SUCCESS) {
        MS_LOG(EXCEPTION) << "Exec BROADCAST_GRAPH_NAME failed.";
      }
      MS_LOG(INFO) << "Exec broadcast graph success.";
    }
  }
 }

 // Makes sure DfGraphManager holds a GE session and installs a fresh GraphRunner bound to it.
 //
 // An existing session is reused as is. Otherwise a new one is created with training options
 // (`is_training` true) or inference options, and registered with the manager. Returns
 // Status::FAILED when the session or the runner cannot be created, Status::SUCCESS otherwise.
 Status CreateSessionAndGraphRunner(bool is_training = true) {
  std::shared_ptr<ge::Session> session = DfGraphManager::GetInstance().GetGeSession();
  if (session == nullptr) {
    transform::SessionOptions session_options;
    session_options["ge.trainFlag"] = is_training ? "1" : "0";
    if (is_training) {
      session_options["ge.streamNum"] = "100";
      session_options["ge.enabledLocalFmkop"] = "1";
      session_options["ge.hcomParallel"] = "1";
    }
    session_options["ge.enablePrintOpPass"] = "0";

    session = transform::GraphRunner::NewSession(session_options);
    if (session == nullptr) {
      MS_LOG(ERROR) << "Init data graph failed, because of create Ge session failed";
      return Status::FAILED;
    }
    DfGraphManager::GetInstance().SetGeSession(session);
  }

  transform::GraphRunnerOptions runner_options;
  runner_options.sess_ptr = session;
  auto runner = std::make_shared<transform::GraphRunner>(runner_options);
  if (runner == nullptr) {
    MS_LOG(ERROR) << "Create new graph runner failed";
    return Status::FAILED;
  }
  DfGraphManager::GetInstance().SetGraphRunner(runner);
  return Status::SUCCESS;
 }

 // Converts a Python dict of parameters into named tensors.
 //
 // For every entry whose key is a Python str, the value's `default_input` attribute is
 // turned into a tensor and inserted into *tensors under that key:
 //   * Python float -> Float32 tensor of shape [1];
 //   * Python int   -> Int32 tensor of shape [1];
 //   * an object carrying PYTHON_TENSOR_FLAG -> cast to the existing tensor.
 // Entries with a non-string key are skipped with a warning. A `default_input` of any other
 // kind is reported through MS_LOG(EXCEPTION).
 void ExecutorPy::ConvertObjectToTensors(const py::dict& dict, TensorOrderMap* const tensors) {
  for (auto item : dict) {
    if ((!py::isinstance<py::str>(item.first))) {
      MS_LOG(WARNING) << "Type of key of py_dict is not string, ignore it.";
      continue;
    }
    std::shared_ptr<Tensor> tensor;
    std::string name = py::cast<std::string>(item.first);
    // Fetch the attribute once instead of repeating the Python lookup in every branch.
    py::object default_input = item.second.attr("default_input");
    if (py::isinstance<py::float_>(default_input)) {
      // convert float to tensor with shape([1])
      tensor = std::make_shared<Tensor>(kNumberTypeFloat32, std::vector<int>({1}));
      *(static_cast<float*>(tensor->data_c(true))) = py::cast<float>(default_input);
    } else if (py::isinstance<py::int_>(default_input)) {
      // convert int to tensor with shape([1])
      tensor = std::make_shared<Tensor>(kNumberTypeInt32, std::vector<int>({1}));
      // The buffer belongs to an Int32 tensor, so the value must be stored as a 32-bit integer.
      // Writing it through float* would leave float bits that read back as a wrong integer.
      // A Python int outside the int32 range makes py::cast fail instead of being stored.
      *(static_cast<int32_t*>(tensor->data_c(true))) = py::cast<int32_t>(default_input);
    } else if (py::hasattr(default_input, PYTHON_TENSOR_FLAG)) {
      // cast tensor
      tensor = py::cast<std::shared_ptr<Tensor>>(default_input);
    }

    if (tensor == nullptr) {
      MS_LOG(EXCEPTION) << "Get default value for " << name << " failed";
    }
    (void)tensors->emplace(name, tensor);
  }
 }

 // Converts the ANF graph stored for `phase` into GE graphs and registers them with
 // DfGraphManager.
 //
 // Four graphs are registered: the compute graph under `phase`, the init graph under
 // "init_subgraph.<net_id>", the broadcast graph under BROADCAST_GRAPH_NAME and the
 // checkpoint graph under "save.<net_id>". Returns false when `broadcast_params` is neither
 // None nor a dict, or when the convertor reports an error; returns true otherwise.
 //
 // NOTE(review): info_[phase] is accessed without checking that `phase` exists; the visible
 // caller BuildDFGraph checks info_.count(phase) first - confirm for other callers.
 bool ExecutorPy::AddDFGraph(const py::dict& init_params, const std::string& phase, const py::object& broadcast_params) {
  FuncGraphPtr anf_graph = info_[phase]->func_graph;
  DfGraphConvertor convertor(anf_graph);

  // net_id is the text after the first '.', or the whole phase when there is no '.' or the
  // '.' is the last character. phase_prefix is the text before the first '.' (the whole
  // phase when there is none).
  size_t pos = phase.find('.');
  std::string net_id = ((pos == std::string::npos || pos == phase.size() - 1) ? phase : phase.substr(pos + 1));
  std::string phase_prefix = phase.substr(0, pos);

  if (phase_prefix == "export") {
    MS_LOG(INFO) << "Set DfGraphConvertor training : false";
    convertor.set_training(false);
  }

  TensorOrderMap init_tensors{};
  ConvertObjectToTensors(init_params, &init_tensors);
  (void)convertor.ConvertAllNode().InitParam(init_tensors).BuildGraph();

  // A broadcast graph is generated only when broadcast_params is not None: from the init
  // tensors when the dict is empty, from the dict's own tensors otherwise.
  if (broadcast_params != py::none()) {
    if (!py::isinstance<py::dict>(broadcast_params)) {
      MS_LOG(ERROR) << "Invalid broadcast params, it must be py::dict type";
      return false;
    }
    py::dict broadcast = broadcast_params.cast<py::dict>();
    if (broadcast.empty()) {
      (void)convertor.GenerateBroadcastGraph(init_tensors);
    } else {
      TensorOrderMap broadcast_tensors{};
      ConvertObjectToTensors(broadcast, &broadcast_tensors);
      (void)convertor.GenerateBroadcastGraph(broadcast_tensors);
    }
    MS_LOG(INFO) << "Generate broadcast graph with params and broadcast_empty is " << broadcast.empty();
  }

  (void)convertor.GenerateCheckpointGraph();
  // The builder calls above discard their results; failures are detected through ErrCode().
  if (convertor.ErrCode() != 0) {
    DfGraphManager::GetInstance().ClearGraph();
    MS_LOG(ERROR) << "convert df graph failed, err:" << convertor.ErrCode();
    return false;
  }

  if (MsContext::GetInstance()->save_graphs_flag()) {
    convertor.DrawComputeGraph(GetFilePathName("ge_graph.dot"));                      // for debug
    convertor.DrawInitGraph(GetFilePathName("init_graph.dot"));                       // for debug
    convertor.DrawSaveCheckpointGraph(GetFilePathName("save_checkpoint_graph.dot"));  // for debug
  }
  std::string init_graph = "init_subgraph." + net_id;
  std::string checkpoint_name = "save." + net_id;
  // Phases whose name contains "train" get the extra "ge.exec.variable_acc" option.
  if (phase.find("train") != std::string::npos) {
    (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph(), {{"ge.exec.variable_acc", "1"}});
  } else {
    (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph());
  }
  (void)DfGraphManager::GetInstance().AddGraph(init_graph, convertor.GetInitGraph());
  (void)DfGraphManager::GetInstance().AddGraph(BROADCAST_GRAPH_NAME, convertor.GetBroadcastGraph());
  // Only the checkpoint graph registration is checked; on success the ANF graph is attached
  // to it under the same name.
  Status ret = DfGraphManager::GetInstance().AddGraph(checkpoint_name, convertor.GetSaveCheckpointGraph());
  if (ret == Status::SUCCESS) {
    DfGraphManager::GetInstance().SetAnfGraph(checkpoint_name, anf_graph);
  }

  return true;
 }

 FuncGraphPtr ExecutorPy::BuildDFGraph(const py::dict& init_params, const std::string& phase,
                                      const py::object& broadcast_params) {
  if (info_.count(phase) == 0) {
    MS_LOG(EXCEPTION) << "no phase in executor:" << GetPhasePrefix(phase);
  }
  FuncGraphPtr anf_graph = info_[phase]->func_graph;

  if (MsContext::GetInstance()->save_graphs_flag()) {
    draw::Draw(GetFilePathName("anf_graph.dot"), anf_graph);  // for debug
    DumpIR(GetFilePathName("anf_graph.ir"), anf_graph, true);
  }

  if (!AddDFGraph(init_params, phase, broadcast_params)) {
    MS_LOG(ERROR) << "GenConvertor failed";
    return nullptr;
  }

 #if ENABLE_TRAIN
  (void)setenv("GE_TRAIN", "1", 1);
 #else
  (void)setenv("GE_TRAIN", "0", 1);
 #if ENABLE_GE
  RunGEInitGraph(init_params, phase);
 #endif

  if (CreateSessionAndGraphRunner(static_cast<bool>(ENABLE_TRAIN)) != Status::SUCCESS) {
    MS_LOG(ERROR) << "Create GE Session or GraphRunner failed.";
    return nullptr;
  }

  return anf_graph;
 }

 bool InitExecDataset(const std::string& queue_name, int64_t iter_num, int64_t batch_size,
@@ -1156,47 +669,16 @@ bool InitExecDataset(const std::string& queue_name, int64_t iter_num, int64_t ba
  std::string name = MsContext::GetInstance()->backend_policy();
  if (name == kMsConvert || name == kMsVm) {
    return InitExecDatasetVm(queue_name, iter_num, batch_size, types, shapes, input_indexes);
  } else {
    return InitExecDatasetGe(queue_name, iter_num, batch_size, types, shapes, input_indexes, phase);
  }
 }

 bool InitExecDatasetGe(const std::string& queue_name, int64_t size, int64_t batch_size,
                       const std::vector<TypePtr>& types, const std::vector<std::vector<int64_t>>& shapes,
                       const std::vector<int64_t>& input_indexes, const std::string& phase) {
  // Convert types to GE types and TF types
  std::vector<int64_t> ge_types;
  (void)std::transform(types.begin(), types.end(), std::back_inserter(ge_types), [](const TypePtr& i) -> int64_t {
    return transform::TransformUtil::ConvertDataType(i->type_id());
  });

  ConfigManager::GetInstance().set_dataset_mode(DatasetMode::DS_GRAPH_MODE);
  ConfigManager::GetInstance().set_iter_num(size);
  ConfigManager::GetInstance().set_dataset_phase(phase);

  DatasetGraphParam param(queue_name, size, batch_size, ge_types, shapes, input_indexes);
  ConfigManager::GetInstance().set_dataset_param(param);

  if (transform::BuildDatasetGraph(param, phase) != transform::SUCCESS) {
    MS_LOG(ERROR) << "Build dateset graph failed.";
    return false;
  }

 #if ENABLE_TRAIN
  (void)setenv("GE_TRAIN", "1", 1);
 #if ENABLE_GE
  return InitExecDatasetGe(queue_name, iter_num, batch_size, types, shapes, input_indexes, phase);
 #else
  (void)setenv("GE_TRAIN", "0", 1);
 #endif

  if (CreateSessionAndGraphRunner(static_cast<bool>(ENABLE_TRAIN)) != Status::SUCCESS) {
    MS_LOG(ERROR) << "Create GE Session or GraphRunner failed.";
    return false;
  std::string backend = MsContext::GetInstance()->backend_policy();
  if (backend == "ge") {
    return true;
  }

  MS_LOG(INFO) << "DoExecNonInputGraph:" << phase;
  DoExecNonInputGraph(phase);

  return true;
 #endif
  return false;
 }

 bool InitExecDatasetVm(const std::string& queue_name, int64_t size, int64_t batch_size,
@@ -1259,25 +741,6 @@ bool InitExecDatasetVm(const std::string& queue_name, int64_t size, int64_t batc
  return true;
 }

 // Sets the python-env flag of the python adapter, opens TSD through the MsContext singleton
 // and then initializes GE. A failed OpenTsd() is reported through MS_LOG(EXCEPTION).
 void InitGe() {
  mindspore::parse::python_adapter::set_python_env_flag(true);

  // TSD has to be opened before GE is initialized.
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  const bool tsd_failed = !context->OpenTsd();
  if (tsd_failed) {
    MS_LOG(EXCEPTION) << "open tsd failed";
  }

  // The result of MsContext::InitGe() is intentionally ignored.
  (void)context->InitGe();
 }

 // Finalizes GE and then closes TSD through the MsContext singleton, ignoring the result of
 // both calls.
 void FinalizeGe() {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);

  // Order matters: GE is finalized first, TSD is closed afterwards.
  (void)context->FinalizeGe();
  (void)context->CloseTsd();
 }

 // Forwards to mindspore::id_generator::reset_id().
 void ResetOpId() { mindspore::id_generator::reset_id(); }

 void InitHccl() {
@@ -1309,24 +772,57 @@ void FinalizeHccl() {
  device::KernelRuntimeManager::Instance().ClearRuntimeResource();
 #endif
 }
 void ExportDFGraph(const std::string& file_name, const std::string&, const std::string& phase) {
  MS_LOG(DEBUG) << "ExportGraph Begin";
  transform::DfGraphWrapperPtr wrap_ptr = DfGraphManager::GetInstance().GetGraphByName(phase);
  if (wrap_ptr == nullptr) {
    MS_LOG(ERROR) << "Get graph form DfGraphManager failed!";
    return;
  }

  transform::DfGraphPtr ge_graph = wrap_ptr->graph_ptr_;
  if (nullptr == ge_graph) {
    MS_LOG(ERROR) << "The export graph is null";
    return;
 void ExportGraph(const std::string& file_name, const std::string&, const std::string& phase) {
 #if (ENABLE_GE || ENABLE_D)
  ExportDFGraph(file_name, phase);
 #endif
  MS_LOG(WARNING) << "In ut test no export_graph";
 }

 void ReleaseGeTsd() {
  auto context_ptr = MsContext::GetInstance();
  if (context_ptr != nullptr) {
    (void)context_ptr->FinalizeGe(true);
    (void)context_ptr->CloseTsd(true);
  }
 }

  (void)ge_graph->SaveToFile(file_name);
 void InitGe() {
  // set python env flag
  mindspore::parse::python_adapter::set_python_env_flag(true);
  // open tsd before ge initialize
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (!ms_context->OpenTsd()) {
    MS_LOG(EXCEPTION) << "open tsd failed";
  }
  (void)ms_context->InitGe();
 }

  MS_LOG(DEBUG) << "ExportGraph End";
 void FinalizeGe() {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  (void)context_ptr->FinalizeGe();
  (void)context_ptr->CloseTsd();
 }

 // Releases pipeline-wide resources in a fixed order: kernel runtime resources, the
 // ad::g_k_prims cache, the primitive evaluator map, the convert cache, the method map and
 // the ExecutorPy resources; then (ENABLE_GE builds only) the DfGraphManager graphs and the
 // DfGraphConvertor adapter map; finally GE/TSD via ReleaseGeTsd() and the Python scope held
 // by the python adapter.
 // NOTE(review): the name suggests this runs at interpreter exit; the registration is not
 // visible here - confirm.
 void ClearResAtexit() {
  MS_LOG(DEBUG) << "Pipeline clear all resource";
  device::KernelRuntimeManager::Instance().ClearRuntimeResource();

  ad::g_k_prims.clear();

  abstract::ClearPrimEvaluatorMap();
  compile::ClearConvertCache();
  pipeline::GetMethodMap().clear();
  pipeline::ExecutorPy::ClearRes();
 #ifdef ENABLE_GE
  // GE-only state; skipped entirely in builds without ENABLE_GE.
  transform::DfGraphManager::GetInstance().ClearGraph();
  transform::DfGraphConvertor::get_adpt_map().clear();
 #endif
  ReleaseGeTsd();
  // Drop the Python scope last, after all other resources have been released.
  parse::python_adapter::ResetPythonScope();
 }
 }  // namespace pipeline
 }  // namespace mindspore
--- a/mindspore/ccsrc/pipeline/pipeline.h
+++ b/mindspore/ccsrc/pipeline/pipeline.h
@@ -30,6 +30,7 @@
 #include "pipeline/action.h"
 #include "vm/segment_runner.h"
 #include "vm/transform.h"
 #include "pipeline/base.h"

 namespace mindspore {
 extern const char kMsConvert[];
@@ -55,14 +56,6 @@ class Pipeline {
  std::vector<ActionItem> actions_;
 };

 // Per-phase record held by the executor.
 struct ExecutorInfo {
  // Function graph associated with the phase.
  FuncGraphPtr func_graph;
  // Resource object associated with the phase.
  ResourcePtr resource;
  // Argument count recorded for the phase - presumably what the argument-count checks compare
  // against; confirm against ArgListSize().
  std::size_t arg_list_size;
 };

 using ExecutorInfoPtr = std::shared_ptr<ExecutorInfo>;

 // A function pipeline.
 class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
 public:
@@ -80,11 +73,7 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
  bool CompileInner(const py::object& obj, const py::tuple& args, const py::object& phase, bool use_vm);
  bool Compile(const py::object& obj, const py::tuple& args, const py::object& phase, bool use_vm);

  // for graph mode
  py::object ExecDFGraph(const py::tuple& args, const std::string& phase = "train");

  void ProcessVmArg(const py::tuple& args, const std::string& phase, VectorRef* arg_list);
  void ProcessGeArg(const py::tuple& args, const std::string& phase, std::vector<tensor::TensorPtr>* inputs);

  // for pynative mode when use_vm is on
  py::object Run(const py::tuple& args, const py::object& phase);
@@ -95,9 +84,8 @@ class ExecutorPy : public std::enable_shared_from_this<ExecutorPy> {
  compile::VmEvalFuncPtr GetVmEvalFunc(const std::string& phase);
  bool HasCompiled(const std::string& phase) const;

  bool AddDFGraph(const py::dict& init_params, const std::string& phase, const py::object& broadcast_params);
  FuncGraphPtr BuildDFGraph(const py::dict& init_params, const std::string& phase,
                            const py::object& broadcast_params = {});
  FuncGraphPtr BuildGraph(const py::dict& init_params, const std::string& phase,
                          const py::object& broadcast_params = {});
  void RunInitGraph(const py::dict& init_params, const std::string& phase);
  py::dict GetParameterLayout(const std::string& phase);
  py::dict GetCNodeStrategy(const std::string& phase);
@@ -122,32 +110,29 @@ using ExecutorPyPtr = std::shared_ptr<ExecutorPy>;
 // NOTE(review): presumably builds a lookup key from `name` and the default arguments — confirm in the definition.
 py::tuple GenerateKey(const std::string& name, const std::unordered_map<std::string, py::object>& defaults);
 // NOTE(review): presumably checks `inputs` against `input_signature` — confirm in the definition.
 py::bool_ VerifyInputSignature(const py::list input_signature, const py::tuple inputs);

 // Stores `options` as the GE initialize options held by ConfigManager.
 void SetGeOption(const std::map<std::string, std::string>& options);
 bool InitDistribute(const std::map<std::string, std::string>& options);

 // Lifecycle helpers; their definitions are outside this header.
 void ResetOpId();
 void InitGe();
 void FinalizeGe();
 void InitHccl();
 void FinalizeHccl();
 void InitGe();
 void FinalizeGe();

 // Clears pipeline-wide caches and runtime resources, then calls ReleaseGeTsd().
 void ClearResAtexit();
 // Finalizes GE and closes TSD through MsContext when a context instance exists.
 void ReleaseGeTsd();

 // The second parameter is unnamed in this declaration.
 void ExportGraph(const std::string& file_name, const std::string&, const std::string& phase);

 // init and exec dataset sub graph
 bool InitExecDataset(const std::string& queue_name, int64_t iter_num, int64_t batch_size,
                      const std::vector<TypePtr>& types, const std::vector<std::vector<int64_t>>& shapes,
                      const std::vector<int64_t>& input_indexes, const std::string& phase);

 // init and exec dataset sub graph for GE backend
 bool InitExecDatasetGe(const std::string& queue_name, int64_t size, int64_t batch_size,
                        const std::vector<TypePtr>& types, const std::vector<std::vector<int64_t>>& shapes,
                        const std::vector<int64_t>& input_indexes, const std::string& phase);

 // Build and run dataset subgraph for ms backend
 bool InitExecDatasetVm(const std::string& queue_name, int64_t size, int64_t batch_size,
                        const std::vector<TypePtr>& types, const std::vector<std::vector<int64_t>>& shapes,
                        const std::vector<int64_t>& input_indexes);

 // The second parameter is unnamed in this declaration.
 void ExportDFGraph(const std::string& file_name, const std::string&, const std::string& phase);

 }  // namespace pipeline
 }  // namespace mindspore

--- a/mindspore/ccsrc/pipeline/pipeline_ge.cc
+++ b/mindspore/ccsrc/pipeline/pipeline_ge.cc
@@ -0,0 +1,545 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "pipeline/pipeline_ge.h"

 #include <sstream>
 #include <map>
 #include <unordered_map>
 #include <cstdlib>
 #include <algorithm>

 #include "debug/anf_ir_dump.h"
 #include "ir/meta_tensor.h"
 #include "transform/convert.h"
 #include "transform/df_graph_manager.h"
 #include "transform/graph_builder.h"
 #include "transform/graph_runner.h"
 #include "debug/draw.h"
 #include "pipeline/static_analysis/abstract_value.h"

 namespace mindspore {
 namespace pipeline {
 using Tensor = mindspore::tensor::Tensor;
 using MetaTensor = mindspore::tensor::MetaTensor;
 using TensorOrderMap = std::map<std::string, std::shared_ptr<Tensor>>;
 using mindspore::abstract::AbstractTensor;
 using mindspore::abstract::AbstractTuple;
 using mindspore::abstract::AbstractTuplePtr;
 using mindspore::transform::DfGraphConvertor;
 using mindspore::transform::DfGraphManager;
 using mindspore::transform::GeTensorPtr;
 using mindspore::transform::MeTensorPtr;
 using mindspore::transform::Status;
 using mindspore::transform::TransformUtil;

 void DoExecNonInputGraph(const std::string& phase) {
  std::vector<GeTensorPtr> ge_tensors;
  std::vector<GeTensorPtr> ge_outputs;
  transform::RunOptions run_options;
  run_options.name = phase;
  auto graph_runner = DfGraphManager::GetInstance().GetGraphRunner();

  if (graph_runner == nullptr) {
    MS_LOG(ERROR) << "Can not found GraphRunner";
    return;
  }
  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    Status ret = graph_runner->RunGraph(run_options, ge_tensors, &ge_outputs);
    if (ret != Status::SUCCESS) {
      MS_LOG(ERROR) << "Exec graph:" << run_options.name << " failed";
      return;
    }
  }
 }

 // Records `options` as the GE initialize options in the global ConfigManager.
 void SetGeOption(const std::map<std::string, std::string>& options) {
  auto& config = ConfigManager::GetInstance();
  config.set_ge_initialize_options(options);
 }

 // Makes sure a GE session exists (creating and registering one if needed) and installs a fresh GraphRunner
 // bound to that session in DfGraphManager.
 // `is_training` only influences the options of a newly created session.
 // Returns Status::SUCCESS, or Status::FAILED when the session or the runner could not be created.
 Status CreateSessionAndGraphRunner(bool is_training = true) {
  auto& graph_manager = DfGraphManager::GetInstance();

  std::shared_ptr<ge::Session> session = graph_manager.GetGeSession();
  if (session == nullptr) {
    transform::SessionOptions session_options;
    session_options["ge.trainFlag"] = is_training ? "1" : "0";
    if (is_training) {
      session_options["ge.streamNum"] = "100";
      session_options["ge.enabledLocalFmkop"] = "1";
      session_options["ge.hcomParallel"] = "1";
    }
    session_options["ge.enablePrintOpPass"] = "0";

    session = transform::GraphRunner::NewSession(session_options);
    if (session == nullptr) {
      MS_LOG(ERROR) << "Init data graph failed, because of create Ge session failed";
      return Status::FAILED;
    }
    graph_manager.SetGeSession(session);
  }

  transform::GraphRunnerOptions runner_options;
  runner_options.sess_ptr = session;
  auto runner = std::make_shared<transform::GraphRunner>(runner_options);
  if (runner == nullptr) {
    MS_LOG(ERROR) << "Create new graph runner failed";
    return Status::FAILED;
  }
  graph_manager.SetGraphRunner(runner);

  return Status::SUCCESS;
 }

 bool InitExecDatasetGe(const std::string& queue_name, int64_t size, int64_t batch_size,
                       const std::vector<TypePtr>& types, const std::vector<std::vector<int64_t>>& shapes,
                       const std::vector<int64_t>& input_indexes, const std::string& phase) {
  std::vector<int64_t> ge_types;
  (void)std::transform(types.begin(), types.end(), std::back_inserter(ge_types), [](const TypePtr& i) -> int64_t {
    return transform::TransformUtil::ConvertDataType(i->type_id());
  });

  ConfigManager::GetInstance().set_dataset_mode(DatasetMode::DS_GRAPH_MODE);
  ConfigManager::GetInstance().set_iter_num(size);
  ConfigManager::GetInstance().set_dataset_phase(phase);

  DatasetGraphParam param(queue_name, size, batch_size, ge_types, shapes, input_indexes);
  ConfigManager::GetInstance().set_dataset_param(param);

  if (transform::BuildDatasetGraph(param, phase) != transform::SUCCESS) {
    MS_LOG(ERROR) << "Build dateset graph failed.";
    return false;
  }

 #if ENABLE_TRAIN
  (void)setenv("GE_TRAIN", "1", 1);
 #else
  (void)setenv("GE_TRAIN", "0", 1);
 #endif

  if (CreateSessionAndGraphRunner(static_cast<bool>(ENABLE_TRAIN)) != Status::SUCCESS) {
    MS_LOG(ERROR) << "Create GE Session or GraphRunner failed.";
    return false;
  }

  MS_LOG(INFO) << "DoExecNonInputGraph:" << phase;
  DoExecNonInputGraph(phase);

  return true;
 }

 // Converts a Python dict of parameters into a name -> tensor map.
 //
 // For every entry with a string key, the value's `default_input` attribute is inspected:
 //   - Python float -> Float32 tensor of shape [1] holding the value
 //   - Python int   -> Int32 tensor of shape [1] holding the value
 //   - object carrying PYTHON_TENSOR_FLAG -> cast to a Tensor
 // Entries whose key is not a string are skipped with a warning. Any other `default_input` type raises through
 // MS_LOG(EXCEPTION). Results are emplaced into `*tensors`; names already present are left untouched.
 void ConvertObjectToTensors(const py::dict& dict, TensorOrderMap* const tensors) {
  MS_EXCEPTION_IF_NULL(tensors);
  for (auto item : dict) {
    if ((!py::isinstance<py::str>(item.first))) {
      MS_LOG(WARNING) << "Type of key of py_dict is not string, ignore it.";
      continue;
    }
    std::shared_ptr<Tensor> tensor;
    std::string name = py::cast<std::string>(item.first);
    // Fetch the attribute once instead of re-evaluating the Python attribute lookup in every branch.
    py::object default_input = item.second.attr("default_input");
    if (py::isinstance<py::float_>(default_input)) {
      // convert float to tensor with shape([1])
      tensor = std::make_shared<Tensor>(kNumberTypeFloat32, std::vector<int>({1}));
      *(static_cast<float*>(tensor->data_c(true))) = py::cast<float>(default_input);
    } else if (py::isinstance<py::int_>(default_input)) {
      // convert int to tensor with shape([1])
      // The buffer holds int32 elements, so the value must be written as int32_t; writing it through a float*
      // would store the IEEE-754 bit pattern of the value instead of the integer itself.
      tensor = std::make_shared<Tensor>(kNumberTypeInt32, std::vector<int>({1}));
      *(static_cast<int32_t*>(tensor->data_c(true))) = py::cast<int32_t>(default_input);
    } else if (py::hasattr(default_input, PYTHON_TENSOR_FLAG)) {
      // cast tensor
      tensor = py::cast<std::shared_ptr<Tensor>>(default_input);
    }

    if (tensor == nullptr) {
      MS_LOG(EXCEPTION) << "Get default value for " << name << " failed";
    }
    (void)tensors->emplace(name, tensor);
  }
 }

 // Converts the ANF graph recorded for `phase` into GE graphs and registers them with DfGraphManager.
 //
 // Registered graphs: the compute graph (under `phase`), the init graph ("init_subgraph.<net_id>"), the
 // checkpoint graph ("save.<net_id>") and the broadcast graph (BROADCAST_GRAPH_NAME). `net_id` is the part of
 // `phase` after the first '.', or the whole phase when there is no '.' or it is the last character.
 // `broadcast_params` must be None or a dict; an empty dict means "broadcast the init parameters".
 // Returns false when `broadcast_params` has the wrong type or the convertor reports an error.
 // `info.at(phase)` throws if the phase is unknown.
 bool AddDFGraph(const std::map<std::string, ExecutorInfoPtr>& info, const py::dict& init_params,
                 const std::string& phase, const py::object& broadcast_params) {
  FuncGraphPtr anf_graph = info.at(phase)->func_graph;
  DfGraphConvertor convertor(anf_graph);

  const size_t dot_pos = phase.find('.');
  const bool has_net_suffix = (dot_pos != std::string::npos) && (dot_pos != phase.size() - 1);
  std::string net_id = has_net_suffix ? phase.substr(dot_pos + 1) : phase;

  // Graphs built for the "export" phase are converted in non-training mode.
  if (phase.substr(0, dot_pos) == "export") {
    MS_LOG(INFO) << "Set DfGraphConvertor training : false";
    convertor.set_training(false);
  }

  TensorOrderMap init_tensors{};
  ConvertObjectToTensors(init_params, &init_tensors);
  (void)convertor.ConvertAllNode().InitParam(init_tensors).BuildGraph();

  if (broadcast_params != py::none()) {
    if (!py::isinstance<py::dict>(broadcast_params)) {
      MS_LOG(ERROR) << "Invalid broadcast params, it must be py::dict type";
      return false;
    }
    py::dict broadcast = broadcast_params.cast<py::dict>();
    const bool broadcast_is_empty = broadcast.empty();
    if (!broadcast_is_empty) {
      TensorOrderMap broadcast_tensors{};
      ConvertObjectToTensors(broadcast, &broadcast_tensors);
      (void)convertor.GenerateBroadcastGraph(broadcast_tensors);
    } else {
      (void)convertor.GenerateBroadcastGraph(init_tensors);
    }
    MS_LOG(INFO) << "Generate broadcast graph with params and broadcast_empty is " << broadcast_is_empty;
  }

  (void)convertor.GenerateCheckpointGraph();
  if (convertor.ErrCode() != 0) {
    // Conversion failed: drop whatever was registered so far before reporting.
    DfGraphManager::GetInstance().ClearGraph();
    MS_LOG(ERROR) << "convert df graph failed, err:" << convertor.ErrCode();
    return false;
  }

  if (MsContext::GetInstance()->save_graphs_flag()) {
    // Debug dumps of the generated graphs.
    convertor.DrawComputeGraph(GetFilePathName("ge_graph.dot"));
    convertor.DrawInitGraph(GetFilePathName("init_graph.dot"));
    convertor.DrawSaveCheckpointGraph(GetFilePathName("save_checkpoint_graph.dot"));
  }

  const std::string init_graph_name = "init_subgraph." + net_id;
  const std::string checkpoint_graph_name = "save." + net_id;

  // Phases containing "train" register the compute graph with the variable_acc option.
  if (phase.find("train") == std::string::npos) {
    (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph());
  } else {
    (void)DfGraphManager::GetInstance().AddGraph(phase, convertor.GetComputeGraph(), {{"ge.exec.variable_acc", "1"}});
  }
  (void)DfGraphManager::GetInstance().AddGraph(init_graph_name, convertor.GetInitGraph());
  (void)DfGraphManager::GetInstance().AddGraph(checkpoint_graph_name, convertor.GetSaveCheckpointGraph());
  (void)DfGraphManager::GetInstance().AddGraph(BROADCAST_GRAPH_NAME, convertor.GetBroadcastGraph());

  DfGraphManager::GetInstance().SetAnfGraph(checkpoint_graph_name, anf_graph);

  return true;
 }

 // Builds the GE graphs for `phase` and prepares the GE session / graph runner.
 // Raises through MS_LOG(EXCEPTION) when `info` has no entry for `phase`.
 // Returns the ANF graph of the phase on success, nullptr when graph conversion or session setup fails.
 FuncGraphPtr BuildDFGraph(const std::map<std::string, ExecutorInfoPtr>& info, const py::dict& init_params,
                           const std::string& phase, const py::object& broadcast_params) {
  auto entry = info.find(phase);
  if (entry == info.end()) {
    MS_LOG(EXCEPTION) << "no phase in executor:" << GetPhasePrefix(phase);
  }
  FuncGraphPtr anf_graph = entry->second->func_graph;

  // Optional debug dumps of the ANF graph.
  if (MsContext::GetInstance()->save_graphs_flag()) {
    draw::Draw(GetFilePathName("anf_graph.dot"), anf_graph);
    DumpIR(GetFilePathName("anf_graph.ir"), anf_graph, true);
  }

  const bool graphs_added = AddDFGraph(info, init_params, phase, broadcast_params);
  if (!graphs_added) {
    MS_LOG(ERROR) << "GenConvertor failed";
    return nullptr;
  }

  // Publish the build-time training flag through the GE_TRAIN environment variable.
 #if ENABLE_TRAIN
  (void)setenv("GE_TRAIN", "1", 1);
 #else
  (void)setenv("GE_TRAIN", "0", 1);
 #endif

  const Status session_status = CreateSessionAndGraphRunner(static_cast<bool>(ENABLE_TRAIN));
  if (session_status != Status::SUCCESS) {
    MS_LOG(ERROR) << "Create GE Session or GraphRunner failed.";
    return nullptr;
  }

  return anf_graph;
 }

 // Runs the init graph named `phase` with the tensors converted from `init_params`.
 // If the graph is not registered the function only warns and returns. When the parallel strategy is
 // DISTRIBUTION and a broadcast graph is registered, that graph is run afterwards with the same inputs.
 // A missing graph runner or a failed run raises through MS_LOG(EXCEPTION).
 void RunGEInitGraph(const py::dict& init_params, const std::string& phase) {
  MS_LOG(DEBUG) << "ExecInitGraph start.";
  TensorOrderMap named_inputs{};
  ConvertObjectToTensors(init_params, &named_inputs);

  // Keep only the tensors, in the iteration order of the name -> tensor map.
  std::vector<tensor::TensorPtr> inputs;
  for (const auto& named_input : named_inputs) {
    inputs.push_back(named_input.second);
  }

  std::vector<GeTensorPtr> ge_tensors = TransformUtil::ConvertInputTensors(inputs, kOpFormat_NCHW);
  if (ge_tensors.size() != inputs.size()) {
    MS_LOG(ERROR) << "Args convert to ge tensor error.";
    return;
  }
  MS_LOG(DEBUG) << "Run graph begin, inputs size is: " << inputs.size() << ".";

  std::vector<GeTensorPtr> ge_outputs;
  transform::RunOptions run_options;
  run_options.name = phase;

  if (DfGraphManager::GetInstance().GetGraphByName(phase) == nullptr) {
    MS_LOG(WARNING) << "Can not find " << phase << " sub graph, don't need data init subgraph in INFER mode.";
    return;
  }

  auto runner = DfGraphManager::GetInstance().GetGraphRunner();
  if (runner == nullptr) {
    MS_LOG(EXCEPTION) << "Can not found GraphRunner.";
  }

  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    if (runner->RunGraph(run_options, ge_tensors, &ge_outputs) != Status::SUCCESS) {
      MS_LOG(EXCEPTION) << "Exec " << phase << " graph failed.";
    }

    MS_LOG(INFO) << "Exec " << phase << " graph success.";

    const bool run_broadcast =
      (ConfigManager::GetInstance().parallel_strategy() == ParallelStrategy::DISTRIBUTION) &&
      (DfGraphManager::GetInstance().GetGraphByName(BROADCAST_GRAPH_NAME) != nullptr);
    if (run_broadcast) {
      run_options.name = BROADCAST_GRAPH_NAME;
      if (runner->RunGraph(run_options, ge_tensors, &ge_outputs) != Status::SUCCESS) {
        MS_LOG(EXCEPTION) << "Exec BROADCAST_GRAPH_NAME failed.";
      }
      MS_LOG(INFO) << "Exec broadcast graph success.";
    }
  }
 }

 // Rebuilds the Python value of one CNode output from the flat tuple of tensors returned by GE.
 //
 // `cnode_data` is the abstract of the node, `data` the flat output tuple and `*count` the index of the next
 // unconsumed element in `data` (advanced by one for every tensor taken).
 //   - AbstractTensor: the next element is returned after checking that its shape equals the inferred shape.
 //   - AbstractTuple: a tuple is built by recursing over the elements.
 // Anything else, a shape mismatch or running out of elements raises through MS_LOG(EXCEPTION).
 py::object ExtractGeneralCnodeRet(const AbstractBasePtr& cnode_data, const py::tuple& data, size_t* count) {
  MS_EXCEPTION_IF_NULL(cnode_data);
  MS_EXCEPTION_IF_NULL(count);
  if (*count >= data.size()) {
    MS_LOG(EXCEPTION) << "The number of elements in the outputs : " << data.size()
                      << " less than the number of elements required. ";
  }

  if (cnode_data->isa<AbstractTensor>()) {
    BaseShapePtr shape = cnode_data->BuildShape();
    MS_EXCEPTION_IF_NULL(shape);
    // cast<>() yields nullptr when the built shape is not an abstract::Shape; fail with an exception instead of
    // dereferencing a null pointer.
    auto tensor_shape = shape->cast<abstract::ShapePtr>();
    MS_EXCEPTION_IF_NULL(tensor_shape);
    auto shape_act = tensor_shape->shape();
    Tensor tensor_exp = py::cast<Tensor>(data[*count]);
    if (shape_act != tensor_exp.shape()) {
      MS_LOG(EXCEPTION) << "The shape of the tensor returned from GE is not the same as "
                           "the shape of the tensor derived from ME.";
    }
    return data[(*count)++];
  }

  if (!cnode_data->isa<AbstractTuple>()) {
    MS_LOG(EXCEPTION) << "The output of operator in the final anf graph could "
                      << "only be a tensor or a tuple of tensor, but got " << cnode_data->BuildValue()->ToString()
                      << ".";
  }
  auto data_tp = cnode_data->cast<AbstractTuplePtr>();
  MS_EXCEPTION_IF_NULL(data_tp);
  auto elements = data_tp->elements();
  size_t size = data_tp->size();
  py::tuple tp = py::tuple(size);
  for (size_t i = 0; i < size; i++) {
    tp[i] = ExtractGeneralCnodeRet(elements[i], data, count);
  }
  return std::move(tp);
 }

 // Rebuilds the (possibly nested) Python result for `output_node` from the flat tuple `data`.
 // `*count` is the index of the next unconsumed element of `data`.
 //   - ValueNode: the constant is converted directly, nothing is consumed from `data`.
 //   - Parameter: the next element of `data` is returned.
 //   - MakeTuple: a tuple is built from its inputs, skipping input 0.
 //   - Depend: the result is that of input 1.
 //   - any other CNode: delegated to ExtractGeneralCnodeRet with the node's abstract.
 // Raises through MS_LOG(EXCEPTION) when `data` runs out of elements or the node kind is unsupported.
 py::object StructureOutput(const AnfNodePtr& output_node, const py::tuple& data, size_t* count) {
  MS_EXCEPTION_IF_NULL(output_node);

  // Constants never come from the executed graph.
  if (output_node->isa<ValueNode>()) {
    return ValuePtrToPyData(GetValueNode(output_node));
  }

  const size_t available = data.size();
  if (*count >= available) {
    MS_LOG(EXCEPTION) << "The number of elements in the outputs : " << available
                      << " less than the number of elements required. ";
  }
  if (output_node->isa<Parameter>()) {
    return data[(*count)++];
  }

  auto cnode = output_node->cast<CNodePtr>();
  if (cnode == nullptr) {
    MS_LOG(EXCEPTION) << "The final anf graph could only have constant, parameter, and operator, but got "
                      << output_node->ToString();
  }

  if (cnode->IsApply(prim::kPrimDepend)) {
    return StructureOutput(cnode->input(1), data, count);
  }

  if (!cnode->IsApply(prim::kPrimMakeTuple)) {
    return ExtractGeneralCnodeRet(cnode->abstract(), data, count);
  }

  // MakeTuple: input 0 is skipped, the remaining inputs are the tuple elements.
  auto tuple_inputs = cnode->inputs();
  const size_t input_num = tuple_inputs.size();
  py::tuple result = py::tuple(input_num - 1);
  for (size_t idx = 0; idx + 1 < input_num; ++idx) {
    result[idx] = StructureOutput(tuple_inputs[idx + 1], data, count);
  }
  return std::move(result);
 }

 std::shared_ptr<py::object> DoExecGraph(const FuncGraphPtr& graph, const std::vector<MeTensorPtr>& inputs,
                                        const std::string& phase) {
  std::vector<GeTensorPtr> ge_tensors = TransformUtil::ConvertInputTensors(inputs, kOpFormat_NCHW);
  if (ge_tensors.size() != inputs.size()) {
    MS_LOG(ERROR) << "args convert to ge tensor error";
    return nullptr;
  }

  std::vector<GeTensorPtr> ge_outputs;
  transform::RunOptions run_options;

  run_options.name = phase;

  auto graph_runner = DfGraphManager::GetInstance().GetGraphRunner();

  if (graph_runner == nullptr) {
    MS_LOG(ERROR) << "Can not found GraphRunner";
    return nullptr;
  }

  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    MS_LOG(DEBUG) << "Run graph begin, inputs size is: " << inputs.size();
    Status ret = graph_runner->RunGraph(run_options, ge_tensors, &ge_outputs);
    MS_LOG(DEBUG) << "Run graph finish, outputs size is: " << ge_outputs.size();
    if (ret != Status::SUCCESS) {
      MS_LOG(ERROR) << "Exec graph failed";
      return nullptr;
    }
  }

  std::vector<MeTensorPtr> me_outputs = TransformUtil::ConvertGeTensors(ge_outputs);
  if (me_outputs.size() != ge_outputs.size()) {
    MS_LOG(ERROR) << "Convert output Ge tensor to Me tensor failed";
  }

  py::tuple outputs(me_outputs.size());
  for (std::size_t i = 0; i < outputs.size(); i++) {
    outputs[i] = *me_outputs[i];
  }

  std::shared_ptr<py::object> ret = nullptr;

 #ifdef ENABLE_GE
  AnfNodePtr root = graph->get_return();
  MS_EXCEPTION_IF_NULL(root);
  AbstractBasePtr output = root->abstract();
  size_t count = 0;
  py::object oj = StructureOutput(output, outputs, &count);
  ret = std::make_shared<py::object>(oj);
 #else
  if (outputs.size() == 1) {
    ret = std::make_shared<py::object>(outputs[0]);
  } else {
    ret = std::make_shared<py::object>(outputs);
  }
 #endif

  return ret;
 }

 // Validates `args` against the graph recorded for `phase` and, in dataset feed mode, appends them to `*inputs`.
 // Raises through MS_LOG(EXCEPTION) when the phase is unknown, the argument count differs from the recorded
 // one, an argument cannot be converted, or a converted argument is not a tensor.
 // Outside dataset feed mode `*inputs` is left untouched.
 void ProcessGeArg(const std::map<std::string, ExecutorInfoPtr>& info, const py::tuple& args, const std::string& phase,
                   std::vector<tensor::TensorPtr>* inputs) {
  // check the arg and use the ExecutorPy args
  const std::size_t given_num = args.size();

  auto entry = info.find(phase);
  if (entry == info.end()) {
    MS_LOG(EXCEPTION) << "no phase in executor:" << GetPhasePrefix(phase);
  }

  const auto expected_num = entry->second->arg_list_size;
  if (given_num != expected_num) {
    MS_LOG(EXCEPTION) << "The real arg num : size = " << given_num << ". graph_arg_size = " << expected_num;
  }

  // only in Dataset Feed Mode, fp_bp graph need input tensors
  if (ConfigManager::GetInstance().dataset_mode() != DS_FEED_MODE) {
    return;
  }

  for (std::size_t idx = 0; idx < given_num; ++idx) {
    ValuePtr value = nullptr;
    if (!parse::ConvertData(args[idx], &value)) {
      MS_LOG(EXCEPTION) << "args convert error";
    }
    if (!value->isa<tensor::Tensor>()) {
      MS_LOG(EXCEPTION) << "args, " << value->ToString() << " is not tensor";
    }
    inputs->push_back(value->cast<tensor::TensorPtr>());
  }
 }

 // Executes the graph recorded for `phase` and returns its result as a Python object.
 //   - Phases whose prefix is "save" run without inputs and return None.
 //   - When the graph output is a constant or one of the inputs, that value is returned without running.
 //   - Otherwise the arguments are processed and the GE graph is executed.
 // ConfigManager is reset on every return path that follows the phase lookup and in the "save" case.
 // Raises through MS_LOG(EXCEPTION) when the phase is unknown or the execution yields no result.
 py::object ExecDFGraph(const std::map<std::string, ExecutorInfoPtr>& info, const py::tuple& args,
                        const std::string& phase) {
  if (GetPhasePrefix(phase) == "save") {
    DoExecNonInputGraph(phase);
    ConfigManager::GetInstance().ResetConfig();
    return py::none();
  }

  auto entry = info.find(phase);
  if (entry == info.end()) {
    MS_LOG(EXCEPTION) << "has no phase:" << phase;
  }

  FuncGraphPtr anf_graph = entry->second->func_graph;

 #if (!defined ENABLE_GE) || (defined ENABLE_INFER)
  // Now don't use the graph because the exec ge function don't take effect
  MS_EXCEPTION_IF_NULL(info.at(phase)->func_graph);
  if (ENABLE_TRAIN != anf_graph->flags()["training"]) {
    MS_LOG(ERROR) << "Graph training mode mismatch mode of libraries";
    ConfigManager::GetInstance().ResetConfig();
    return py::none();
  }
 #endif

  // We will not execute graph when output is constant or just input itself.
  std::shared_ptr<py::object> shortcut_result = std::make_shared<py::object>();
  if (IsGraphOutputValueNodeOrParameter(anf_graph->output(), args, shortcut_result)) {
    ConfigManager::GetInstance().ResetConfig();
    return *shortcut_result;
  }

  std::vector<tensor::TensorPtr> graph_inputs;
  ProcessGeArg(info, args, phase, &graph_inputs);

  std::shared_ptr<py::object> exec_result = DoExecGraph(anf_graph, graph_inputs, phase);
  ConfigManager::GetInstance().ResetConfig();
  if (exec_result == nullptr) {
    MS_LOG(EXCEPTION) << "exec graph failed";
  }
  return *exec_result;
 }
 void ExportDFGraph(const std::string& file_name, const std::string& phase) {
  MS_LOG(DEBUG) << "ExportGraph Begin";
  transform::DfGraphWrapperPtr wrap_ptr = DfGraphManager::GetInstance().GetGraphByName(phase);
  if (wrap_ptr == nullptr) {
    MS_LOG(ERROR) << "Get graph form DfGraphManager failed!";
    return;
  }

  transform::DfGraphPtr ge_graph = wrap_ptr->graph_ptr_;
  if (nullptr == ge_graph) {
    MS_LOG(ERROR) << "The export graph is null";
    return;
  }

  (void)ge_graph->SaveToFile(file_name);

  MS_LOG(DEBUG) << "ExportGraph End";
 }
 }  // namespace pipeline
 }  // namespace mindspore
--- a/mindspore/ccsrc/pipeline/pipeline_ge.h
+++ b/mindspore/ccsrc/pipeline/pipeline_ge.h
@@ -0,0 +1,57 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PIPELINE_PIPELINE_GE_H_
 #define MINDSPORE_CCSRC_PIPELINE_PIPELINE_GE_H_

 #include <vector>
 #include <utility>
 #include <string>
 #include <memory>
 #include <unordered_map>
 #include <map>
 #include <mutex>

 #include "pybind11/pybind11.h"
 #include "pipeline/base.h"
 #include "operator/ops.h"

 namespace mindspore {
 namespace pipeline {

 namespace py = pybind11;

 // Stores `options` as the GE initialize options held by ConfigManager.
 void SetGeOption(const std::map<std::string, std::string>& options);

 // Converts `init_params` to tensors and runs the graph named `phase` with them. When the parallel strategy is
 // DISTRIBUTION and a broadcast graph is registered, that graph is run afterwards with the same inputs.
 void RunGEInitGraph(const py::dict& init_params, const std::string& phase);

 // Executes the graph recorded for `phase` in `info` with `args` and returns the result as a Python object.
 // Phases whose prefix is "save" run without inputs and return None.
 py::object ExecDFGraph(const std::map<std::string, ExecutorInfoPtr>& info, const py::tuple& args,
                        const std::string& phase = "train");

 // Converts the ANF graph recorded for `phase` into GE graphs, registers them with DfGraphManager and sets up
 // the GE session / graph runner. Returns the ANF graph, or nullptr when conversion or session setup fails.
 FuncGraphPtr BuildDFGraph(const std::map<std::string, ExecutorInfoPtr>& info, const py::dict& init_params,
                           const std::string& phase, const py::object& broadcast_params = {});

 // init and exec dataset sub graph for GE backend
 // Returns false when building the dataset graph or creating the session / runner fails.
 bool InitExecDatasetGe(const std::string& queue_name, int64_t size, int64_t batch_size,
                        const std::vector<TypePtr>& types, const std::vector<std::vector<int64_t>>& shapes,
                        const std::vector<int64_t>& input_indexes, const std::string& phase);

 // Saves the GE graph registered under `phase` to `file_name`; missing graphs are only logged.
 void ExportDFGraph(const std::string& file_name, const std::string& phase);

 }  // namespace pipeline
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_PIPELINE_PIPELINE_GE_H_
--- a/mindspore/ccsrc/pipeline/resource.cc
+++ b/mindspore/ccsrc/pipeline/resource.cc
@@ -25,19 +25,13 @@
 #include "pipeline/parse/data_converter.h"
 #include "operator/ops.h"
 #include "utils/graph_utils.h"
 #include "transform/convert.h"
 #include "optimizer/ad/dfunctor.h"
 #include "vm/segment_runner.h"
 #include "utils/context/ms_context.h"
 #include "transform/df_graph_manager.h"
 #include "device/kernel_runtime_manager.h"

 namespace mindspore {
 // namespace to support opmap definition
 namespace pipeline {

 using MethodMap = std::unordered_map<int, std::unordered_map<std::string, Any>>;

 MethodMap& GetMethodMap() {
  static MethodMap method_map = {{kObjectTypeString,
                                  {
@@ -255,28 +249,5 @@ void Resource::Clean() {
  trace::ClearTraceStack();
  is_cleaned_ = true;
 }

 // Finalizes GE and then closes TSD through the MsContext singleton, passing `true` to both calls.
 // Does nothing when no context instance exists; the results of both calls are ignored.
 void ReleaseGeTsd() {
  auto ms_context = MsContext::GetInstance();
  if (ms_context == nullptr) {
    return;
  }
  (void)ms_context->FinalizeGe(true);
  (void)ms_context->CloseTsd(true);
 }

 // Releases process-wide pipeline state. The calls run in the order written and end with the GE/TSD shutdown.
 // NOTE(review): the name suggests this is registered to run at interpreter exit — confirm at the call site.
 void ClearResAtexit() {
  MS_LOG(DEBUG) << "pipeline clear all resource";
  // Runtime resources held by the kernel runtime manager.
  device::KernelRuntimeManager::Instance().ClearRuntimeResource();
  // Graphs registered with the DfGraphManager.
  transform::DfGraphManager::GetInstance().ClearGraph();
  ad::g_k_prims.clear();

  // Global caches and registries.
  abstract::ClearPrimEvaluatorMap();
  compile::ClearConvertCache();
  transform::DfGraphConvertor::get_adpt_map().clear();
  pipeline::GetMethodMap().clear();
  pipeline::ExecutorPy::ClearRes();

  // Last step: finalize GE and close TSD.
  ReleaseGeTsd();
 }
 }  // namespace pipeline
 }  // namespace mindspore
--- a/mindspore/ccsrc/pipeline/resource.h
+++ b/mindspore/ccsrc/pipeline/resource.h
@@ -44,6 +44,10 @@ const char kOutput[] = "output";

 class InferenceResource;

 // Two-level lookup table: an integer key selects a table that maps a name to an Any value.
 using MethodMap = std::unordered_map<int, std::unordered_map<std::string, Any>>;

 // Returns a mutable reference to the method map, so callers can read it or clear it.
 MethodMap& GetMethodMap();

 class ResourceBase {
 public:
  ResourceBase() { manager_ = MakeManager(); }
@@ -110,9 +114,6 @@ class Resource : public ResourceBase {

 using ResourcePtr = std::shared_ptr<pipeline::Resource>;

 void ClearResAtexit();
 void ReleaseGeTsd();

 }  // namespace pipeline
 }  // namespace mindspore

--- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc
@@ -21,7 +21,7 @@
 #include "pre_activate/ascend/ir_fission/bn_grad_split.h"
 #include "pre_activate/ascend/ir_fusion/fused_batch_norm_fusion.h"
 #include "pre_activate/ascend/ir_fission/layer_norm_grad_split.h"
 #include "pre_activate/ascend/ir_fusion/allreduce_fusion.h"
 #include "pre_activate/common/ir_fusion/allreduce_fusion.h"
 #include "pre_activate/ascend/ir_fusion/square_sum_fusion.h"
 #include "pre_activate/ascend/ir_fusion/clip_by_norm_no_div_square_sum_fusion.h"
 #include "pre_activate/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.h"
--- a/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/buffer_fusion.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/buffer_fusion/buffer_fusion.cc
@@ -237,11 +237,11 @@ CNodePtr CreateFusionOp(const std::vector<AnfNodePtr> &inputs_list, const std::v

  std::vector<std::string> input_names;
  for (uint8_t i = 0; i < inputs_list.size(); i++) {
    input_names.emplace_back("input" + to_string(i));
    input_names.emplace_back("input" + std::to_string(i));
  }
  std::vector<std::string> output_names;
  for (uint8_t i = 0; i < outputs_list.size(); i++) {
    output_names.emplace_back("output" + to_string(i));
    output_names.emplace_back("output" + std::to_string(i));
  }

  ValuePtr input_names_v = MakeValue(input_names);
--- a/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.cc
+++ b/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.cc
@@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "pre_activate/ascend/ir_fusion/allreduce_fusion.h"
 #include "pre_activate/common/ir_fusion/allreduce_fusion.h"

 #include <vector>
 #include <string>
--- a/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.h
+++ b/mindspore/ccsrc/pre_activate/common/ir_fusion/allreduce_fusion.h
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_ALLREDUCE_FUSION_H_
 #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_ALLREDUCE_FUSION_H_
 #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_IR_FUSION_ALLREDUCE_FUSION_H_
 #define MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_IR_FUSION_ALLREDUCE_FUSION_H_
 #include <vector>

 #include "pre_activate/common/pass.h"
@@ -46,4 +46,4 @@ class AllReduceFusion : public Pass {
 };
 }  // namespace opt
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_IR_FUSION_ALLREDUCE_FUSION_H_
 #endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_COMMON_IR_FUSION_ALLREDUCE_FUSION_H_
--- a/mindspore/ccsrc/predict/converter/kernel2ms.cc
+++ b/mindspore/ccsrc/predict/converter/kernel2ms.cc
@@ -16,7 +16,7 @@

 #include "predict/converter/kernel2ms.h"
 #include <algorithm>
 #include "transform/convert.h"
 #include "ir/anf.h"
 #include "predict/converter/lite_model/op_attr_packer.h"
 #include "mindspore/ccsrc/operator/ops.h"

@@ -135,7 +135,7 @@ void Kernel2Ms::GetRealInpoutsPtr(const AnfNodePtr &node, std::vector<AnfNodePtr
  if (node->isa<CNode>()) {
    auto c_node = node->cast<CNodePtr>();
    MS_EXCEPTION_IF_NULL(c_node);
    std::string c_node_name = transform::GetCNodeFuncName(c_node);
    std::string c_node_name = GetCNodeFuncName(c_node);
    if (c_node_name == prim::kPrimTupleGetItem->name()) {
      auto v_node = c_node->inputs()[kTupleGetItemIndex]->cast<ValueNodePtr>();
      MS_EXCEPTION_IF_NULL(v_node);
@@ -321,7 +321,7 @@ bool Kernel2Ms::SetGraphInputTensors(const KernelGraphPtr &kernel_graph_ptr, con
  }
  for (const auto &input_node : kernel_graph_ptr->inputs()) {
    if (input_node->isa<Parameter>()) {
      ParameterPtr pk_node = dynamic_pointer_cast<Parameter>(input_node);
      ParameterPtr pk_node = std::dynamic_pointer_cast<Parameter>(input_node);
      TensorPtr device_tensor;
      if (convert_mode_ == kConvertCpuMode) {
        device_tensor = predict::utils::GetParaCpuTensor(input_node);
--- a/mindspore/ccsrc/pynative/base.h
+++ b/mindspore/ccsrc/pynative/base.h
@@ -0,0 +1,67 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PYNATIVE_BASE_H_
 #define MINDSPORE_CCSRC_PYNATIVE_BASE_H_

 #include <vector>
 #include <utility>
 #include <string>
 #include <memory>
 #include <unordered_map>
 #include <unordered_set>

 #include "pybind11/pybind11.h"
 #include "ir/primitive.h"
 #include "pipeline/static_analysis/abstract_value.h"

 namespace mindspore {
 namespace pynative {

 namespace py = pybind11;

 // Status codes for pynative execution. Zero means success, values 1-7 name specific failures and 0xFF marks
 // an unknown state.
 enum PynativeStatusCode {
  PYNATIVE_SUCCESS = 0,
  PYNATIVE_OP_NOT_IMPLEMENTED_ERR = 1,
  PYNATIVE_OP_INPUTS_ERR = 2,
  PYNATIVE_OP_PARAMS_ERR = 3,
  PYNATIVE_OP_ATTRS_ERR = 4,
  PYNATIVE_GRAPH_MANAGER_ERR = 5,
  PYNATIVE_GRAPH_GE_BUILD_ERR = 6,
  PYNATIVE_GRAPH_GE_RUN_ERR = 7,
  PYNATIVE_UNKNOWN_STATE = 0XFF
 };

 // Consecutive indices starting at 0; PY_ARGS_NUM (4) equals the number of preceding entries.
 // NOTE(review): presumably these index the positional arguments of a run-op call — confirm at the call site.
 enum RunOpArgsEnum { PY_PRIM = 0, PY_NAME, PY_INPUTS, PY_INPUT_MASK, PY_ARGS_NUM };

 // Everything recorded about a single operator execution in pynative mode.
 struct OpExecInfo {
  // Python-side primitive of the operator.
  PrimitivePyPtr py_primitive;
  // Name of the operator.
  std::string op_name;
  // Abstract value attached to the operator. NOTE(review): presumably the inferred output — confirm.
  AbstractBasePtr abstract;

  // Operator inputs as passed from Python.
  py::tuple op_inputs;
  // One mask entry per input. NOTE(review): the meaning of the mask values is not visible here — confirm.
  py::tuple inputs_mask;
  // Operator attributes as a Python dict.
  py::dict op_attrs;
 };
 // Shared-ownership handle to an OpExecInfo.
 using OpExecInfoPtr = std::shared_ptr<OpExecInfo>;
 // Builds an OpExecInfo from the Python call arguments; defined outside this header.
 OpExecInfoPtr GenerateOpExecInfo(const py::args& args);

 // Primitive names in this set are treated specially with respect to inference.
 // NOTE(review): presumably inference is skipped for them — confirm where the set is consulted.
 const std::unordered_set<std::string> ignore_infer_prim = {"partial"};

 }  // namespace pynative
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_PYNATIVE_BASE_H_
--- a/mindspore/ccsrc/pynative/pynative_execute.cc
+++ b/mindspore/ccsrc/pynative/pynative_execute.cc
@@ -29,16 +29,18 @@
 #include "pipeline/static_analysis/prim.h"
 #include "session/session_factory.h"

 #include "pynative/base.h"

 #ifdef ENABLE_GE
 #include "pynative/pynative_execute_ge.h"
 #endif

 const char SINGLE_OP_GRAPH[] = "single_op_graph";
 // primitive unable to infer value for constant input in pynative mode
 const std::unordered_set<std::string> ignore_infer_prim = {"partial"};
 const std::unordered_set<std::string> vm_operators = {"partial", "depend"};

 namespace mindspore {
 namespace pynative {
 using transform::GraphRunner;
 using transform::GraphRunnerOptions;
 using transform::OperatorPtr;
 inline ValuePtr PyAttrValue(const py::object& obj) {
  ValuePtr converted_ret = nullptr;
  bool converted = parse::ConvertData(obj, &converted_ret);
@@ -48,32 +50,12 @@ inline ValuePtr PyAttrValue(const py::object& obj) {
  return converted_ret;
 }

 MeTensorPtr ConvertPyObjToTensor(const py::object& obj) {
  MeTensorPtr me_tensor_ptr = nullptr;
  if (py::isinstance<MeTensor>(obj)) {
    me_tensor_ptr = py::cast<MeTensorPtr>(obj);
  } else if (py::isinstance<py::tuple>(obj)) {
    me_tensor_ptr = std::make_shared<MeTensor>(py::cast<py::tuple>(obj), nullptr);
  } else if (py::isinstance<py::float_>(obj)) {
    me_tensor_ptr = std::make_shared<MeTensor>(py::cast<py::float_>(obj), nullptr);
  } else if (py::isinstance<py::int_>(obj)) {
    me_tensor_ptr = std::make_shared<MeTensor>(py::cast<py::int_>(obj), nullptr);
  } else if (py::isinstance<py::list>(obj)) {
    me_tensor_ptr = std::make_shared<MeTensor>(py::cast<py::list>(obj), nullptr);
  } else if (py::isinstance<py::array>(obj)) {
    me_tensor_ptr = std::make_shared<MeTensor>(py::cast<py::array>(obj), nullptr);
  } else {
    MS_LOG(EXCEPTION) << "run op inputs type is invalid!";
  }
  return me_tensor_ptr;
 }

 void PynativeInfer(const PrimitivePyPtr& prim, const py::tuple& py_args, OpExecInfo* const op_exec_info) {
  size_t size = py_args.size();
  AbstractBasePtrList args_spec_list;
  for (size_t i = 0; i < size; i++) {
    ValuePtr input_value = PyAttrValue(py_args[i]);
    if (py::isinstance<MeTensor>(py_args[i])) {
    if (py::isinstance<tensor::Tensor>(py_args[i])) {
      args_spec_list.emplace_back(abstract::FromValueInside(input_value, true));
    } else {
      args_spec_list.emplace_back(abstract::FromValueInside(input_value, false));
@@ -140,241 +122,6 @@ std::string GetSingleOpGraphInfo(const OpExecInfoPtr& op_exec_info) {
  return graph_info;
 }

 bool SetInputsForSingleOpGraph(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs,
                               const OperatorPtr& op, std::vector<GeOperator>* graph_input_nodes) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  MS_EXCEPTION_IF_NULL(graph_input_nodes);
  auto op_inputs = op_exec_info->op_inputs;
  std::string op_name = op_exec_info->op_name;
  transform::OpAdapterPtr adapter = transform::DfGraphConvertor::FindAdapter(op_name, true);
  if (adapter == nullptr) {
    return false;
  }

  int op_input_idx = 1;
  size_t size = inputs.size();
  for (size_t i = 0; i < size; i++) {
    if (inputs[i] == nullptr) {
      continue;
    }
    auto const_op = std::make_shared<transform::Constant>();
    MS_EXCEPTION_IF_NULL(const_op);
    (void)const_op->set_attr_value(*inputs[i]);
    MeTensorPtr me_tensor_ptr = ConvertPyObjToTensor(op_inputs[i]);
    MS_EXCEPTION_IF_NULL(me_tensor_ptr);
    auto const_op_desc =
      transform::TransformUtil::GetGeTensorDesc(me_tensor_ptr->shape_c(), me_tensor_ptr->data_type(), kOpFormat_NCHW);
    if (const_op_desc == nullptr) {
      MS_LOG(ERROR) << "Create variable " << op_name << " ouptut descriptor failed!";
      return false;
    }
    auto pointer_cast_const_op = std::static_pointer_cast<transform::Constant>(const_op);
    MS_EXCEPTION_IF_NULL(pointer_cast_const_op);
    (void)pointer_cast_const_op->update_output_desc_y(*const_op_desc);
    auto& input_map = adapter->getInputMap();
    if (input_map.find(op_input_idx) == input_map.end()) {
      continue;
    }
    if (adapter->setInput(op, op_input_idx++, const_op)) {
      MS_LOG(ERROR) << "fail to set params, index is " << op_input_idx;
      return false;
    }
    graph_input_nodes->push_back(*const_op);
  }
  return true;
 }

 bool BuildSingleOpGraph(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs,
                        const std::unordered_map<std::string, ValuePtr>& attrs, const GeGraphPtr& graph) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  std::string op_name = op_exec_info->op_name;
  auto op_inputs = op_exec_info->op_inputs;
  transform::OpAdapterPtr adapter = transform::DfGraphConvertor::FindAdapter(op_name, true);
  if (adapter == nullptr) {
    MS_LOG(ERROR) << "Unable to find Adapter for " << ((std::string)py::str(op_name));
    return false;
  }
  OperatorPtr op = adapter->generate(op_name);
  MS_EXCEPTION_IF_NULL(op);

  std::vector<GeOperator> graph_input_nodes;
  // hold param nodes after setting input and output for the graph
  // set input
  if (!SetInputsForSingleOpGraph(op_exec_info, inputs, op, &graph_input_nodes)) {
    return false;
  }
  // set attributes
  for (auto attr : attrs) {
    (void)adapter->setAttr(op, attr.first, attr.second);
  }
  // set default attributes
  auto extra_attrs = adapter->GetExtraAttr();
  for (auto attr : extra_attrs) {
    (void)adapter->setAttr(op, attr.first, attr.second);
  }
  // set input attributes
  auto& input_attr_map = adapter->getInputAttrMap();
  for (auto& it : input_attr_map) {
    if (op_inputs.size() < it.first) {
      continue;
    }
    auto const_value = PyAttrValue(op_inputs[it.first - 1]);
    if (const_value->isa<None>()) {
      continue;
    }
    it.second.set_attr(op, const_value);
  }
  // construct output data nodes
  std::vector<GeOperator> graph_outputs{*op};
  // set input and output nodes for the graph
  MS_EXCEPTION_IF_NULL(graph);
  (void)graph->SetInputs(graph_input_nodes).SetOutputs(graph_outputs);
  MS_LOG(INFO) << "BuildSingleOpGraph done";
  return true;
 }

 void ToTensorPtr(const OpExecInfoPtr op_exec_info, std::vector<GeTensorPtr>* const inputs) {
  MS_EXCEPTION_IF_NULL(inputs);
  MS_EXCEPTION_IF_NULL(op_exec_info);
  auto op_inputs = op_exec_info->op_inputs;
  size_t size = op_inputs.size();
  for (size_t i = 0; i < size; i++) {
    if (py::isinstance<py::none>(op_inputs[i])) {
      inputs->emplace_back(nullptr);
      continue;
    }
    MeTensorPtr me_tensor_ptr = ConvertPyObjToTensor(op_inputs[i]);
    auto ge_tensor_ptr = transform::TransformUtil::ConvertTensor(me_tensor_ptr, kOpFormat_NCHW);
    if (ge_tensor_ptr == nullptr) {
      MS_LOG(EXCEPTION) << "convert inputs to GE tensor failed in op " << op_exec_info->op_name << ".";
    }
    // set inputs for operator to build single node graph
    inputs->push_back(ge_tensor_ptr);
  }
 }

 PynativeStatusCode ConvertAttributes(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  auto op_attrs = op_exec_info->op_attrs;
  std::unordered_map<std::string, ValuePtr> attrs{};

  for (auto& item : op_attrs) {
    if (!py::isinstance<py::str>(item.first)) {
      MS_LOG(ERROR) << "type error in py dict convert";
      return PYNATIVE_OP_ATTRS_ERR;
    }
    std::string name = py::cast<std::string>(item.first);
    auto attr_value = PyAttrValue(py::cast<py::object>(item.second));
    (void)attrs.emplace(name, attr_value);
  }

  // build graph
  GeGraphPtr graph = std::make_shared<GeGraph>(op_exec_info->op_name);
  if (BuildSingleOpGraph(op_exec_info, inputs, attrs, graph) == false) {
    MS_LOG(ERROR) << "Fail to BuildSingleOpGraph";
    return PYNATIVE_GRAPH_GE_BUILD_ERR;
  }

  // add the single op graph into the graph manager, which will be iterated by session.
  transform::Status ret =
    transform::DfGraphManager::GetInstance().AddGraph(SINGLE_OP_GRAPH, std::shared_ptr<transform::DfGraph>(graph));
  if (ret != transform::SUCCESS) {
    MS_LOG(ERROR) << "Fail to AddGraph into graph manager";
    return PYNATIVE_GRAPH_MANAGER_ERR;
  }

  return PYNATIVE_SUCCESS;
 }

 std::vector<MeTensorPtr> ConvertOutputTensors(const OpExecInfoPtr& op_exec_info,
                                              const std::vector<GeTensorPtr>& ge_tensors) {
  std::vector<MeTensorPtr> outputs;
  AbstractBasePtr abs_base = op_exec_info->abstract;
  std::vector<std::vector<int>> shapes;
  if (abs_base != nullptr && abs_base->isa<abstract::AbstractTensor>()) {
    auto arg_tensor = dyn_cast<abstract::AbstractTensor>(abs_base);
    shapes.emplace_back(arg_tensor->shape()->shape());
    outputs = transform::TransformUtil::ConvertGeTensors(ge_tensors, shapes);
    return outputs;
  }
  if (abs_base != nullptr && abs_base->isa<abstract::AbstractTuple>()) {
    auto arg_tuple = dyn_cast<abstract::AbstractTuple>(abs_base);
    size_t len = arg_tuple->size();

    for (size_t i = 0; i < len; i++) {
      if (arg_tuple->elements()[i]->isa<abstract::AbstractTensor>()) {
        auto arg_tensor = dyn_cast<abstract::AbstractTensor>(arg_tuple->elements()[i]);
        shapes.emplace_back(arg_tensor->shape()->shape());
      }
    }
    outputs = transform::TransformUtil::ConvertGeTensors(ge_tensors, shapes);
    return outputs;
  }
  for (auto& it : ge_tensors) {
    auto tensor = transform::TransformUtil::ConvertGeTensor(it);
    if (tensor != nullptr) {
      outputs.emplace_back(tensor);
    }
  }
  return outputs;
 }

 py::object RunOpInGE(const OpExecInfoPtr& op_exec_info, PynativeStatusCode* status) {
  MS_LOG(INFO) << "RunOpInGe start";
  MS_EXCEPTION_IF_NULL(op_exec_info);
  MS_EXCEPTION_IF_NULL(status);

  // returns a null py::tuple on error
  py::tuple err_ret(0);
  auto op_name = op_exec_info->op_name;
  transform::OpAdapterPtr adapter = transform::DfGraphConvertor::FindAdapter(op_name, true);
  if (adapter == nullptr) {
    MS_LOG(ERROR) << "Unable to find GE Adapter for " << ((std::string)py::str(op_name));
    *status = PYNATIVE_OP_NOT_IMPLEMENTED_ERR;
    return std::move(err_ret);
  }

  std::vector<GeTensorPtr> inputs{};
  ToTensorPtr(op_exec_info, &inputs);
  // convert me attr to ge AttrValue
  PynativeStatusCode ret = ConvertAttributes(op_exec_info, inputs);
  if (ret != PYNATIVE_SUCCESS) {
    *status = ret;
    return std::move(err_ret);
  }
  // run graph
  transform::RunOptions run_options;
  run_options.name = SINGLE_OP_GRAPH;
  std::vector<GeTensorPtr> ge_inputs;
  std::vector<GeTensorPtr> ge_outputs;
  transform::GraphRunnerOptions graph_runner_options;
  graph_runner_options.options["ge.trainFlag"] = "1";
  auto graph_runner = std::make_shared<transform::GraphRunner>(graph_runner_options);
  transform::Status run_ret;
  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    run_ret = graph_runner->RunGraph(run_options, ge_inputs, &ge_outputs);
  }
  if (run_ret != transform::Status::SUCCESS) {
    MS_LOG(ERROR) << "GraphRunner Fails to Run Graph";
    *status = PYNATIVE_GRAPH_GE_RUN_ERR;
    return std::move(err_ret);
  }

  std::vector<MeTensorPtr> graph_outputs = ConvertOutputTensors(op_exec_info, ge_outputs);
  size_t output_size = graph_outputs.size();
  py::tuple result(output_size);
  for (size_t i = 0; i < output_size; i++) {
    MS_EXCEPTION_IF_NULL(graph_outputs[i]);
    result[i] = *graph_outputs[i];
  }

  *status = PYNATIVE_SUCCESS;
  MS_LOG(INFO) << "RunOpInGe end";
  return std::move(result);
 }

 py::object RunOpInVM(const OpExecInfoPtr& op_exec_info, PynativeStatusCode* status) {
  MS_LOG(INFO) << "RunOpInVM start";

@@ -423,12 +170,6 @@ py::object RunOpWithBackendPolicy(MsBackendPolicy backend_policy, const OpExecIn
  MS_EXCEPTION_IF_NULL(status);
  py::object result;
  switch (backend_policy) {
    case kMsBackendGeOnly: {
      // use GE only
      MS_LOG(INFO) << "RunOp use GE only backend";
      result = RunOpInGE(op_exec_info, status);
      break;
    }
    case kMsBackendVmOnly: {
      // use vm only
      MS_LOG(INFO) << "RunOp use VM only backend";
@@ -436,22 +177,14 @@ py::object RunOpWithBackendPolicy(MsBackendPolicy backend_policy, const OpExecIn
      break;
    }
    case kMsBackendGePrior: {
 #ifdef ENABLE_GE
      // use GE first, use vm when GE fails
      MS_LOG(INFO) << "RunOp use GE first backend";
      result = RunOpInGE(op_exec_info, status);
      if (*status != PYNATIVE_SUCCESS) {
        result = RunOpInVM(op_exec_info, status);
      }
      break;
    }
    case kMsBackendVmPrior: {
      // GE_VM_SILENT
      // (should not use this policy) use vm first, use GE when vm fails
      MS_LOG(INFO) << "RunOp use VM first backend";
      result = RunOpInVM(op_exec_info, status);
      if (*status != PYNATIVE_SUCCESS) {
        result = RunOpInGE(op_exec_info, status);
      }
 #endif
      break;
    }
    case kMsBackendMsPrior: {
--- a/mindspore/ccsrc/pynative/pynative_execute.h
+++ b/mindspore/ccsrc/pynative/pynative_execute.h
@@ -25,55 +25,14 @@

 #include "pybind11/pybind11.h"

 #include "transform/convert.h"
 #include "transform/graph_runner.h"
 #include "transform/types.h"
 #include "pynative/base.h"
 #include "utils/context/ms_context.h"

 namespace mindspore {
 namespace pynative {

 using MeTensor = mindspore::tensor::Tensor;
 using MeTensorPtr = mindspore::tensor::TensorPtr;
 using GeTensor = ge::Tensor;
 using GeTensorPtr = std::shared_ptr<GeTensor>;
 using GeGraph = ge::Graph;
 using GeGraphPtr = std::shared_ptr<GeGraph>;
 using GeOperator = ge::Operator;
 using GeOperatorPtr = std::shared_ptr<GeOperator>;

 namespace py = pybind11;

 enum PynativeStatusCode {
  PYNATIVE_SUCCESS = 0,
  PYNATIVE_OP_NOT_IMPLEMENTED_ERR = 1,
  PYNATIVE_OP_INPUTS_ERR = 2,
  PYNATIVE_OP_PARAMS_ERR = 3,
  PYNATIVE_OP_ATTRS_ERR = 4,
  PYNATIVE_GRAPH_MANAGER_ERR = 5,
  PYNATIVE_GRAPH_GE_BUILD_ERR = 6,
  PYNATIVE_GRAPH_GE_RUN_ERR = 7,
  PYNATIVE_UNKNOWN_STATE = 0XFF
 };

 enum RunOpArgsEnum { PY_PRIM = 0, PY_NAME, PY_INPUTS, PY_INPUT_MASK, PY_ARGS_NUM };

 struct OpExecInfo {
  PrimitivePyPtr py_primitive;
  std::string op_name;
  AbstractBasePtr abstract;

  py::tuple op_inputs;
  py::tuple inputs_mask;
  py::dict op_attrs;
 };
 using OpExecInfoPtr = std::shared_ptr<OpExecInfo>;
 OpExecInfoPtr GenerateOpExecInfo(const py::args& args);
 bool BuildSingleOpGraph(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs,
                        const std::unordered_map<std::string, ValuePtr>& attrs, const GeGraphPtr& graph);

 py::object RunOpInGE(const OpExecInfoPtr& op_exec_info, PynativeStatusCode* status);

 py::object RunOpInVM(const OpExecInfoPtr& op_exec_info, PynativeStatusCode* status);

 py::tuple RunOp(const py::args& args);
--- a/mindspore/ccsrc/pynative/pynative_execute_ge.cc
+++ b/mindspore/ccsrc/pynative/pynative_execute_ge.cc
@@ -0,0 +1,311 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "pynative/pynative_execute_ge.h"

 #include <typeinfo>
 #include <map>
 #include <set>
 #include <unordered_set>

 #include "utils/any.h"
 #include "utils/utils.h"
 #include "utils/context/ms_context.h"
 #include "operator/ops.h"
 #include "pipeline/parse/data_converter.h"
 #include "pipeline/static_analysis/prim.h"
 #include "session/session_factory.h"

 // Name under which the single-op graph is added to the DfGraphManager and
 // which is later passed as RunOptions::name when the graph is run.
 const char SINGLE_OP_GRAPH[] = "single_op_graph";

 namespace mindspore {
 namespace pynative {

 using MeTensor = mindspore::tensor::Tensor;
 using MeTensorPtr = mindspore::tensor::TensorPtr;
 using GeOperator = ge::Operator;
 using GeOperatorPtr = std::shared_ptr<GeOperator>;

 using transform::GraphRunner;
 using transform::GraphRunnerOptions;
 using transform::OperatorPtr;
 static std::shared_ptr<session::SessionBasic> session = nullptr;
 // Converts a Python object into a ValuePtr through parse::ConvertData.
 // A failed conversion is reported with MS_LOG(EXCEPTION).
 inline ValuePtr PyAttrValue(const py::object& obj) {
  ValuePtr value = nullptr;
  const bool ok = parse::ConvertData(obj, &value);
  if (ok) {
    return value;
  }
  MS_LOG(EXCEPTION) << "attribute convert error with type:" << std::string(py::str(obj));
  return value;
 }

 // Produces a MeTensor for a Python op input.
 // An existing tensor object is cast directly; tuple, float, int, list and
 // numpy array inputs are wrapped in a newly constructed MeTensor. Any other
 // Python type is reported with MS_LOG(EXCEPTION). The checks run in the
 // order listed, first match wins.
 MeTensorPtr ConvertPyObjToTensor(const py::object& obj) {
  if (py::isinstance<MeTensor>(obj)) {
    return py::cast<MeTensorPtr>(obj);
  }
  if (py::isinstance<py::tuple>(obj)) {
    return std::make_shared<MeTensor>(py::cast<py::tuple>(obj), nullptr);
  }
  if (py::isinstance<py::float_>(obj)) {
    return std::make_shared<MeTensor>(py::cast<py::float_>(obj), nullptr);
  }
  if (py::isinstance<py::int_>(obj)) {
    return std::make_shared<MeTensor>(py::cast<py::int_>(obj), nullptr);
  }
  if (py::isinstance<py::list>(obj)) {
    return std::make_shared<MeTensor>(py::cast<py::list>(obj), nullptr);
  }
  if (py::isinstance<py::array>(obj)) {
    return std::make_shared<MeTensor>(py::cast<py::array>(obj), nullptr);
  }
  MS_LOG(EXCEPTION) << "run op inputs type is invalid!";
  return nullptr;
 }

 bool SetInputsForSingleOpGraph(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs,
                               const OperatorPtr& op, std::vector<GeOperator>* graph_input_nodes) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  MS_EXCEPTION_IF_NULL(graph_input_nodes);
  auto op_inputs = op_exec_info->op_inputs;
  std::string op_name = op_exec_info->op_name;
  transform::OpAdapterPtr adapter = transform::DfGraphConvertor::FindAdapter(op_name, true);
  if (adapter == nullptr) {
    return false;
  }

  int op_input_idx = 1;
  size_t size = inputs.size();
  for (size_t i = 0; i < size; i++) {
    if (inputs[i] == nullptr) {
      continue;
    }
    auto const_op = std::make_shared<transform::Constant>();
    MS_EXCEPTION_IF_NULL(const_op);
    (void)const_op->set_attr_value(*inputs[i]);
    MeTensorPtr me_tensor_ptr = ConvertPyObjToTensor(op_inputs[i]);
    MS_EXCEPTION_IF_NULL(me_tensor_ptr);
    auto const_op_desc =
      transform::TransformUtil::GetGeTensorDesc(me_tensor_ptr->shape_c(), me_tensor_ptr->data_type(), kOpFormat_NCHW);
    if (const_op_desc == nullptr) {
      MS_LOG(ERROR) << "Create variable " << op_name << " ouptut descriptor failed!";
      return false;
    }
    auto pointer_cast_const_op = std::static_pointer_cast<transform::Constant>(const_op);
    MS_EXCEPTION_IF_NULL(pointer_cast_const_op);
    (void)pointer_cast_const_op->update_output_desc_y(*const_op_desc);
    auto& input_map = adapter->getInputMap();
    if (input_map.find(op_input_idx) == input_map.end()) {
      continue;
    }
    if (adapter->setInput(op, op_input_idx++, const_op)) {
      MS_LOG(ERROR) << "fail to set params, index is " << op_input_idx;
      return false;
    }
    graph_input_nodes->push_back(*const_op);
  }
  return true;
 }

 bool BuildSingleOpGraph(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs,
                        const std::unordered_map<std::string, ValuePtr>& attrs, const GeGraphPtr& graph) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  std::string op_name = op_exec_info->op_name;
  auto op_inputs = op_exec_info->op_inputs;
  transform::OpAdapterPtr adapter = transform::DfGraphConvertor::FindAdapter(op_name, true);
  if (adapter == nullptr) {
    MS_LOG(ERROR) << "Unable to find Adapter for " << ((std::string)py::str(op_name));
    return false;
  }
  OperatorPtr op = adapter->generate(op_name);
  MS_EXCEPTION_IF_NULL(op);

  std::vector<GeOperator> graph_input_nodes;
  // hold param nodes after setting input and output for the graph
  // set input
  if (!SetInputsForSingleOpGraph(op_exec_info, inputs, op, &graph_input_nodes)) {
    return false;
  }
  // set attributes
  for (auto attr : attrs) {
    (void)adapter->setAttr(op, attr.first, attr.second);
  }
  // set default attributes
  auto extra_attrs = adapter->GetExtraAttr();
  for (auto attr : extra_attrs) {
    (void)adapter->setAttr(op, attr.first, attr.second);
  }
  // set input attributes
  auto& input_attr_map = adapter->getInputAttrMap();
  for (auto& it : input_attr_map) {
    if (op_inputs.size() < it.first) {
      continue;
    }
    auto const_value = PyAttrValue(op_inputs[it.first - 1]);
    if (const_value->isa<None>()) {
      continue;
    }
    it.second.set_attr(op, const_value);
  }
  // construct output data nodes
  std::vector<GeOperator> graph_outputs{*op};
  // set input and output nodes for the graph
  MS_EXCEPTION_IF_NULL(graph);
  (void)graph->SetInputs(graph_input_nodes).SetOutputs(graph_outputs);
  MS_LOG(INFO) << "BuildSingleOpGraph done";
  return true;
 }

 void ToTensorPtr(const OpExecInfoPtr op_exec_info, std::vector<GeTensorPtr>* const inputs) {
  MS_EXCEPTION_IF_NULL(inputs);
  MS_EXCEPTION_IF_NULL(op_exec_info);
  auto op_inputs = op_exec_info->op_inputs;
  size_t size = op_inputs.size();
  for (size_t i = 0; i < size; i++) {
    if (py::isinstance<py::none>(op_inputs[i])) {
      inputs->emplace_back(nullptr);
      continue;
    }
    MeTensorPtr me_tensor_ptr = ConvertPyObjToTensor(op_inputs[i]);
    auto ge_tensor_ptr = transform::TransformUtil::ConvertTensor(me_tensor_ptr, kOpFormat_NCHW);
    if (ge_tensor_ptr == nullptr) {
      MS_LOG(EXCEPTION) << "convert inputs to GE tensor failed in op " << op_exec_info->op_name << ".";
    }
    // set inputs for operator to build single node graph
    inputs->push_back(ge_tensor_ptr);
  }
 }

 // Converts the op's Python attributes, builds the single-op GE graph from
 // them and registers that graph with the DfGraphManager under SINGLE_OP_GRAPH.
 //
 // Returns PYNATIVE_OP_ATTRS_ERR for a non-string attribute key,
 // PYNATIVE_GRAPH_GE_BUILD_ERR when the graph cannot be built,
 // PYNATIVE_GRAPH_MANAGER_ERR when registration fails, PYNATIVE_SUCCESS otherwise.
 PynativeStatusCode ConvertAttributes(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs) {
  MS_EXCEPTION_IF_NULL(op_exec_info);
  std::unordered_map<std::string, ValuePtr> converted_attrs{};
  auto py_attrs = op_exec_info->op_attrs;

  for (auto& entry : py_attrs) {
    const bool key_is_str = py::isinstance<py::str>(entry.first);
    if (!key_is_str) {
      MS_LOG(ERROR) << "type error in py dict convert";
      return PYNATIVE_OP_ATTRS_ERR;
    }
    std::string attr_name = py::cast<std::string>(entry.first);
    auto attr_value = PyAttrValue(py::cast<py::object>(entry.second));
    (void)converted_attrs.emplace(attr_name, attr_value);
  }

  // build graph
  GeGraphPtr single_op_graph = std::make_shared<GeGraph>(op_exec_info->op_name);
  const bool built = BuildSingleOpGraph(op_exec_info, inputs, converted_attrs, single_op_graph);
  if (!built) {
    MS_LOG(ERROR) << "Fail to BuildSingleOpGraph";
    return PYNATIVE_GRAPH_GE_BUILD_ERR;
  }

  // add the single op graph into the graph manager, which will be iterated by session.
  auto& graph_manager = transform::DfGraphManager::GetInstance();
  transform::Status add_ret =
    graph_manager.AddGraph(SINGLE_OP_GRAPH, std::shared_ptr<transform::DfGraph>(single_op_graph));
  if (add_ret == transform::SUCCESS) {
    return PYNATIVE_SUCCESS;
  }
  MS_LOG(ERROR) << "Fail to AddGraph into graph manager";
  return PYNATIVE_GRAPH_MANAGER_ERR;
 }

 std::vector<MeTensorPtr> ConvertOutputTensors(const OpExecInfoPtr& op_exec_info,
                                              const std::vector<GeTensorPtr>& ge_tensors) {
  std::vector<MeTensorPtr> outputs;
  AbstractBasePtr abs_base = op_exec_info->abstract;
  std::vector<std::vector<int>> shapes;
  if (abs_base != nullptr && abs_base->isa<abstract::AbstractTensor>()) {
    auto arg_tensor = dyn_cast<abstract::AbstractTensor>(abs_base);
    shapes.emplace_back(arg_tensor->shape()->shape());
    outputs = transform::TransformUtil::ConvertGeTensors(ge_tensors, shapes);
    return outputs;
  }
  if (abs_base != nullptr && abs_base->isa<abstract::AbstractTuple>()) {
    auto arg_tuple = dyn_cast<abstract::AbstractTuple>(abs_base);
    size_t len = arg_tuple->size();

    for (size_t i = 0; i < len; i++) {
      if (arg_tuple->elements()[i]->isa<abstract::AbstractTensor>()) {
        auto arg_tensor = dyn_cast<abstract::AbstractTensor>(arg_tuple->elements()[i]);
        shapes.emplace_back(arg_tensor->shape()->shape());
      }
    }
    outputs = transform::TransformUtil::ConvertGeTensors(ge_tensors, shapes);
    return outputs;
  }
  for (auto& it : ge_tensors) {
    auto tensor = transform::TransformUtil::ConvertGeTensor(it);
    if (tensor != nullptr) {
      outputs.emplace_back(tensor);
    }
  }
  return outputs;
 }

 // Executes one op through GE and returns its outputs as a Python tuple.
 //
 // Steps: check that a GE adapter exists for the op, convert the Python inputs
 // to GE tensors, build and register the single-op graph, run it with a
 // GraphRunner while the GIL is released, then convert the GE outputs back.
 // `*status` receives PYNATIVE_SUCCESS or the code of the step that failed;
 // on failure the returned tuple is empty.
 py::object RunOpInGE(const OpExecInfoPtr& op_exec_info, PynativeStatusCode* status) {
  MS_LOG(INFO) << "RunOpInGe start";
  MS_EXCEPTION_IF_NULL(op_exec_info);
  MS_EXCEPTION_IF_NULL(status);

  // returns a null py::tuple on error
  py::tuple empty_result(0);
  auto op_name = op_exec_info->op_name;
  transform::OpAdapterPtr adapter = transform::DfGraphConvertor::FindAdapter(op_name, true);
  if (adapter == nullptr) {
    MS_LOG(ERROR) << "Unable to find GE Adapter for " << ((std::string)py::str(op_name));
    *status = PYNATIVE_OP_NOT_IMPLEMENTED_ERR;
    return std::move(empty_result);
  }

  std::vector<GeTensorPtr> op_input_tensors{};
  ToTensorPtr(op_exec_info, &op_input_tensors);
  // convert me attr to ge AttrValue
  const PynativeStatusCode build_status = ConvertAttributes(op_exec_info, op_input_tensors);
  if (build_status != PYNATIVE_SUCCESS) {
    *status = build_status;
    return std::move(empty_result);
  }

  // run graph
  transform::RunOptions run_options;
  run_options.name = SINGLE_OP_GRAPH;
  transform::GraphRunnerOptions runner_options;
  runner_options.options["ge.trainFlag"] = "1";
  auto runner = std::make_shared<transform::GraphRunner>(runner_options);
  // The inputs are already embedded in the graph as Constant operators, so
  // the run itself is given an empty input list.
  std::vector<GeTensorPtr> run_inputs;
  std::vector<GeTensorPtr> run_outputs;
  transform::Status run_status;
  {
    // Release GIL before calling into (potentially long-running) C++ code
    py::gil_scoped_release release;
    run_status = runner->RunGraph(run_options, run_inputs, &run_outputs);
  }
  if (run_status != transform::Status::SUCCESS) {
    MS_LOG(ERROR) << "GraphRunner Fails to Run Graph";
    *status = PYNATIVE_GRAPH_GE_RUN_ERR;
    return std::move(empty_result);
  }

  std::vector<MeTensorPtr> me_outputs = ConvertOutputTensors(op_exec_info, run_outputs);
  const size_t output_num = me_outputs.size();
  py::tuple result(output_num);
  for (size_t idx = 0; idx < output_num; ++idx) {
    MS_EXCEPTION_IF_NULL(me_outputs[idx]);
    result[idx] = *me_outputs[idx];
  }

  *status = PYNATIVE_SUCCESS;
  MS_LOG(INFO) << "RunOpInGe end";
  return std::move(result);
 }
 }  // namespace pynative

 }  // namespace mindspore
--- a/mindspore/ccsrc/pynative/pynative_execute_ge.h
+++ b/mindspore/ccsrc/pynative/pynative_execute_ge.h
@@ -0,0 +1,46 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_PYNATIVE_PYNATIVE_EXECUTE_GE_H_
 #define MINDSPORE_CCSRC_PYNATIVE_PYNATIVE_EXECUTE_GE_H_

 #include <vector>
 #include <utility>
 #include <string>
 #include <memory>
 #include <unordered_map>

 #include "pynative/base.h"
 #include "transform/convert.h"
 #include "transform/graph_runner.h"
 #include "transform/types.h"
 #include "utils/context/ms_context.h"

 // Short aliases for the GE tensor and graph types used by the declarations
 // below. They are declared at global scope, so every file including this
 // header sees them outside of any namespace.
 using GeTensor = ge::Tensor;
 using GeTensorPtr = std::shared_ptr<GeTensor>;
 using GeGraph = ge::Graph;
 using GeGraphPtr = std::shared_ptr<GeGraph>;

 namespace mindspore {
 namespace pynative {
 // Builds a GE graph holding the single operator described by `op_exec_info`,
 // using `inputs` as its input tensors and `attrs` as its attributes, and
 // stores the result in `graph`. Returns false on failure.
 bool BuildSingleOpGraph(const OpExecInfoPtr& op_exec_info, const std::vector<GeTensorPtr>& inputs,
                        const std::unordered_map<std::string, ValuePtr>& attrs, const GeGraphPtr& graph);

 // Runs the op described by `op_exec_info` through GE. The outcome is written
 // to `*status`; the op outputs are returned as a Python object.
 py::object RunOpInGE(const OpExecInfoPtr& op_exec_info, PynativeStatusCode* status);
 }  // namespace pynative
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_PYNATIVE_PYNATIVE_EXECUTE_GE_H_
--- a/mindspore/ccsrc/session/ascend_session.cc
+++ b/mindspore/ccsrc/session/ascend_session.cc
@@ -35,6 +35,7 @@
 #include "pre_activate/common/helper.h"
 #include "device/kernel_runtime_manager.h"
 #include "kernel/tbe/tbe_python_funcs.h"
 #include "utils/config_manager.h"

 namespace mindspore {
 namespace session {
--- a/mindspore/ccsrc/session/gpu_session.cc
+++ b/mindspore/ccsrc/session/gpu_session.cc
@@ -19,7 +19,7 @@
 #include "device/gpu/gpu_kernel_runtime.h"
 #include "pre_activate/common/optimizer.h"
 #include "pre_activate/common/pass_manager.h"
 #include "pre_activate/ascend/ir_fusion/allreduce_fusion.h"
 #include "pre_activate/common/ir_fusion/allreduce_fusion.h"
 #include "device/kernel_runtime_manager.h"
 #include "predict/predict.h"
 #include "common/utils.h"
--- a/mindspore/ccsrc/transform/convert.cc
+++ b/mindspore/ccsrc/transform/convert.cc
@@ -381,24 +381,6 @@ std::unordered_map<std::string, OpAdapterDescPtr> &DfGraphConvertor::get_adpt_ma
 }

 // ---------------implement of DfGraphConvertor-------------
 std::string GetCNodeFuncName(const CNodePtr cnode) {
  if (cnode->inputs().empty()) {
    return "";
  }

  AnfNodePtr valuenode = cnode->input(0);
  if (valuenode->isa<ValueNode>()) {
    auto value = GetValueNode(valuenode);
    // check whether the valuenode is primitive
    if (value->isa<Primitive>()) {
      return value->cast<PrimitivePtr>()->name();
    } else {
      return value->ToString();
    }
  }
  return "";
 }

 PrimType GetCNodeFuncType(const CNodePtr cnode) {
  if (cnode->inputs().empty()) {
    return kPrimTypeUnknown;
--- a/mindspore/ccsrc/transform/convert.h
+++ b/mindspore/ccsrc/transform/convert.h
@@ -253,7 +253,6 @@ class DfGraphConvertor {
  bool distribute_ = false;
 };

 extern std::string GetCNodeFuncName(CNodePtr cnode);
 }  // namespace transform
 }  // namespace mindspore

--- a/mindspore/ccsrc/utils/callbacks.cc
+++ b/mindspore/ccsrc/utils/callbacks.cc
@@ -20,16 +20,16 @@
 #include <memory>
 #include <vector>
 #include "pybind11/pybind11.h"
 #ifdef ENABLE_GE
 #include "transform/df_graph_manager.h"
 #include "transform/util.h"
 #endif
 #include "pipeline/parse/data_converter.h"
 #include "pipeline/parse/python_adapter.h"
 #include "utils/visible.h"

 namespace mindspore {
 namespace callbacks {
 using mindspore::transform::Status;
 using mindspore::transform::TransformUtil;

 const char PYTHON_MOD_CALLBACK_MODULE[] = "mindspore.train.callback";
 const char PYTHON_FUN_PROCESS_CHECKPOINT[] = "_checkpoint_cb_for_save_op";
@@ -38,6 +38,10 @@ const char kSummary[] = "Summary";
 const char kCheckPoint[] = "Save";
 const int ONE_SHAPE = 1;

 #ifdef ENABLE_GE
 using mindspore::transform::Status;
 using mindspore::transform::TransformUtil;

 bool GetParameterShape(const FuncGraphPtr& graph, const std::string& param_name,
                       const std::shared_ptr<std::vector<int>>& shape) {
  if (graph == nullptr) {
@@ -181,6 +185,7 @@ uint32_t MS_EXPORT SummarySaveCallback(uint32_t graph_id, const std::map<std::st
  MS_LOG(DEBUG) << "End the summary save callback function.";
  return Status::SUCCESS;
 }
 #endif

 // Cache the summary callback data from ME session
 // Remove the GE module on new architecture
@@ -208,10 +213,10 @@ uint32_t MS_EXPORT SummarySaveCallback(uint32_t graph_id, const std::map<std::st
  auto bool_ret = py::cast<bool>(ret);
  if (!bool_ret) {
    MS_LOG(ERROR) << "Python checkpoint return false during callback";
    return Status::FAILED;
    return kCallbackFalied;
  }
  MS_LOG(DEBUG) << "End the summary save callback function.";
  return Status::SUCCESS;
  return kCallbackOk;
 }
 }  // namespace callbacks
 }  // namespace mindspore
--- a/mindspore/ccsrc/utils/callbacks.h
+++ b/mindspore/ccsrc/utils/callbacks.h
@@ -20,8 +20,11 @@
 #include <string>
 #include <vector>
 #include <memory>
 #include "ir/meta_tensor.h"
 #ifdef ENABLE_GE
 #include "transform/types.h"
 #include "transform/util.h"
 #endif

 namespace mindspore {
 namespace callbacks {
@@ -36,10 +39,16 @@ extern const char kSummary[];
 extern const char kCheckPoint[];
 extern const std::string kPythonCheckpointModuleName;
 extern const std::string kPythonCheckpointFuncName;

 const int kCallbackOk = 0;
 const int kCallbackFalied = 1;

 bool GetParameterShape(const FuncGraphPtr& anf_graph, const std::string& param_name,
                       const std::shared_ptr<std::vector<int>>& shape);
 #ifdef ENABLE_GE
 uint32_t CheckpointSaveCallback(uint32_t, const std::map<std::string, ge::Tensor>&);
 uint32_t SummarySaveCallback(uint32_t, const std::map<std::string, ge::Tensor>&);
 #endif
 uint32_t SummarySaveCallback(uint32_t, const std::map<std::string, TensorPtr>&);

 }  // namespace callbacks
--- a/mindspore/ccsrc/utils/context/ms_context.cc
+++ b/mindspore/ccsrc/utils/context/ms_context.cc
@@ -26,13 +26,15 @@
 #include "tdt/tdt_host_interface.h"
 #include "tdt/data_common.h"
 #endif
 #ifdef ENABLE_GE
 #include "transform/df_graph_manager.h"
 #endif
 #include "ir/meta_tensor.h"

 namespace mindspore {
 #ifdef ENABLE_GE
 using mindspore::transform::DfGraphManager;
 using transform::GraphRunner;
 using transform::GraphRunnerOptions;
 #endif

 std::atomic<bool> thread_1_must_end(false);

@@ -81,6 +83,7 @@ MsContext::MsContext(const std::string& policy, const std::string& target) {

 std::shared_ptr<MsContext> MsContext::GetInstance() {
  if (inst_context_ == nullptr) {
    MS_LOG(DEBUG) << "Create new mindspore context";
 #ifdef ENABLE_GE
    inst_context_.reset(new (std::nothrow) MsContext("ge", kAscendDevice));
 #elif defined(ENABLE_D)
--- a/mindspore/ccsrc/utils/context/ms_context.h
+++ b/mindspore/ccsrc/utils/context/ms_context.h
@@ -23,7 +23,6 @@
 #include <vector>
 #include <string>
 #include <utility>
 #include "transform/graph_runner.h"
 #include "utils/log_adapter.h"

 namespace mindspore {
--- a/mindspore/ccsrc/utils/convert_utils.cc
+++ b/mindspore/ccsrc/utils/convert_utils.cc
@@ -373,4 +373,45 @@ AbstractBasePtr PyListDtype2AbstractTensor(const py::object &shape_obj, const py
    MS_LOG(EXCEPTION) << "Python evaluator return invalid shape or type. " << (std::string)py::str(type_obj);
  }
 }
 // Decides whether a graph's result can be produced without executing the graph.
 //
 // output:  the graph's output node; must not be null.
 // args:    the Python arguments the graph was called with.
 // ret_val: out-parameter that receives the result when true is returned; must not be null.
 //
 // Returns true (and fills *ret_val) when `output` is a ValueNode (its value is converted to Python data) or a
 // Parameter (the positionally matching element of `args` is returned). Returns false for any other node kind.
 // Raises through MS_LOG(EXCEPTION)/MS_EXCEPTION when a pointer is null, `args` is empty, the owning graph has no
 // parameters, the argument and parameter counts differ, or `output` is not one of the graph's parameters.
 bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &output, const py::tuple &args,
                                        const std::shared_ptr<py::object> &ret_val) {
  // Both pointers are dereferenced below; fail with a clear message instead of undefined behaviour.
  MS_EXCEPTION_IF_NULL(output);
  MS_EXCEPTION_IF_NULL(ret_val);

  if (output->isa<ValueNode>()) {
    MS_LOG(INFO) << "Graph's output is a constant. No need to execute.";
    ValuePtr value = GetValueNode(output);
    *ret_val = ValuePtrToPyData(value);
    return true;
  }

  // Adapter will transform values in __init__() and construct() to parameters, this could cause
  // inputs (a.k.a args in current function) size less than parameters'.
  if (output->isa<Parameter>()) {
    MS_LOG(INFO) << "Graph's output is a parameter. If all params are inputs, no need to execute.";
    if (args.empty()) {
      MS_LOG(EXCEPTION) << "Inputs size is 0, let graph to be executed.";
    }
    // Find the right parameter as ret_val.
    auto func_graph = output->func_graph();
    MS_EXCEPTION_IF_NULL(func_graph);
    // Bind by reference: the list is only searched here, so copying it is unnecessary.
    const auto &params = func_graph->parameters();
    if (params.empty()) {
      MS_EXCEPTION(UnknownError) << "Graph's parameters size is 0";
    }
    if (args.size() != params.size()) {
      MS_LOG(EXCEPTION) << "Input size " << args.size() << " not equal to params size " << params.size()
                        << ", let graph to be executed.";
    }

    auto it = std::find(params.begin(), params.end(), output);
    if (it == params.end()) {
      MS_EXCEPTION(UnknownError) << "When graph output is Parameter,  it should be found in graph parameters";
    }
    size_t index = static_cast<size_t>(it - params.begin());
    // Defensive: cannot trigger after the size-equality check above, kept so indexing stays guarded.
    if (index >= args.size()) {
      MS_EXCEPTION(UnknownError) << "Index " << index << " equal or larger than args size " << args.size() << ".";
    }
    *ret_val = args[index];
    return true;
  }
  return false;
 }
 }  // namespace mindspore
--- a/mindspore/ccsrc/utils/convert_utils.h
+++ b/mindspore/ccsrc/utils/convert_utils.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_UTILS_CONVERT_UTILS_H_

 #include <limits>
 #include <memory>
 #include "pybind11/pybind11.h"

 #include "utils/any.h"
@@ -120,6 +121,9 @@ inline uint8_t *AddressOffset(void *address, size_t offset) {

 AbstractBasePtr PyListDtype2AbstractTensor(const py::object &shape_obj, const py::object &type_obj);

 bool IsGraphOutputValueNodeOrParameter(const AnfNodePtr &output, const py::tuple &args,
                                       const std::shared_ptr<py::object> &ret_val);

 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_UTILS_CONVERT_UTILS_H_
--- a/mindspore/ccsrc/vm/segment_runner.cc
+++ b/mindspore/ccsrc/vm/segment_runner.cc
@@ -178,14 +178,12 @@ LinConvertResult Convert(const AnfNodePtrList& lst) {
 }

 LinkFuncType MsVmConvert = Convert<VM>;
 LinkFuncType GeVmConvert = Convert<GeVM>;

 std::unordered_map<std::string, LinkFuncType> backends = {{kMsVm, MsVmConvert}, {kGeVm, GeVmConvert}};
 std::unordered_map<std::string, LinkFuncType> backends = {{kMsVm, MsVmConvert}};

 std::set<std::string> backend_list = {
  kMsConvert,
  kMsVm,
  kGeVm,
 };

 }  // namespace compile
--- a/mindspore/ccsrc/vm/transform.cc
+++ b/mindspore/ccsrc/vm/transform.cc
@@ -24,7 +24,9 @@
 #include <vector>

 #include "pipeline/static_analysis/abstract_value.h"
 #ifdef ENABLE_GE
 #include "transform/convert.h"
 #endif
 #include "utils/graph_utils.h"
 #include "utils/context/ms_context.h"
 #include "debug/trace.h"
@@ -55,7 +57,6 @@ CompileGraph::CompileGraph(const BackendPtr& backend, const std::vector<Primitiv
    MS_LOG(INFO) << "Attribute 'is_gevm_convert' is true";
    is_gevm_convert_ = true;
  }
  is_graph_cut = false;
 }

 bool CompileGraph::IsCut(const AnfNodePtr& node) {
@@ -80,14 +81,15 @@ bool CompileGraph::IsCut(const AnfNodePtr& node) {
      }
    }

 #ifdef ENABLE_GE
    if (is_gevm_convert_) {
      auto name = transform::GetCNodeFuncName(cnode);
      auto name = GetCNodeFuncName(cnode);
      auto adpt = transform::DfGraphConvertor::FindAdapter(name);
      if (adpt == nullptr) {
        is_graph_cut = true;
        return true;
      }
      return true;
    }
 #endif
  }

  return false;
@@ -605,12 +607,6 @@ FinalVMPtr CompileGraphs::CompileAndLink(const FuncGraphPtr& graph) {
  (void)WrapPrimitives(graph);
  Compile(graph);

 #ifdef ENABLE_GE
  if (!transform_->IsGraphCut()) {
    return nullptr;
  }
 #endif

  FinalVMPtr rt = Link(graph);
  Reset();
  MS_LOG(DEBUG) << "End";
--- a/mindspore/ccsrc/vm/transform.h
+++ b/mindspore/ccsrc/vm/transform.h
@@ -55,7 +55,6 @@ class CompileGraph {

  InstSet Run(const FuncGraphPtr& func_graph);
  InstSet GenMultiGraphsSinkInst(const FuncGraphPtr& graph);
  bool IsGraphCut() const { return is_graph_cut; }
  bool IsCut(const AnfNodePtr& node);
  void Push(const AnfNodePtr& node);
  void Tie(const AnfNodePtr& n1, const AnfNodePtr& n2) { slots_[n2] = slots_[n1]; }
@@ -101,7 +100,6 @@ class CompileGraph {
  BackendPtr backend_;
  LinkFuncType lin_convert_;
  bool is_gevm_convert_;
  bool is_graph_cut;
  int height_{0};
  int max_height_{0};
  std::vector<PrimitivePtr> cut_list_;
--- a/mindspore/ccsrc/vm/vmimpl.cc
+++ b/mindspore/ccsrc/vm/vmimpl.cc
@@ -26,8 +26,6 @@
 #include <memory>
 #include <set>

 #include "transform/graph_runner.h"
 #include "transform/convert.h"
 #include "ir/meta_tensor.h"
 #include "operator/ops.h"
 #include "ir/manager.h"
@@ -40,39 +38,6 @@ namespace compile {

 using PrimitivePyPtr = std::shared_ptr<PrimitivePy>;

 static const char SEGMENT_GRAPH_NAME[] = "runnable_segment";

 // Converts `anf_graph` into a dataflow graph, registers it under SEGMENT_GRAPH_NAME and runs it with GraphRunner.
 // Each element of `args` is cast to a PyObjectRef whose Python object is cast to a tensor; the tensors produced by
 // the run are returned wrapped in a VectorRef. Logs an exception when the conversion reports a non-zero error code.
 VectorRef GeVM::RunGraph(const FuncGraphPtr& anf_graph, const VectorRef& args) {
  // Step 1: build the dataflow graph and register it on success.
  transform::DfGraphConvertor df_convertor(anf_graph);
  (void)df_convertor.ConvertAllNode().BuildGraph();
  if (df_convertor.ErrCode() != 0) {
    MS_LOG(EXCEPTION) << "convert df graph failed";
  } else {
    (void)transform::DfGraphManager::GetInstance().AddGraph(SEGMENT_GRAPH_NAME, df_convertor.GetComputeGraph());
  }

  // Step 2: prepare the runner and the options naming the graph registered above.
  transform::GraphRunnerOptions runner_options;
  transform::GraphRunner runner(runner_options);
  transform::RunOptions run_opts;
  run_opts.name = SEGMENT_GRAPH_NAME;

  // Step 3: unwrap every argument into a tensor.
  std::vector<tensor::TensorPtr> input_tensors;
  for (const BaseRef& arg : args) {
    auto py_ref = utils::cast<PyObjectRef>(arg);
    auto py_obj = py_ref.object_;
    input_tensors.push_back(py::cast<tensor::TensorPtr>(py_obj));
  }

  // Step 4: run the graph (the returned status is deliberately discarded) and repack the outputs.
  std::vector<tensor::TensorPtr> output_tensors;
  (void)runner.RunGraph(run_opts, input_tensors, &output_tensors);
  std::vector<BaseRef> results;
  for (const auto& out_tensor : output_tensors) {
    results.push_back(out_tensor);
  }
  return VectorRef(results);
 }

 // Indicate a call to a new frame.
 struct CallWrap : public Base {
  explicit CallWrap(const VMFramePtr& vm_frame) : frame(vm_frame) {}
--- a/mindspore/ccsrc/vm/vmimpl.h
+++ b/mindspore/ccsrc/vm/vmimpl.h
@@ -64,12 +64,6 @@ class VMImpl {
  virtual ~VMImpl() = default;
 };

 // VMImpl subclass that overrides RunGraph; the override is only declared here and defined out of line.
 class GeVM : public VMImpl {
  public:
  // Runs the function graph `fg` with the arguments `args` and returns the results as a VectorRef.
  VectorRef RunGraph(const FuncGraphPtr& fg, const VectorRef& args) override;
  // Defaulted destructor overriding the base class destructor.
  ~GeVM() override = default;
 };

 // An execution frame.
 // This holds the state for an application of a graph. The nodes list
 // must contain free variables of graphs encountered before the
--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@@ -22,7 +22,7 @@ from mindspore import context
 from mindspore import log as logger
 from mindspore.parallel._utils import _get_parallel_mode
 from .._c_expression import generate_key, Executor_, Tensor, MetaTensor
 from .._c_expression import verify_inputs_signature, init_exec_dataset, export_graph, _set_dataset_mode_config, init_ge
 from .._c_expression import verify_inputs_signature, init_exec_dataset, _set_dataset_mode_config, init_ge
 from .tensor import Tensor as MsTensor

 # store ms_function class compiled pipeline cache
@@ -501,6 +501,7 @@ class _Executor:
            file_name (str): File name of model to export
            file_format (str): MindSpore currently support 'GEIR' and 'ONNX' format for exported model
        """
        from .._c_expression import export_graph
        phase = 'export' + '.' + str(net.create_time)
        export_graph(file_name, file_format, phase)

--- a/mindspore/common/parameter.py
+++ b/mindspore/common/parameter.py
@@ -155,6 +155,18 @@ class Parameter:
    def data(self):
        return self.default_input

    def __add__(self, other):
        return self.default_input + other

    def __sub__(self, other):
        return self.default_input - other

    def __mul__(self, other):
        return self.default_input * other

    def __truediv__(self, other):
        return self.default_input / other

    def set_parameter_data(self, data):
        if isinstance(data, (Tensor, list, int, float,
                             np.float16, np.float32, np.int32, np.int16, np.ndarray)) and not isinstance(data, bool):
--- a/mindspore/common/tensor.py
+++ b/mindspore/common/tensor.py
@@ -89,6 +89,16 @@ class Tensor(Tensor_):
        out = self.__mul__(other)
        return out

    def __truediv__(self, other):
        """
        Divide this tensor by a Tensor, an int or a float.

        An int or float operand is first wrapped in a Tensor built with this tensor's dtype.
        The division itself is carried out by the callable registered under '__div__' in
        tensor_operator_registry.

        Raises:
            TypeError: If `other` is neither a Tensor nor an int/float.
        """
        if isinstance(other, Tensor):
            divisor = other
        elif isinstance(other, (int, float)):
            divisor = Tensor(other, self.dtype())
        else:
            raise TypeError("unsupported type for div operation")
        div_op = tensor_operator_registry.get('__div__')
        return div_op(self, divisor)

    def __sub__(self, other):
        if not isinstance(other, Tensor):
            raise TypeError("input_data must be a tensor")
--- a/mindspore/ops/functional.py
+++ b/mindspore/ops/functional.py
@@ -125,5 +125,5 @@ shape_mul = Primitive("shape_mul")
 stop_gradient = Primitive("stop_gradient")

 tensor_operator_registry.register('__add__', tensor_add)

 tensor_operator_registry.register('__mul__', tensor_mul)
 tensor_operator_registry.register('__div__', tensor_div)
--- a/mindspore/train/model.py
+++ b/mindspore/train/model.py
@@ -161,6 +161,9 @@ class Model:

    def _update_metrics(self, outputs):
        """Update metrics local values."""
        if not isinstance(outputs, tuple):
            raise ValueError("The `outputs` is not tuple.")

        if self._eval_indexes is not None and len(outputs) < 3:
            raise ValueError("The length of `outputs` must be greater than or equal to 3, \
                             but got {}".format(len(outputs)))
--- a/tests/ut/cpp/device/ascend_kernel_select_test.cc
+++ b/tests/ut/cpp/device/ascend_kernel_select_test.cc
@@ -231,7 +231,7 @@ void test_select(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info_ptr, kernel_node.get());
 }

 void SetParentAbstract(std::vector<AnfNodePtr> parent_list, std::vector<vector<size_t>> shapes,
 void SetParentAbstract(std::vector<AnfNodePtr> parent_list, std::vector<std::vector<size_t>> shapes,
                       std::vector<TypeId> types) {
  for (const auto &node : parent_list) {
    AnfAlgo::SetOutputInferTypeAndShape(types, shapes, node.get());
--- a/tests/ut/cpp/device/ascend_profiling_test.cc
+++ b/tests/ut/cpp/device/ascend_profiling_test.cc
@@ -16,10 +16,10 @@
 #include <iostream>
 #include <memory>

 #include "./prof_reporter.h"
 #include "common/common_test.h"
 #include "device/ascend/profiling/profiling_manager.h"
 #include "./common.h"
 #include "./prof_reporter.h"
 #define private public
 #include "device/ascend/profiling/plugin_impl.h"
 #undef private
--- a/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
+++ b/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
@@ -20,7 +20,7 @@
 #include "ir/manager.h"
 #include "debug/anf_ir_dump.h"
 #include "session/anf_runtime_algorithm.h"
 #include "pre_activate/ascend/ir_fusion/allreduce_fusion.h"
 #include "pre_activate/common/ir_fusion/allreduce_fusion.h"
 #include "pre_activate/common/optimizer.h"
 #include "device/kernel_info.h"
 #include "pre_activate/common/pass_manager.h"
--- a/tests/ut/cpp/pre_activate/pass/convert_const_input_to_tensor_input_test.cc
+++ b/tests/ut/cpp/pre_activate/pass/convert_const_input_to_tensor_input_test.cc
@@ -105,7 +105,7 @@ TEST_F(TestHWConstInputToTensorInput, test_value_tuple_tensor_input) {
  auto tensor = input1->cast<ValueNodePtr>()->value()->cast<tensor::TensorPtr>();
  ASSERT_TRUE(tensor != nullptr);
  auto data = tensor->data_c(false);
  EXPECT_EQ(vector<int>((int *)data, (int *)data + 4), vector<int>({2, 4, 2, 2}));
  EXPECT_EQ(std::vector<int>((int *)data, (int *)data + 4), std::vector<int>({2, 4, 2, 2}));
 }
 }  // namespace opt
 }  // namespace mindspore
--- a/tests/ut/python/ir/test_tensor.py
+++ b/tests/ut/python/ir/test_tensor.py
@@ -24,6 +24,8 @@ import pytest
 import mindspore as ms
 import mindspore.common.api as me
 import mindspore.nn as nn
 from mindspore.common.parameter import Parameter
 from mindspore.common.initializer import initializer
 from ..ut_filter import non_graph_engine


@@ -199,6 +201,21 @@ def test_sub():
    z = x - y
    assert isinstance(z, ms.Tensor)

@non_graph_engine
 def test_div():
    """Dividing a tensor by another tensor and by a Python scalar must both yield an ms.Tensor."""
    dividend = ms.Tensor(np.array([[2,6,10],[12, 4, 8]]).astype(np.float32))
    divisor = ms.Tensor(np.array([[2,2,5],[6, 1, 2]]).astype(np.float32))
    by_tensor = dividend / divisor
    by_scalar = dividend / 2
    for quotient in (by_tensor, by_scalar):
        assert isinstance(quotient, ms.Tensor)

@non_graph_engine
 def test_parameter():
    """Smoke test: a Parameter can be divided by a Python scalar and the result printed."""
    beta1_power = Parameter(initializer(1, [1], ms.float32), name="beta1_power")
    halved = beta1_power / 2
    print(halved)


 class Net(nn.Cell):
    """Net definition"""
@@ -378,3 +395,4 @@ def test_tensor_dtype_fp32_to_bool():
        input = np.random.randn(2, 3, 4, 5).astype(np.float32)
        input = ms.Tensor(input)
        input_me = ms.Tensor(input, dtype=ms.bool_)

--- a/tests/ut/python/ops/test_array_ops.py
+++ b/tests/ut/python/ops/test_array_ops.py
@@ -97,20 +97,6 @@ def test_select():
    assert np.all(output.asnumpy() == expect)


 def test_scalar_cast_grad():
    """ The gradient of F.scalar_cast with respect to its scalar input is expected to equal 1. """
    scalar_in = 255.5
    target_type = get_py_obj_dtype(ms.int8)

    def fx_cast(x):
        return F.scalar_cast(x, target_type)

    grad_value = C.grad(fx_cast)(scalar_in)
    assert grad_value == 1


 class CustomOP(PrimitiveWithInfer):
    __mindspore_signature__ = (sig_dtype.T, sig_dtype.T, sig_dtype.T1,
                               sig_dtype.T1, sig_dtype.T2, sig_dtype.T2,
--- a/tests/ut/python/parallel/init.py
+++ b/tests/ut/python/parallel/init.py
@@ -13,11 +13,14 @@
 # limitations under the License.

 import mindspore.context as context
 from mindspore.parallel._utils import _reset_op_id


 def setup_module(module):
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    _reset_op_id()


 def teardown_module():
    """Module-level teardown: reset the auto-parallel context, then reset the operator id counter."""
    context.reset_auto_parallel_context()
    _reset_op_id()
--- a/tests/ut/python/parallel/test_alltoall.py
+++ b/tests/ut/python/parallel/test_alltoall.py
@@ -97,13 +97,10 @@ def test_all_to_all():
    strategys = all_to_all_common(strategy1)
    print(strategys)
    expect_dict = {'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits'
                   '/SoftmaxCrossEntropyWithLogits-op43': [[8, 1], [8, 1]],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits'
                   '/OneHot-op44': [[8, 1], [], []],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/Transpose-op1':
                       [[8, 1]],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/MatMul-op0':
                       [[1, 1], [1, 8]]}
                   '/SoftmaxCrossEntropyWithLogits-op3': [[8, 1], [8, 1]],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op4': [[8, 1], [], []],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/Transpose-op1': [[8, 1]],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/MatMul-op0': [[1, 1], [1, 8]]}
    assert (strategys == expect_dict)
    context.set_context(save_graphs=False)

--- a/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
+++ b/tests/ut/python/parallel/test_auto_parallel_arithmetic.py
@@ -65,8 +65,8 @@ def test_auto_parallel_arithmetic():
    b = Tensor(np.ones([64, 128]), dtype=ms.float32)
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/FloorDiv-op2': [[2, 4], [2, 4]],
                     'Default/network-Net/MatMul-op3': [[2, 1], [1, 4]]}
    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[2, 4], [2, 4]],
                     'Default/network-Net/MatMul-op1': [[2, 1], [1, 4]]}
    assert strategies == expected_strategies

 def test_auto_parallel_arithmetic_broadcast_both():
@@ -91,8 +91,8 @@ def test_auto_parallel_arithmetic_broadcast_both():
    b = Tensor(np.ones([1, 64]), dtype=ms.float32)
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/FloorDiv-op2': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op3': [[8, 1], [1, 1]]}
    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op1': [[8, 1], [1, 1]]}
    assert strategies == expected_strategies


@@ -118,8 +118,8 @@ def test_auto_parallel_arithmetic_broadcast_right():
    b = Tensor(np.ones([32]), dtype=ms.float32)
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/FloorDiv-op2': [[4, 2], [2]],
                           'Default/network-Net/MatMul-op3': [[4, 1], [1, 2]]}
    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[4, 2], [2]],
                           'Default/network-Net/MatMul-op1': [[4, 1], [1, 2]]}
    assert strategies == expected_strategies


@@ -145,6 +145,6 @@ def test_auto_parallel_arithmetic_broadcast_left():
    b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32)
    _executor.compile(net, x, y, b, phase="train")
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/FloorDiv-op2': [[4, 2], [1, 4, 2]],
                           'Default/network-Net/MatMul-op3': [[4, 1], [1, 2]]}
    assert strategies == expected_strategies
    expected_strategies = {'Default/network-Net/FloorDiv-op0': [[4, 2], [1, 4, 2]],
                           'Default/network-Net/MatMul-op1': [[4, 1], [1, 2]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py
+++ b/tests/ut/python/parallel/test_auto_parallel_assign_sub_with_ref_key.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import re
 import numpy as np
 from mindspore import context
 import mindspore.nn as nn
@@ -55,6 +56,9 @@ def test_auto_parallel_assign_sub_with_ref_key():

    _executor.compile(net, x, phase="train")
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-PReLU/PReLU-op2': [[1, 1, 1, 8], [1]],
                           'Default/network-PReLU/ReLU-op3': [[1]]}
    assert strategies == expected_strategies
    for (k, v) in strategies.items():
        if re.search('PReLU-op', k) is not None:
            assert v == [[1, 1, 1, 8], [1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[1]]

--- a/tests/ut/python/parallel/test_auto_parallel_cast.py
+++ b/tests/ut/python/parallel/test_auto_parallel_cast.py
@@ -75,9 +75,9 @@ def test_double_star_graph():

    _executor.compile(net, x, y, z, w, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/MatMul-op0': [[1, 8], [8, 1]],
                           'Default/network-Net/Cast-op7': [[8, 1]],
                           'Default/network-Net/MatMul-op8': [[8, 1], [1, 1]],
                           'Default/network-Net/Cast-op9': [[1, 8]],
                           'Default/network-Net/MatMul-op10': [[1, 1], [1, 8]]}
    assert strategies == expected_strategies
    expected_strategies = {'Default/network-Net/Cast-op1': [[8, 1]],
                           'Default/network-Net/Cast-op3': [[1, 8]],
                           'Default/network-Net/MatMul-op2': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op4': [[1, 1], [1, 8]],
                           'Default/network-Net/MatMul-op0': [[1, 8], [8, 1]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py
+++ b/tests/ut/python/parallel/test_auto_parallel_matmul_prelu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import re
 import numpy as np
 from mindspore import context
 import mindspore.nn as nn
@@ -66,7 +67,10 @@ def test_matmul_prelu():

    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_strategy(net)
    assert strategies['Default/network-Net/PReLU-op2'] == [[16, 1, 1, 1], [1]]
    assert strategies['Default/network-Net/Mul-op3'] == [[16, 1, 1, 1], [16, 1, 1, 1]]
    for (k, v) in strategies.items():
        if re.search('PReLU-op', k) is not None:
            assert v == [[16, 1, 1, 1], [1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[16, 1, 1, 1], [16, 1, 1, 1]]


--- a/tests/ut/python/parallel/test_auto_parallel_parameter_cast.py
+++ b/tests/ut/python/parallel/test_auto_parallel_parameter_cast.py
@@ -80,9 +80,9 @@ def test_common_parameter():

    _executor.compile(net, x, y, z, w, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/MatMul-op6': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op8': [[8, 1], [1, 1]],
                           'Default/network-Net/Cast-op7': [[1, 1]],
    expected_strategies = {'Default/network-Net/MatMul-op1': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op3': [[8, 1], [1, 1]],
                           'Default/network-Net/Cast-op2': [[1, 1]],
                           'Default/network-Net/MatMul-op0': [[8, 1], [1, 1]],
                           'Default/network-Net/Cast-op9': [[1, 1]]}
                           'Default/network-Net/Cast-op4': [[1, 1]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_transpose.py
+++ b/tests/ut/python/parallel/test_auto_parallel_transpose.py
@@ -71,8 +71,8 @@ def test_two_matmul_transpose():

    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/Transpose-op4': [[1, 16]],
                           'Default/network-Net/Transpose-op5': [[16, 1]],
                           'Default/network-Net/MatMul-op6': [[16, 1], [1, 1]],
                           'Default/network-Net/MatMul-op7': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies
    expected_strategies = {'Default/network-Net/Transpose-op0': [[1, 16]],
                           'Default/network-Net/Transpose-op1': [[16, 1]],
                           'Default/network-Net/MatMul-op2': [[16, 1], [1, 1]],
                           'Default/network-Net/MatMul-op3': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies
--- a/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
+++ b/tests/ut/python/parallel/test_auto_parallel_two_matmul.py
@@ -135,7 +135,6 @@ def test_two_matmul():
    
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_strategy(net)
    expected_strategies = {'Default/network-Net/MatMul-op2': [[16, 1], [1, 1]],
                     'Default/network-Net/MatMul-op3': [[16, 1], [1, 1]]}
    expected_strategies = {'Default/network-Net/MatMul-op0': [[16, 1], [1, 1]],
                     'Default/network-Net/MatMul-op1': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies

--- a/tests/ut/python/parallel/test_dataset_interface.py
+++ b/tests/ut/python/parallel/test_dataset_interface.py
@@ -84,7 +84,7 @@ def loss_scale_manager_common(strategy1):
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # if no GE exists, outputs = self._train_network(*next_element) outputs is None, TypeError is caught.
    # If no GE exists, outputs = self._train_network(*next_element) returns the input tensors.
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
--- a/tests/ut/python/parallel/test_one_dev.py
+++ b/tests/ut/python/parallel/test_one_dev.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import re
 from mindspore.train import Model, ParallelMode
 from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
 from mindspore.nn.optim.momentum import Momentum
@@ -89,16 +90,13 @@ def all_to_all_common():


 def test_one_dev():

    _reset_op_id()
    strategys = all_to_all_common()
    expect_dict = {'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits'
                   '/SoftmaxCrossEntropyWithLogits-op9': [[1, 1], [1, 1]],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits'
                   '/OneHot-op10': [[1, 1], [], []],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/Transpose-op11':
                       [[1, 1]],
                   'Default/network-_VirtualDatasetCell/_backbone-WithLossCell/_backbone-AllToAllNet/MatMul-op12':
                       [[1, 1], [1, 1]]}
    assert (strategys == expect_dict)
    strategies = all_to_all_common()
    for (k, v) in strategies.items():
        if re.search('SoftmaxCrossEntropyWithLogits-op', k) is not None:
            assert v == [[1, 1], [1, 1]]
        elif re.search('Transpose-op', k) is not None:
            assert v == [[1, 1]]
        elif re.search('MatMul-op', k) is not None:
            assert v == [[1, 1], [1, 1]]

--- a/tests/ut/python/pipeline/parse/test_create_obj.py
+++ b/tests/ut/python/pipeline/parse/test_create_obj.py
@@ -24,6 +24,7 @@
 import logging
 import numpy as np
 import mindspore.nn as nn
 from mindspore import context
 from mindspore.ops import operations as P
 from mindspore.common.api import ms_function
 from mindspore.common.tensor import Tensor
@@ -50,6 +51,7 @@ class Net(nn.Cell):
 def test_create_cell_object_on_construct():
    """ test_create_cell_object_on_construct """
    log.debug("begin test_create_object_on_construct")
    context.set_context(mode=context.GRAPH_MODE)
    np1 = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(np1)

@@ -118,6 +120,7 @@ class NetC(nn.Cell):
 def test_create_cell_object_on_construct_use_many_parameter():
    """ test_create_cell_object_on_construct_use_many_parameter """
    log.debug("begin test_create_object_on_construct")
    context.set_context(mode=context.GRAPH_MODE)
    np1 = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(np1)

--- a/tests/ut/python/pipeline/parse/test_dtype.py
+++ b/tests/ut/python/pipeline/parse/test_dtype.py
@@ -28,5 +28,4 @@ def try_type():


 def test_dtype_convert():
    with pytest.raises(RuntimeError):
        try_type()
    try_type()
--- a/tests/ut/python/pynative_mode/ops/test_grad.py
+++ b/tests/ut/python/pynative_mode/ops/test_grad.py
@@ -19,8 +19,10 @@ from mindspore.common.api import ms_function
 from mindspore import Tensor
 from mindspore.ops import composite as C
 from mindspore.ops.composite import grad_all_with_sens
 from mindspore.common.dtype import get_py_obj_dtype
 import mindspore.nn as nn
 import mindspore.ops.operations as P
 from mindspore.ops import functional as F
 from ...ut_filter import non_graph_engine


@@ -78,6 +80,20 @@ def test_cast_grad():
    assert np.all(gout[0].asnumpy() == expect)


 def test_scalar_cast_grad():
    """ The gradient of F.scalar_cast with respect to its scalar input is expected to equal 1. """
    scalar_in = 255.5
    target_type = get_py_obj_dtype(ms.int8)

    def fx_cast(x):
        return F.scalar_cast(x, target_type)

    grad_value = C.grad(fx_cast)(scalar_in)
    assert grad_value == 1


@non_graph_engine
 def test_reshape_grad():
    """ test_reshape_grad """
--- a/tests/ut/python/train/summary/test_summary_ops_params_valid_check.py
+++ b/tests/ut/python/train/summary/test_summary_ops_params_valid_check.py
@@ -163,12 +163,7 @@ def test_scalar_summary_use_invalid_tag_None():
 def test_scalar_summary_use_invalid_tag_Bool():
    log.debug("begin test_scalar_summary_use_invalid_tag_Bool")
    net = SummaryDemoTag(True, True, True)
    try:
        run_case(net)
    except:
        assert True
    else:
        assert False
    run_case(net)
    log.debug("finished test_scalar_summary_use_invalid_tag_Bool")


@@ -176,12 +171,7 @@ def test_scalar_summary_use_invalid_tag_Bool():
 def test_scalar_summary_use_invalid_tag_null():
    log.debug("begin test_scalar_summary_use_invalid_tag_null")
    net = SummaryDemoTag("", "", "")
    try:
        run_case(net)
    except:
        assert True
    else:
        assert False
    run_case(net)
    log.debug("finished test_scalar_summary_use_invalid_tag_null")


@@ -189,12 +179,7 @@ def test_scalar_summary_use_invalid_tag_null():
 def test_scalar_summary_use_invalid_tag_Int():
    log.debug("begin test_scalar_summary_use_invalid_tag_Int")
    net = SummaryDemoTag(1, 2, 3)
    try:
        run_case(net)
    except:
        assert True
    else:
        assert False
    run_case(net)
    log.debug("finished test_scalar_summary_use_invalid_tag_Int")


--- a/tests/ut/python/utils/test_serialize.py
+++ b/tests/ut/python/utils/test_serialize.py
@@ -30,7 +30,7 @@ from mindspore.nn import WithLossCell, TrainOneStepCell
 from mindspore.train.callback import _CheckpointManager
 from mindspore.train.serialization import save_checkpoint, load_checkpoint,load_param_into_net, \
                                          _exec_save_checkpoint, export, _save_graph
 from ..ut_filter import run_on_onnxruntime
 from ..ut_filter import run_on_onnxruntime, non_graph_engine
 from mindspore import context


@@ -306,6 +306,7 @@ class MYNET(nn.Cell):
        return out


@non_graph_engine
 def test_export():
    net = MYNET()
    input_data = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]).astype(np.float32))