Merge branch 'master' into 04quant

6 years ago · 1239cd6135
--- a/mindspore/ccsrc/common/trans.cc
+++ b/mindspore/ccsrc/common/trans.cc
@@ -85,7 +85,7 @@ const std::map<TypeId, size_t> type_map = {{kNumberTypeBool, 1},    {kNumberType
  } while (0)

 template <typename T>
 T Ceil(T n1, T n2) {
 T DivCeil(T n1, T n2) {
  return (n2 != 0) ? (n1 - 1) / n2 + 1 : 0;
 }

@@ -371,15 +371,48 @@ std::vector<size_t> C1hwncoc0DeviceShape(const std::vector<size_t> &shape) {
  device_shape.push_back(kCubeSize);
  return device_shape;
 }

 std::vector<size_t> FracZc04DeviceShape(const std::vector<size_t> &shape) {
  if (!CheckDims(shape)) {
    MS_LOG(EXCEPTION) << "Check dims failed.";
  }
  std::vector<size_t> device_shape;
  size_t c0 = 4;
  size_t first_dim = DivCeil(c0 * shape[2] * shape[3], kCubeSize);
  size_t no = DivCeil(DivCeil(shape[0], kCubeSize) * kCubeSize, kCubeSize);
  device_shape.push_back(first_dim);
  device_shape.push_back(no);
  device_shape.push_back(kCubeSize);
  device_shape.push_back(kCubeSize);
  return device_shape;
 }

 // Computes the device shape used for the NC1HWC0_C04 format.
 // The result is {shape[0], 1, shape[2], shape[3], 4}; shape[1] is not read.
 // An exception is logged via MS_LOG(EXCEPTION) when CheckDims rejects the shape.
 std::vector<size_t> Nc1hwc04DeviceShape(const std::vector<size_t> &shape) {
  if (!CheckDims(shape)) {
    MS_LOG(EXCEPTION) << "Check dims failed.";
  }
  const size_t c1_dim = 1;
  const size_t c0_dim = 4;
  return {shape[0], c1_dim, shape[2], shape[3], c0_dim};
 }
 }  // namespace

 std::vector<size_t> TransShapeToDevice(const std::vector<size_t> &shape, const std::string &format) {
  using DeviceShapeTransfer = std::function<std::vector<size_t>(const std::vector<size_t> &)>;
  const std::map<std::string, DeviceShapeTransfer> device_shape_map{
    {kOpFormat_NCHW, NchwDeviceShape},       {kOpFormat_NHWC, NhwcDeviceShape},
    {kOpFormat_HWCN, HwchDeviceShape},       {kOpFormat_FRAC_Z, FracZDeviceShape},
    {kOpFormat_NC1HWC0, Nc1hwc0DeviceShape}, {kOpFormat_C1HWNCoC0, C1hwncoc0DeviceShape},
  };
  const std::map<std::string, DeviceShapeTransfer> device_shape_map{{kOpFormat_NCHW, NchwDeviceShape},
                                                                    {kOpFormat_NHWC, NhwcDeviceShape},
                                                                    {kOpFormat_HWCN, HwchDeviceShape},
                                                                    {kOpFormat_FRAC_Z, FracZDeviceShape},
                                                                    {kOpFormat_NC1HWC0, Nc1hwc0DeviceShape},
                                                                    {kOpFormat_C1HWNCoC0, C1hwncoc0DeviceShape},
                                                                    {kOpFormat_FRACTAL_Z_C04, FracZc04DeviceShape},
                                                                    {kOpFormat_NC1HWC0_C04, Nc1hwc04DeviceShape}};

  if (format == kOpFormat_ND || format == kOpFormat_DEFAULT) {
    return shape;
@@ -506,13 +539,13 @@ bool NchwToFracZ(const FormatArgs &args, void *result) {
    MS_LOG(ERROR) << "Illegal dtype.";
    return false;
  }
  size_t c1 = Ceil(c, c0);
  size_t c1 = DivCeil(c, c0);
  size_t hw = h * w;
  size_t chw = c * hw;
  size_t hwc0 = hw * c0;
  size_t nchw = n * chw;

  size_t hf_cnt = Ceil(n, kCubeSize);
  size_t hf_cnt = DivCeil(n, kCubeSize);
  size_t vf_cnt = c1 * hw;
  size_t fractal_ele_cnt = c0 * kCubeSize;
  size_t total_ele_cnt = hf_cnt * vf_cnt * fractal_ele_cnt;
@@ -775,7 +808,7 @@ bool NchwToNc1hwc0(const FormatArgs &args, void *result) {
    MS_LOG(ERROR) << "Illegal dtype.";
    return false;
  }
  size_t c1 = Ceil(c, c0);
  size_t c1 = DivCeil(c, c0);
  size_t hw = h * w;
  size_t chw = c * hw;
  size_t c1hwc0 = c1 * hw * c0;
--- a/mindspore/ccsrc/dataset/api/de_pipeline.cc
+++ b/mindspore/ccsrc/dataset/api/de_pipeline.cc
@@ -408,8 +408,13 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<Datas
  }

  std::shared_ptr<MindRecordOp::Builder> builder = std::make_shared<MindRecordOp::Builder>();
  (void)builder->SetDatasetFile(ToString(args["dataset_file"]));

  bool load_dataset = ToBool(args["load_dataset"]);
  if (load_dataset == true) {
    (void)builder->SetDatasetFile({ToString(args["dataset_file"])});
  } else {
    (void)builder->SetDatasetFile(ToStringVector(args["dataset_file"]));
  }
  (void)builder->SetLoadDataset(load_dataset);
  std::vector<std::string> in_col_names;
  if (!args["columns_list"].is_none()) {
    in_col_names = ToStringVector(args["columns_list"]);
--- a/mindspore/ccsrc/dataset/api/python_bindings.cc
+++ b/mindspore/ccsrc/dataset/api/python_bindings.cc
@@ -151,16 +151,17 @@ void bindDatasetOps(py::module *m) {
    });

  (void)py::class_<MindRecordOp, DatasetOp, std::shared_ptr<MindRecordOp>>(*m, "MindRecordOp")
    .def_static("get_num_rows", [](const std::string &path, const py::object &sampler) {
      int64_t count = 0;
      std::shared_ptr<mindrecord::ShardOperator> op;
      if (py::hasattr(sampler, "_create_for_minddataset")) {
        auto create = sampler.attr("_create_for_minddataset");
        op = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
      }
      THROW_IF_ERROR(MindRecordOp::CountTotalRows(path, op, &count));
      return count;
    });
    .def_static("get_num_rows",
                [](const std::vector<std::string> &paths, bool load_dataset, const py::object &sampler) {
                  int64_t count = 0;
                  std::shared_ptr<mindrecord::ShardOperator> op;
                  if (py::hasattr(sampler, "_create_for_minddataset")) {
                    auto create = sampler.attr("_create_for_minddataset");
                    op = create().cast<std::shared_ptr<mindrecord::ShardOperator>>();
                  }
                  THROW_IF_ERROR(MindRecordOp::CountTotalRows(paths, load_dataset, op, &count));
                  return count;
                });

  (void)py::class_<ManifestOp, DatasetOp, std::shared_ptr<ManifestOp>>(*m, "ManifestOp")
    .def_static("get_num_rows_and_classes",
--- a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc
+++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc
@@ -40,7 +40,7 @@ using mindrecord::ShardOperator;
 using mindrecord::ShardReader;

 // Builder constructor.  Creates the builder object.
 MindRecordOp::Builder::Builder() : build_dataset_file_("") {
 MindRecordOp::Builder::Builder() : build_dataset_file_({}) {
  // Some arguments to the MindRecordOp constructor have a default argument that is taken
  // from the client config.
  // The user may choose to change these values for the construction of the StorageOp by
@@ -63,9 +63,9 @@ Status MindRecordOp::Builder::Build(std::shared_ptr<MindRecordOp> *ptr) {
                  "Building a MindRecordOp that has not provided a file.");
  }

  new_mind_record_op = std::make_shared<MindRecordOp>(build_num_mind_record_workers_, build_rows_per_buffer_,
                                                      build_dataset_file_, build_op_connector_queue_size_,
                                                      build_columns_to_load_, build_operators_, build_block_reader_);
  new_mind_record_op = std::make_shared<MindRecordOp>(
    build_num_mind_record_workers_, build_rows_per_buffer_, build_dataset_file_, build_load_dataset_,
    build_op_connector_queue_size_, build_columns_to_load_, build_operators_, build_block_reader_);

  RETURN_IF_NOT_OK(new_mind_record_op->Init());

@@ -76,12 +76,14 @@ Status MindRecordOp::Builder::Build(std::shared_ptr<MindRecordOp> *ptr) {
 Status MindRecordOp::Builder::SanityCheck() const { return Status::OK(); }

 // Constructor of the MindRecordOp.
 MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer, std::string dataset_file,
                           int32_t op_connector_queue_size, const std::vector<std::string> &columns_to_load,
 MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer,
                           std::vector<std::string> dataset_file, bool load_dataset, int32_t op_connector_queue_size,
                           const std::vector<std::string> &columns_to_load,
                           const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader)
    : ParallelOp(num_mind_record_workers, op_connector_queue_size),
      rows_per_buffer_(rows_per_buffer),
      dataset_file_(dataset_file),
      load_dataset_(load_dataset),
      columns_to_load_(columns_to_load),
      operators_(operators),
      num_mind_record_workers_(num_mind_record_workers),
@@ -101,9 +103,10 @@ MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buf
 // Private helper method to encapsulate some common construction/reset tasks
 Status MindRecordOp::Init() {
  shard_reader_ = std::make_unique<ShardReader>();
  auto rc = shard_reader_->Open(dataset_file_, num_mind_record_workers_, columns_to_load_, operators_, block_reader_);
  auto rc = shard_reader_->Open(dataset_file_, load_dataset_, num_mind_record_workers_, columns_to_load_, operators_,
                                block_reader_);

  CHECK_FAIL_RETURN_UNEXPECTED(rc != MSRStatus::FAILED,
  CHECK_FAIL_RETURN_UNEXPECTED(rc == MSRStatus::SUCCESS,
                               "MindRecordOp init failed. Error message: " + ErrnoToMessage(rc));

  data_schema_ = std::make_unique<DataSchema>();
@@ -201,8 +204,12 @@ void MindRecordOp::Print(std::ostream &out, bool show_all) const {
    // Call the super class for displaying any common detailed info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal stuff
    out << "\n1 Dataset file : " << dataset_file_ << "\nNumber of rows : " << num_rows_
        << "\nRows per buffer : " << rows_per_buffer_ << "\nNumber of buffers : " << buffers_needed_
    out << "\n Dataset file : ";
    for (auto &file : dataset_file_) {
      out << file << " ";
    }
    out << "\nNumber of rows : " << num_rows_ << "\nRows per buffer : " << rows_per_buffer_
        << "\nNumber of buffers : " << buffers_needed_
        << "\nNumber of ShardReader workers : " << num_mind_record_workers_ << "\n\n";
  }
 }
@@ -668,10 +675,10 @@ Status MindRecordOp::LaunchThreadAndInitOp() {
  return Status::OK();
 }

 Status MindRecordOp::CountTotalRows(const std::string dataset_path, const std::shared_ptr<ShardOperator> &op,
                                    int64_t *count) {
 Status MindRecordOp::CountTotalRows(const std::vector<std::string> dataset_path, bool load_dataset,
                                    const std::shared_ptr<ShardOperator> &op, int64_t *count) {
  std::unique_ptr<ShardReader> shard_reader = std::make_unique<ShardReader>();
  MSRStatus rc = shard_reader->CountTotalRows(dataset_path, op, count);
  MSRStatus rc = shard_reader->CountTotalRows(dataset_path, load_dataset, op, count);
  if (rc == MSRStatus::FAILED) {
    RETURN_STATUS_UNEXPECTED("MindRecordOp count total rows failed.");
  }
--- a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.h
+++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.h
@@ -77,8 +77,8 @@ class MindRecordOp : public ParallelOp {
      return *this;
    }

    Builder &SetDatasetFile(const std::string &file) {
      build_dataset_file_ = file;
    Builder &SetDatasetFile(const std::vector<std::string> &files) {
      build_dataset_file_ = files;
      return *this;
    }

@@ -97,6 +97,11 @@ class MindRecordOp : public ParallelOp {
      return *this;
    }

    // Setter method.
    // @param load_dataset - value stored in build_load_dataset_ for the op being built
    // @return Builder - reference to this builder, so setter calls can be chained
    Builder &SetLoadDataset(bool load_dataset) {
      build_load_dataset_ = load_dataset;
      return *this;
    }

    Status SanityCheck() const;

    static int32_t num_mind_record_workers() { return kDefaultMindRecordWorkers; }
@@ -109,7 +114,8 @@ class MindRecordOp : public ParallelOp {
    int32_t builder_num_workers_;
    int32_t build_rows_per_buffer_;
    int32_t build_op_connector_queue_size_;
    std::string build_dataset_file_;
    std::vector<std::string> build_dataset_file_;
    bool build_load_dataset_;
    std::vector<std::string> build_columns_to_load_;
    std::vector<std::shared_ptr<ShardOperator>> build_operators_;
    bool build_block_reader_;
@@ -119,12 +125,12 @@ class MindRecordOp : public ParallelOp {
  // @note The builder class should be used to call it
  // @param num_mind_record_workers - The number of workers for the op (run by ShardReader)
  // @param rows_per_buffer - The requested number of rows per buffer
  // @param dataset_file - A shard file
  // @param dataset_file - dataset files
  // @param op_connector_queue_size - The output connector queue size
  // @param columns_to_load - The list of columns to use (column name)
  // @param operators - ShardOperators for Shuffle, Category, Sample
  MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer, std::string dataset_file,
               int32_t op_connector_queue_size, const std::vector<std::string> &columns_to_load,
  MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buffer, std::vector<std::string> dataset_file,
               bool load_dataset, int32_t op_connector_queue_size, const std::vector<std::string> &columns_to_load,
               const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader);

  // Destructor
@@ -169,21 +175,22 @@ class MindRecordOp : public ParallelOp {
  // Getter method
  int32_t num_rows() const { return num_rows_; }

  // Getter method
  static Status CountTotalRows(const std::string dataset_path, const std::shared_ptr<ShardOperator> &op,
                               int64_t *count);
  static Status CountTotalRows(const std::vector<std::string> dataset_path, bool load_dataset,
                               const std::shared_ptr<ShardOperator> &op, int64_t *count);

  // Getter method
  int32_t rows_per_buffer() const { return rows_per_buffer_; }

  // Getter method
  std::string dataset_file() const { return dataset_file_; }
  std::vector<std::string> dataset_file() const { return dataset_file_; }

  // Getter method
  std::vector<std::string> columns_to_load() const { return columns_to_load_; }

  // Getter method
  bool block_reader() const { return block_reader_; }

  // Getter method
  bool load_dataset() const { return load_dataset_; }

  Status Init();

  Status SetColumnsBlob();
@@ -246,7 +253,8 @@ class MindRecordOp : public ParallelOp {
  Status FetchBlockBuffer(const int32_t &buffer_id);

  int32_t rows_per_buffer_;                                // The number of requested rows per buffer.
  std::string dataset_file_;                               // A dataset file
  std::vector<std::string> dataset_file_;                  // dataset files
  bool load_dataset_;                                      // load dataset from single file or not
  std::vector<std::string> columns_to_load_;               // Columns to load from dataset
  std::vector<std::shared_ptr<ShardOperator>> operators_;  // ShardOperators to use
  int32_t num_mind_record_workers_;                        // number of workers to be spawned by ShardReader
--- a/mindspore/ccsrc/debug/trace_info.h
+++ b/mindspore/ccsrc/debug/trace_info.h
@@ -193,6 +193,14 @@ class TraceForAfter : public TraceInfo {
  TraceInfoPtr clone() override { return std::make_shared<TraceForAfter>(*shared_from_base<TraceForAfter>()); }
 };

 // Trace info record for the end of a loop. It passes the name "loop_end" and the
 // symbol "↓↓" to the TraceInfo base constructor.
 class TraceLoopEnd : public TraceInfo {
  public:
  // @param info - debug info forwarded to the TraceInfo base
  explicit TraceLoopEnd(const DebugInfoPtr &info) : TraceInfo(info, "loop_end", "↓↓") {}
  MS_DECLARE_PARENT(TraceLoopEnd, TraceInfo);
  ~TraceLoopEnd() override = default;
  // Returns a new shared TraceLoopEnd copy-constructed from this object.
  TraceInfoPtr clone() override { return std::make_shared<TraceLoopEnd>(*shared_from_base<TraceLoopEnd>()); }
 };

 class TraceEquiv : public TraceInfo {
 public:
  explicit TraceEquiv(const DebugInfoPtr &info) : TraceInfo(info, "equiv", "equiv") {}
--- a/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc
+++ b/mindspore/ccsrc/device/ascend/kernel_select_ascend.cc
@@ -34,6 +34,7 @@ namespace ascend {
 namespace {
 const float kWegihtBaseScore = 1;
 const float kFeatureMapBaseScore = 10;
 constexpr auto kPriChoosenFormat = "pri_format";
 enum MatchCountPriority : int {
  MATCH_COUNT_PRIORITY_BEGIN = 0,
  MATCH_DTYPE_COUNT = MATCH_COUNT_PRIORITY_BEGIN,
@@ -85,6 +86,7 @@ string GetPriorityMatchFormat(const CNodePtr &cnode) {
  if (need_change_nd) {
    priority_matched_format = kOpFormat_DEFAULT;
  }
  AnfAlgo::SetNodeAttr(kPriChoosenFormat, MakeValue(priority_matched_format), cnode);
  return priority_matched_format;
 }
 /**
@@ -394,9 +396,9 @@ void PrintRaiseOrReducePrecisionSelectedInfo(const CNodePtr &cnode,
  std::ostringstream buffer;
  buffer << cnode->DebugString();
  if (precision_reduce) {
    buffer << " reduce precision, node datatype: ";
    buffer << " reduce precision, node datatype: \n";
  } else {
    buffer << " raise precision, node datatype: ";
    buffer << " raise precision, node datatype: \n";
  }
  PrintInputAndOutputInferType(buffer, cnode);
  buffer << ", select kernel:" << selected_kernel_build_info->ToString();
@@ -464,66 +466,57 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
 }
 }  // namespace

 std::shared_ptr<kernel::KernelBuildInfo> CanHitKernelInfo(
  int *status, const CNodePtr &kernel_node,
  const std::vector<std::shared_ptr<kernel::KernelBuildInfo>> &kernel_info_list) {
 KernelSelectStatus SetMatchedKernelInfo(const CNodePtr &kernel_node,
                                        const std::vector<std::shared_ptr<kernel::KernelBuildInfo>> &kernel_info_list) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  KernelSelectStatus select_status = kNoMatched;
  bool precision_reduce = false;
  std::shared_ptr<kernel::KernelBuildInfo> selected_kernel_info = nullptr;
  // Matched kernel info
  // Filter kernel info matched with me infered type
  auto filtered_kernel_info_list = GetAllMatchedFilteredKernelInfo(kernel_node, kernel_info_list);
  if (!filtered_kernel_info_list.empty()) {
    selected_kernel_info = ChooseMatchedKernelInfo(kernel_node, filtered_kernel_info_list);
    select_status = kStatusAllMatched;
  } else {
    // selected kernel info using raised precision or reduce precision
    filtered_kernel_info_list =
      FilterRaisedOrReducePrecisionMatchedKernelInfo(kernel_node, kernel_info_list, &precision_reduce);
    selected_kernel_info = ChooseMatchedKernelInfo(kernel_node, filtered_kernel_info_list);
    if (selected_kernel_info == nullptr) {
      return nullptr;
      return select_status;
    } else {
      PrintRaiseOrReducePrecisionSelectedInfo(kernel_node, selected_kernel_info, precision_reduce);
      *status = precision_reduce ? kStatusReducePrecision : kStatusRaisePrecision;
      select_status = precision_reduce ? kStatusReducePrecision : kStatusRaisePrecision;
    }
  }
  return selected_kernel_info;
  // Set kernel info to the anfnode
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info, kernel_node.get());
  // Set format and data type for input tensor.
  SetTensorDeviceInfo(*selected_kernel_info, kernel_node);
  return select_status;
 }

 int SelectKernelInfo(const CNodePtr &kernel_node) {
 KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node) {
  std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list;
  int status = kStatusAllMatched;
  MS_EXCEPTION_IF_NULL(kernel_node);
  kernel::KernelQuery(kernel_node, &kernel_info_list);
  // filter kernel info matched with me infered type
  auto selected_kernel_info = CanHitKernelInfo(&status, kernel_node, kernel_info_list);
  if (selected_kernel_info == nullptr) {
  auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list);
  // If aicore not find valid kernel info reloading aicpu kernel info list to find it
  if (select_status == kNoMatched) {
    MS_LOG(WARNING) << "The node [" << kernel_node->DebugString()
                    << "] cannot find valid TBE kernel info, try to get aicpu kernel info";
    kernel::AicpuQuery(kernel_node, &kernel_info_list);
    selected_kernel_info = CanHitKernelInfo(&status, kernel_node, kernel_info_list);
    kernel::AICpuQuery(kernel_node, &kernel_info_list);
    select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list);
  }
  if (selected_kernel_info == nullptr) {
  // The kernel info not finded both in the aicpu kernel list & aicore kernel list
  if (select_status == kNoMatched) {
    std::ostringstream buffer;
    PrintInputAndOutputInferType(buffer, kernel_node);
    MS_EXCEPTION(TypeError) << "The node [" << kernel_node->DebugString()
                            << "] cannot find valid kernel info, not supported the type " << buffer.str();
  }
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info, kernel_node.get());
  // Set format and data type for input tensor.
  SetTensorDeviceInfo(*selected_kernel_info, kernel_node);
  return status;
 }

 bool CheckKernelAccuracySupported(const CNodePtr &kernel_node,
                                  const kernel::KernelBuildInfoPtr &new_kernel_build_info) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list;
  kernel::KernelQuery(kernel_node, &kernel_info_list);
  auto result = std::find_if(kernel_info_list.begin(), kernel_info_list.end(),
                             [&new_kernel_build_info](const kernel::KernelBuildInfoPtr item) {
                               MS_EXCEPTION_IF_NULL(item);
                               return *item == *new_kernel_build_info;
                             });
  return result != kernel_info_list.end();
  return select_status;
 }
 }  // namespace ascend
 }  // namespace device
--- a/mindspore/ccsrc/device/ascend/kernel_select_ascend.h
+++ b/mindspore/ccsrc/device/ascend/kernel_select_ascend.h
@@ -21,8 +21,13 @@
 namespace mindspore {
 namespace device {
 namespace ascend {
 int SelectKernelInfo(const CNodePtr &kernel_node);
 bool CheckKernelAccuracySupported(const CNodePtr &kernel_node, const kernel::KernelBuildInfoPtr &new_kernel_build_info);
 // Result of selecting a kernel build info for a node (returned by SelectKernelInfo).
 enum KernelSelectStatus {
  // No kernel build info could be selected for the node.
  kNoMatched = -1,
  // A kernel build info matching the inferred types was selected.
  kStatusAllMatched = 0,
  // A kernel build info was selected through the raise/reduce-precision filter, with precision reduced.
  kStatusReducePrecision = 1,
  // A kernel build info was selected through the raise/reduce-precision filter, with precision raised.
  kStatusRaisePrecision = 2,
 };
 KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node);
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore
--- a/mindspore/ccsrc/kernel/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/arrays/unsorted_segment_sum_gpu_kernel.h
@@ -69,9 +69,8 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_dim0_ * input_dim1_ * sizeof(T));
    input_size_list_.push_back(output_dim0_ * sizeof(S));
    input_size_list_.push_back(output_dim0_ * sizeof(int));
    output_size_list_.push_back(output_dim0_ * output_dim1_ * sizeof(S));
    input_size_list_.push_back(input_dim0_ * sizeof(S));
    output_size_list_.push_back(output_dim0_ * output_dim1_ * sizeof(T));
  }

 private:
--- a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cu
+++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cu
@@ -49,6 +49,21 @@ struct PowerFunc<half, half> {
  }
 };

 // Binary functor for the broadcast kernels: returns lhs / rhs converted to S.
 template <typename T, typename S>
 struct RealDivFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) {
    const S quotient = lhs / rhs;
    return quotient;
  }
 };

 // Binary functor for the broadcast kernels: returns lhs * rhs converted to S.
 template <typename T, typename S>
 struct MulFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) {
    const S product = lhs * rhs;
    return product;
  }
 };

 // Binary functor for the broadcast kernels: returns lhs - rhs converted to S.
 template <typename T, typename S>
 struct SubFunc {
  __device__ __forceinline__ S operator()(const T &lhs, const T &rhs) {
    const S difference = lhs - rhs;
    return difference;
  }
 };

 template <>
 struct PowerFunc<half, bool> {
  // invalid branch
@@ -94,6 +109,15 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
    case BROADCAST_TYPE_POWER:
      return BroadcastOperator<T, S, PowerFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                      output);
    case BROADCAST_TYPE_REALDIV:
      return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                      output);
    case BROADCAST_TYPE_MUL:
      return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                      output);
    case BROADCAST_TYPE_SUB:
      return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                      output);
  }
 }

@@ -127,6 +151,12 @@ __global__ void NoBroadcastKernel(const int nums, enum BroadcastOpType op, const
      return NoBroadcastOperator<T, S, MaximumFunc<T, S>>(nums, input0, input1, output);
    case BROADCAST_TYPE_POWER:
      return NoBroadcastOperator<T, S, PowerFunc<T, S>>(nums, input0, input1, output);
    case BROADCAST_TYPE_REALDIV:
      return NoBroadcastOperator<T, S, RealDivFunc<T, S>>(nums, input0, input1, output);
    case BROADCAST_TYPE_MUL:
      return NoBroadcastOperator<T, S, MulFunc<T, S>>(nums, input0, input1, output);
    case BROADCAST_TYPE_SUB:
      return NoBroadcastOperator<T, S, SubFunc<T, S>>(nums, input0, input1, output);
  }
 }

--- a/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cuh
+++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/broadcast_impl.cuh
@@ -25,6 +25,9 @@ enum BroadcastOpType {
  BROADCAST_TYPE_MAXIMUM = 2,
  BROADCAST_TYPE_MINIMUM = 3,
  BROADCAST_TYPE_POWER = 4,
  BROADCAST_TYPE_REALDIV = 5,
  BROADCAST_TYPE_MUL = 6,
  BROADCAST_TYPE_SUB = 7,
  BROADCAST_TYPE_INVALID = 0xffffffff,
 };

--- a/mindspore/ccsrc/kernel/gpu/math/binary_op_gpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/gpu/math/binary_op_gpu_kernel.cc
@@ -1,42 +0,0 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "kernel/gpu/math/binary_op_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 // Registers BinaryOpGpuKernel for the RealDiv, Mul and Sub ops. Each op is registered twice:
 // once with float32 inputs/output (BinaryOpGpuKernel<float>) and once with float16
 // inputs/output (BinaryOpGpuKernel<half>).

 // RealDiv: float32 and float16.
 MS_REG_GPU_KERNEL_ONE(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryOpGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryOpGpuKernel, half)
 // Mul: float32 and float16.
 MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryOpGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryOpGpuKernel, half)
 // Sub: float32 and float16.
 MS_REG_GPU_KERNEL_ONE(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryOpGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryOpGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/kernel/gpu/math/binary_op_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/math/binary_op_gpu_kernel.h
@@ -1,237 +0,0 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_BINARYOP_GPU_KERNEL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_BINARYOP_GPU_KERNEL_H_

 #include <cuda_runtime_api.h>
 #include <vector>
 #include <string>
 #include <map>
 #include "kernel/gpu/gpu_kernel.h"
 #include "kernel/gpu/gpu_kernel_factory.h"
 #include "kernel/gpu/cuda_impl/unary_op_impl.cuh"
 #include "kernel/gpu/kernel_constants.h"
 namespace mindspore {
 namespace kernel {
 // Binary operation identifiers. BINARY_OP_INVALID_TYPE is the value BinaryOpGpuKernel starts with
 // before the operator name has been resolved.
 enum BinaryOpType { BINARY_OP_ADD = 0, BINARY_OP_SUB, BINARY_OP_MUL, BINARY_OP_DIV, BINARY_OP_INVALID_TYPE = 255 };
 // Operator name -> BinaryOpType lookup. Note that no name maps to BINARY_OP_ADD.
 static const std::map<std::string, BinaryOpType> kBinaryOpTypeMap = {
  {"Sub", BINARY_OP_SUB}, {"Mul", BINARY_OP_MUL}, {"RealDiv", BINARY_OP_DIV}};
 // GPU kernel for the element-wise binary operators Sub, Mul and RealDiv, implemented with cudnnOpTensor.
 //
 // The kernel only uses CUDNN_OP_TENSOR_ADD and CUDNN_OP_TENSOR_MUL, so:
 //   * Sub     is evaluated as A + Negative(B)
 //   * RealDiv is evaluated as A * Reciprocal(B)
 // with the transformed B written to workspace[0] before the cuDNN call.
 // (Negative/Reciprocal come from unary_op_impl.cuh; presumably element-wise - confirm there.)
 //
 // Only one-sided broadcasting is accepted: Init() fails unless at least one input shape equals the
 // output shape.
 //
 // T is the element type of the device buffers.
 template <typename T>
 class BinaryOpGpuKernel : public GpuKernel {
 public:
  BinaryOpGpuKernel()
      : cudnn_handle_(nullptr),
        binary_op_type_(BINARY_OP_INVALID_TYPE),
        tensor_op_(CUDNN_OP_TENSOR_MUL),
        inputA_descriptor_(nullptr),
        inputB_descriptor_(nullptr),
        opTensor_descriptor_(nullptr),
        cudnn_data_type_(CUDNN_DATA_FLOAT),
        is_null_input_(false),
        input_size_(0),
        output_size_(0),
        workspace_size_(0) {}
  ~BinaryOpGpuKernel() override { DestroyResource(); }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  // Runs the operation on the given device buffers.
  // inputs[0] is A, inputs[1] is B, outputs[0] receives the result; workspace[0] is used as scratch
  // for Sub and RealDiv. Returns true immediately (doing nothing) when Init() flagged a null input.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, uintptr_t stream_ptr) override {
    if (is_null_input_) {
      return true;
    }
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    T *input_addr2 = GetDeviceAddress<T>(inputs, 1);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    const float alpha = 1;
    const float beta = 0;

    // Pick the buffer that is fed to cuDNN as operand B: either B itself (Mul) or its
    // negated / reciprocal copy stored in the workspace (Sub / RealDiv).
    T *inputB_addr = nullptr;
    switch (binary_op_type_) {
      case BINARY_OP_SUB: {
        T *workspace_addr = GetDeviceAddress<T>(workspace, 0);
        Negative(input_addr2, workspace_addr, inputs[1]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
        inputB_addr = workspace_addr;
        break;
      }
      case BINARY_OP_MUL: {
        inputB_addr = input_addr2;
        break;
      }
      case BINARY_OP_DIV: {
        T *workspace_addr = GetDeviceAddress<T>(workspace, 0);
        Reciprocal(input_addr2, workspace_addr, inputs[1]->size / sizeof(T),
                   reinterpret_cast<cudaStream_t>(stream_ptr));
        inputB_addr = workspace_addr;
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Binary operation " << binary_op_type_ << " is not supported.";
      }
    }
    // The larger operand goes first and its descriptor is reused for the output. Swapping the
    // operands is safe because only add and mul are issued to cuDNN.
    if (inputs[0]->size > inputs[1]->size) {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnOpTensor(cudnn_handle_, opTensor_descriptor_, &alpha, inputA_descriptor_, input_addr, &alpha,
                      inputB_descriptor_, inputB_addr, &beta, inputA_descriptor_, output_addr),
        "cudnnOpTensor failed");
    } else {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnOpTensor(cudnn_handle_, opTensor_descriptor_, &alpha, inputB_descriptor_, inputB_addr, &alpha,
                      inputA_descriptor_, input_addr, &beta, inputB_descriptor_, output_addr),
        "cudnnOpTensor failed");
    }
    return true;
  }

  // Validates the node (2 inputs, 1 output, one-sided broadcast only), resolves the operator type and
  // sets up the cuDNN tensor descriptors and the size lists.
  // Returns false on a validation failure, true otherwise (including the null-input case).
  bool Init(const CNodePtr &kernel_node) override {
    InitResource();
    cudnn_data_type_ = kCudnnDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))];
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 2) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but binary operation needs 2 inputs.";
      return false;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but binary operation needs 1 output.";
      return false;
    }
    InferBinaryType(kernel_node);
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto input_shapeB = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    if (input_shape != output_shape && input_shapeB != output_shape) {
      MS_LOG(ERROR) << "Double-sided broadcast was not supported in cudnn of cudnnOpTensor:\n"
                       "InputA must match the corresponding dimension of the destination tensor outC, and each "
                       "dimension of the inputB "
                       "must match the corresponding dimension of outC or must be equal to 1.";
      return false;
    }
    is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(input_shapeB);
    if (is_null_input_) {
      MS_LOG(WARNING) << "BinaryOpGpuKernel input is null";
      InitSizeLists();
      return true;
    }
    // Map each shape onto a 4-D NCHW descriptor, right-aligned and padded with 1 on the left.
    // NOTE(review): only the trailing four dimensions are used; shapes with more than 4 dims are
    // not rejected here.
    int shape_n = input_shape.size() < 4 ? 1 : SizeToInt(input_shape[input_shape.size() - 4]);
    int shape_c = input_shape.size() < 3 ? 1 : SizeToInt(input_shape[input_shape.size() - 3]);
    int shape_h = input_shape.size() < 2 ? 1 : SizeToInt(input_shape[input_shape.size() - 2]);
    int shape_w = input_shape.size() == 0 ? 1 : SizeToInt(input_shape[input_shape.size() - 1]);
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputA_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
                                                           shape_n, shape_c, shape_h, shape_w),
                                "cudnnSetTensor4dDescriptor failed");
    int shapeB_n = input_shapeB.size() < 4 ? 1 : SizeToInt(input_shapeB[input_shapeB.size() - 4]);
    int shapeB_c = input_shapeB.size() < 3 ? 1 : SizeToInt(input_shapeB[input_shapeB.size() - 3]);
    int shapeB_h = input_shapeB.size() < 2 ? 1 : SizeToInt(input_shapeB[input_shapeB.size() - 2]);
    int shapeB_w = input_shapeB.size() == 0 ? 1 : SizeToInt(input_shapeB[input_shapeB.size() - 1]);
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(inputB_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
                                                           shapeB_n, shapeB_c, shapeB_h, shapeB_w),
                                "cudnnSetTensor4dDescriptor failed");
    InitSizeLists();
    return true;
  }

 protected:
  // Fetches the cuDNN handle from the device manager and creates the three descriptors.
  void InitResource() override {
    cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputA_descriptor_),
                                "cudnnCreateTensorDescriptor failed.");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&inputB_descriptor_),
                                "cudnnCreateTensorDescriptor failed.");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateOpTensorDescriptor(&opTensor_descriptor_),
                                "cudnnCreateOpTensorDescriptor failed.");
  }

  // Fills the input / workspace / output size lists.
  // input_size_ is the byte size of input A; output_size_ (despite its name) is the byte size of
  // input B. Both stay 0 for a null input.
  void InitSizeLists() override {
    if (!is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_),
                                  "cudnnGetTensorSizeInBytes failed.");
      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(inputB_descriptor_, &output_size_),
                                  "cudnnGetTensorSizeInBytes failed.");
    }
    // The kernel always has two inputs, so always register two entries (zero-sized for null input).
    input_size_list_.push_back(input_size_);
    input_size_list_.push_back(output_size_);
    // Sub and RealDiv need scratch space for the transformed copy of B.
    if (binary_op_type_ == BINARY_OP_DIV || binary_op_type_ == BINARY_OP_SUB) {
      workspace_size_ = output_size_;
    }
    workspace_size_list_.push_back(workspace_size_);

    // The output is as large as the larger of the two inputs.
    output_size_list_.push_back(output_size_ > input_size_ ? output_size_ : input_size_);
  }

 private:
  // Releases the descriptors; failures are only reported, never thrown (called from the destructor).
  void DestroyResource() noexcept {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputA_descriptor_),
                               "cudnnDestroyTensorDescriptor failed.");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(inputB_descriptor_),
                               "cudnnDestroyTensorDescriptor failed.");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyOpTensorDescriptor(opTensor_descriptor_),
                               "cudnnDestroyOpTensorDescriptor failed.");
  }

  // Resolves binary_op_type_ from the node name and configures the cuDNN op descriptor:
  // RealDiv and Mul use CUDNN_OP_TENSOR_MUL, Sub uses CUDNN_OP_TENSOR_ADD.
  void InferBinaryType(const CNodePtr &kernel_node) {
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    auto iter = kBinaryOpTypeMap.find(kernel_name);
    if (iter == kBinaryOpTypeMap.end()) {
      MS_LOG(EXCEPTION) << "Binary operation " << kernel_name << " is not supported.";
    } else {
      binary_op_type_ = iter->second;
    }

    switch (binary_op_type_) {
      case BINARY_OP_DIV:
      case BINARY_OP_MUL: {
        tensor_op_ = CUDNN_OP_TENSOR_MUL;
        break;
      }
      case BINARY_OP_SUB: {
        tensor_op_ = CUDNN_OP_TENSOR_ADD;
        break;
      }
      default: {
        MS_LOG(EXCEPTION) << "Binary operation " << binary_op_type_ << " is not supported.";
      }
    }
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetOpTensorDescriptor(opTensor_descriptor_, tensor_op_, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN),
      "cudnnSetOpTensorDescriptor failed");
    return;
  }

  cudnnHandle_t cudnn_handle_;
  BinaryOpType binary_op_type_;
  cudnnOpTensorOp_t tensor_op_;
  cudnnTensorDescriptor_t inputA_descriptor_;
  cudnnTensorDescriptor_t inputB_descriptor_;
  cudnnOpTensorDescriptor_t opTensor_descriptor_;
  cudnnDataType_t cudnn_data_type_;
  bool is_null_input_;
  size_t input_size_;      // byte size of input A
  size_t output_size_;     // byte size of input B (name is historical)
  size_t workspace_size_;  // byte size of the scratch buffer, 0 for Mul
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
 };
 }  // namespace kernel
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_BINARYOP_GPU_KERNEL_H_
--- a/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.cc
@@ -37,6 +37,16 @@ MS_REG_GPU_KERNEL_TWO(
 MS_REG_GPU_KERNEL_TWO(
  Pow, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float, float)
 // RealDiv / Mul / Sub with float32 inputs and output, handled by BroadcastOpGpuKernel.
 MS_REG_GPU_KERNEL_TWO(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float, float)
 MS_REG_GPU_KERNEL_TWO(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float, float)
 MS_REG_GPU_KERNEL_TWO(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BroadcastOpGpuKernel, float, float)

 // fp16
 MS_REG_GPU_KERNEL_TWO(
@@ -57,5 +67,15 @@ MS_REG_GPU_KERNEL_TWO(
 MS_REG_GPU_KERNEL_TWO(
  Pow, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half, half)
 // RealDiv / Mul / Sub with float16 inputs and output, handled by BroadcastOpGpuKernel.
 MS_REG_GPU_KERNEL_TWO(
  RealDiv,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half, half)
 MS_REG_GPU_KERNEL_TWO(
  Mul, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half, half)
 MS_REG_GPU_KERNEL_TWO(
  Sub, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BroadcastOpGpuKernel, half, half)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h
@@ -98,7 +98,8 @@ class BroadcastOpGpuKernel : public GpuKernel {

    static std::map<std::string, BroadcastOpType> kBroadcastTypeMap = {
      {"Greater", BROADCAST_TYPE_GREATER}, {"Less", BROADCAST_TYPE_LESS}, {"Maximum", BROADCAST_TYPE_MAXIMUM},
      {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER},
      {"Minimum", BROADCAST_TYPE_MINIMUM}, {"Pow", BROADCAST_TYPE_POWER}, {"RealDiv", BROADCAST_TYPE_REALDIV},
      {"Mul", BROADCAST_TYPE_MUL},         {"Sub", BROADCAST_TYPE_SUB},
    };

    auto iter = kBroadcastTypeMap.find(kernel_name);
--- a/mindspore/ccsrc/kernel/gpu/nn/softmax_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/softmax_gpu_kernel.h
@@ -58,11 +58,6 @@ class SoftmaxGpuKernel : public GpuKernel {
    }
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    T *transpose_input_addr = GetDeviceAddress<T>(workspace, 0);
    T *transpose_output_addr = GetDeviceAddress<T>(workspace, 1);
    int *input_shape = GetDeviceAddress<int>(workspace, 2);
    int *transpose_shape = GetDeviceAddress<int>(workspace, 3);
    int *transpose_axis = GetDeviceAddress<int>(workspace, 4);
    const float alpha = 1;
    const float beta = 0;

@@ -71,6 +66,11 @@ class SoftmaxGpuKernel : public GpuKernel {
                                                      input_addr, &beta, output_descriptor_, output_addr),
                                  "cudnnSoftmaxForward failed");
    } else {
      T *transpose_input_addr = GetDeviceAddress<T>(workspace, 0);
      T *transpose_output_addr = GetDeviceAddress<T>(workspace, 1);
      int *input_shape = GetDeviceAddress<int>(workspace, 2);
      int *transpose_shape = GetDeviceAddress<int>(workspace, 3);
      int *transpose_axis = GetDeviceAddress<int>(workspace, 4);
      CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(input_shape, &input_shape_[0], workspace_size_, cudaMemcpyHostToDevice,
                                                 reinterpret_cast<cudaStream_t>(stream_ptr)),
                                 "cudaMemcpyAsync input_shape failed");
@@ -114,9 +114,6 @@ class SoftmaxGpuKernel : public GpuKernel {
      return true;
    }
    shape_size_ = SizeToInt(input_shape.size());
    if (shape_size_ != 2) {
      MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but softmax only supports 2-D inputs.";
    }
    auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    if (kernel_name == "LogSoftmax") {
      algo_ = CUDNN_SOFTMAX_LOG;
@@ -163,7 +160,15 @@ class SoftmaxGpuKernel : public GpuKernel {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(input_descriptor_), "destroy input_descriptor failed");
  }

  void InitSizeByAxis(const std::vector<size_t> input_shape, const int axis) {
  // Dispatches size initialisation on the input rank: anything that is not exactly 2-D takes the
  // last-dimension path, 2-D inputs take the dedicated 2-D path.
  void InitSizeByAxis(const std::vector<size_t> &input_shape, const int &axis) {
    const bool is_two_dim = (input_shape.size() == 2);
    if (!is_two_dim) {
      InitSizeByAxisLastDim(input_shape, axis);
      return;
    }
    InitSizeByAxis2D(input_shape, axis);
  }

  void InitSizeByAxis2D(const std::vector<size_t> &input_shape, const int &axis) {
    axis_ = axis;
    if (axis_ < 0) {
      axis_ += shape_size_;
@@ -191,6 +196,31 @@ class SoftmaxGpuKernel : public GpuKernel {
    workspace_size_ = IntToSize(shape_size_) * sizeof(int);
  }

  // Sets up the kernel for an N-D input whose softmax axis is the last dimension.
  // All leading dimensions are folded into the batch size, so the input is treated as a 2-D
  // (batch, channel) tensor with axis_ fixed to 1.
  //
  // input_shape: inferred shape of the input; must have at least one dimension.
  // axis: requested softmax axis; may be negative. Must resolve to the last dimension.
  // Raises (via MS_LOG(EXCEPTION)) when the shape is empty or the axis is not the last dimension.
  void InitSizeByAxisLastDim(const std::vector<size_t> &input_shape, const int &axis) {
    // Guard first: size() - 1 below would wrap around for a rank-0 shape.
    if (input_shape.empty()) {
      MS_LOG(EXCEPTION) << "Input is 0-D, but softmax needs at least 1-D inputs.";
    }
    const size_t last_dim = input_shape.size() - 1;
    int axis_pos = axis;
    if (axis_pos < 0) {
      // Explicit conversion avoids mixing int with size_t in the addition.
      axis_pos += SizeToInt(input_shape.size());
    }
    // axis should be -1 with ND
    if (axis_pos != SizeToInt(last_dim)) {
      MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but axis(" << axis << ") is invalid.";
    }
    // squeeze to 2d, then invoke cudnn
    size_t n = 1;
    for (size_t i = 0; i < last_dim; i++) {
      n *= input_shape[i];
    }
    axis_ = 1;
    batch_size_ = n;
    channel_size_ = input_shape[last_dim];
    height_ = 1;
    width_ = 1;
    input_size_ = sizeof(T) * batch_size_ * channel_size_ * height_ * width_;
    output_size_ = input_size_;
    input_shape_.push_back(batch_size_);
    input_shape_.push_back(channel_size_);
  }

  cudnnHandle_t cudnn_handle_;
  cudnnTensorDescriptor_t input_descriptor_;
  cudnnTensorDescriptor_t output_descriptor_;
--- a/mindspore/ccsrc/kernel/hccl/hccl_kernel_metadata.cc
+++ b/mindspore/ccsrc/kernel/hccl/hccl_kernel_metadata.cc
@@ -35,7 +35,7 @@ void HcclMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<K
  std::vector<std::string> input_format, output_format;
  std::vector<TypeId> input_type, output_type;
  for (const auto &data_type : data_type_list) {
    for (const auto &format : k4DSupportFormat) {
    for (const auto &format : kOpFormatList) {
      auto builder = std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>();
      input_format.clear();
      input_format.push_back(format);
--- a/mindspore/ccsrc/kernel/kernel_query.cc
+++ b/mindspore/ccsrc/kernel/kernel_query.cc
@@ -35,14 +35,18 @@ void FilterInvalidKernelInfo(const CNodePtr &kernel_node,
                       return AnfAlgo::GetOutputTensorNum(kernel_node) == kernel_build_info->GetOutputNum() &&
                              AnfAlgo::GetInputTensorNum(kernel_node) == kernel_build_info->GetInputNum();
                     });
  kernel_info_list->clear();
  if (!filtered_list.empty()) {
    kernel_info_list->clear();
    (void)std::copy(filtered_list.begin(), filtered_list.end(), std::back_inserter(*kernel_info_list));
  } else {
    MS_LOG(EXCEPTION) << "node" << kernel_node->DebugString() << "'s output size : ["
                      << AnfAlgo::GetOutputTensorNum(kernel_node) << "]"
                      << "input size : [" << AnfAlgo::GetInputTensorNum(kernel_node)
                      << "] cannot match any kernelInfo !";
    MS_LOG(WARNING) << "All kernel Info list does not match any kernel info ";
    for (size_t index; index < kernel_info_list->size(); ++index) {
      MS_EXCEPTION_IF_NULL(kernel_info_list->at(index));
      MS_LOG(WARNING) << "kernel [ " << index << " ] :" << kernel_info_list->at(index)->ToString();
    }
    MS_LOG(WARNING) << "node" << kernel_node->DebugString() << "'s output size : ["
                    << AnfAlgo::GetOutputTensorNum(kernel_node) << "]"
                    << "input size : [" << AnfAlgo::GetInputTensorNum(kernel_node) << "] cannot match any kernelInfo !";
  }
 }
 }  // namespace
@@ -50,7 +54,6 @@ void KernelQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(kernel_info_list);
  TbeMetadataInfo(kernel_node, kernel_info_list);

  if (kernel_info_list->empty()) {
    AicpuMetadataInfo(kernel_node, kernel_info_list);
  }
@@ -68,12 +71,41 @@ void KernelQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel
  FilterInvalidKernelInfo(kernel_node, kernel_info_list);
 }

 void AicpuQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel::KernelBuildInfo>> *kernel_info_list) {
 // Rebuilds *kernel_info_list for kernel_node from AICPU metadata only.
 // Any entries already in the list are discarded, the list is refilled by AicpuMetadataInfo and then
 // passed through FilterInvalidKernelInfo. Both pointers must be non-null.
 void AICpuQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel::KernelBuildInfo>> *kernel_info_list) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(kernel_info_list);
  // Start from an empty list so only AICPU candidates remain.
  kernel_info_list->clear();
  AicpuMetadataInfo(kernel_node, kernel_info_list);
  FilterInvalidKernelInfo(kernel_node, kernel_info_list);
 }
 // Returns true when select_kernel_build_info equals one of the AICPU kernel build infos that
 // AicpuMetadataInfo yields for kernel_node (after FilterInvalidKernelInfo).
 // kernel_node must be non-null and castable to CNodePtr; select_kernel_build_info must be non-null.
 bool IsSupportedByAiCpu(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(select_kernel_build_info);
  std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list;
  auto cnode = kernel_node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  AicpuMetadataInfo(cnode, &kernel_info_list);
  FilterInvalidKernelInfo(cnode, &kernel_info_list);
  // Take each candidate by const reference: copying the shared pointer per element is needless.
  return std::any_of(kernel_info_list.begin(), kernel_info_list.end(),
                     [&select_kernel_build_info](const kernel::KernelBuildInfoPtr &item) {
                       MS_EXCEPTION_IF_NULL(item);
                       return *item == *select_kernel_build_info;
                     });
 }

 // Returns true when select_kernel_build_info equals one of the kernel build infos that
 // TbeMetadataInfo yields for kernel_node (after FilterInvalidKernelInfo).
 // kernel_node must be non-null and castable to CNodePtr; select_kernel_build_info must be non-null.
 bool IsSupportedByAiCore(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(select_kernel_build_info);
  std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list;
  auto cnode = kernel_node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  TbeMetadataInfo(cnode, &kernel_info_list);
  FilterInvalidKernelInfo(cnode, &kernel_info_list);
  // Take each candidate by const reference: copying the shared pointer per element is needless.
  return std::any_of(kernel_info_list.begin(), kernel_info_list.end(),
                     [&select_kernel_build_info](const kernel::KernelBuildInfoPtr &item) {
                       MS_EXCEPTION_IF_NULL(item);
                       return *item == *select_kernel_build_info;
                     });
 }
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/kernel/kernel_query.h
+++ b/mindspore/ccsrc/kernel/kernel_query.h
@@ -26,7 +26,9 @@
 namespace mindspore {
 namespace kernel {
 void KernelQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel::KernelBuildInfo>> *kernel_info_list);
 void AicpuQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel::KernelBuildInfo>> *kernel_info_list);
 void AICpuQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel::KernelBuildInfo>> *kernel_info_list);
 bool IsSupportedByAiCpu(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info);
 bool IsSupportedByAiCore(const AnfNodePtr &kernel_node, const KernelBuildInfoPtr &select_kernel_build_info);
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_KERNEL_KERNEL_QUERY_H_
--- a/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc
+++ b/mindspore/ccsrc/kernel/tbe/tbe_kernel_select.cc
@@ -551,11 +551,6 @@ bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr<const OpIn
 }

 bool IsShapeMatchFormat(const std::vector<size_t> &shape, const std::string &format) {
  const std::set<std::string> kOpFormatList = {kOpFormat_DEFAULT, kOpFormat_NC1KHKWHWC0, kOpFormat_ND,
                                               kOpFormat_NCHW,    kOpFormat_NHWC,        kOpFormat_HWCN,
                                               kOpFormat_NC1HWC0, kOpFormat_FRAC_Z,      kOpFormat_C1HWNCoC0,
                                               kOpFormat_FRAC_NZ, kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04};

  // if format is default, it means all formats are supported
  if (kOpFormatList.find(format) == kOpFormatList.end()) {
    MS_LOG(EXCEPTION) << "Got the unknown format " << format;
--- a/mindspore/ccsrc/mindrecord/common/shard_error.cc
+++ b/mindspore/ccsrc/mindrecord/common/shard_error.cc
@@ -170,6 +170,9 @@ std::string ErrnoToMessage(MSRStatus status) {
    case IO_FAILED:
      return "io operate failed";
      break;
    case MATCH_HEADER_FAILED:
      return "match header failed";
      break;
    default:
      return "invalid error no";
  }
--- a/mindspore/ccsrc/mindrecord/common/shard_pybind.cc
+++ b/mindspore/ccsrc/mindrecord/common/shard_pybind.cc
@@ -84,7 +84,8 @@ void BindShardWriter(py::module *m) {
 void BindShardReader(const py::module *m) {
  (void)py::class_<ShardReader, std::shared_ptr<ShardReader>>(*m, "ShardReader", py::module_local())
    .def(py::init<>())
    .def("open", (MSRStatus(ShardReader::*)(const std::string &, const int &, const std::vector<std::string> &,
    .def("open", (MSRStatus(ShardReader::*)(const std::vector<std::string> &, bool, const int &,
                                            const std::vector<std::string> &,
                                            const std::vector<std::shared_ptr<ShardOperator>> &)) &
                   ShardReader::OpenPy)
    .def("launch", &ShardReader::Launch)
@@ -106,7 +107,8 @@ void BindShardIndexGenerator(const py::module *m) {
 void BindShardSegment(py::module *m) {
  (void)py::class_<ShardSegment>(*m, "ShardSegment", py::module_local())
    .def(py::init<>())
    .def("open", (MSRStatus(ShardSegment::*)(const std::string &, const int &, const std::vector<std::string> &,
    .def("open", (MSRStatus(ShardSegment::*)(const std::vector<std::string> &, bool, const int &,
                                             const std::vector<std::string> &,
                                             const std::vector<std::shared_ptr<ShardOperator>> &)) &
                   ShardSegment::OpenPy)
    .def("get_category_fields",
--- a/mindspore/ccsrc/mindrecord/include/shard_error.h
+++ b/mindspore/ccsrc/mindrecord/include/shard_error.h
@@ -72,7 +72,8 @@ enum MSRStatus {
  ILLEGAL_PARAMETERS,
  GET_PAGE_BY_GROUP_ID_FAILED,
  GET_SYSTEM_STATE_FAILED,
  IO_FAILED
  IO_FAILED,
  MATCH_HEADER_FAILED
 };

 // convert error no to string message
--- a/mindspore/ccsrc/mindrecord/include/shard_header.h
+++ b/mindspore/ccsrc/mindrecord/include/shard_header.h
@@ -35,10 +35,11 @@ class ShardHeader {
 public:
  ShardHeader();

  MSRStatus Build(const std::string &file_path);

  ~ShardHeader() = default;

  MSRStatus BuildDataset(const std::vector<std::string> &file_paths, bool load_dataset = true);

  static std::pair<MSRStatus, json> BuildSingleHeader(const std::string &file_path);
  /// \brief add the schema and save it
  /// \param[in] schema the schema needs to be added
  /// \return the last schema's id
@@ -126,7 +127,7 @@ class ShardHeader {
  MSRStatus FileToPages(const std::string dump_file_name);

 private:
  MSRStatus InitializeHeader(const std::vector<json> &headers);
  MSRStatus InitializeHeader(const std::vector<json> &headers, bool load_dataset);

  /// \brief get the headers from all the shard data
  /// \param[in] the shard data real path
@@ -137,9 +138,9 @@ class ShardHeader {
  MSRStatus ValidateField(const std::vector<std::string> &field_name, json schema, const uint64_t &schema_id);

  /// \brief check the binary file status
  MSRStatus CheckFileStatus(const std::string &path);
  static MSRStatus CheckFileStatus(const std::string &path);

  std::pair<MSRStatus, json> ValidateHeader(const std::string &path);
  static std::pair<MSRStatus, json> ValidateHeader(const std::string &path);

  void ParseHeader(const json &header);

@@ -149,7 +150,7 @@ class ShardHeader {

  MSRStatus CheckIndexField(const std::string &field, const json &schema);

  void ParsePage(const json &page);
  void ParsePage(const json &page, int shard_index, bool load_dataset);

  MSRStatus ParseStatistics(const json &statistics);

--- a/mindspore/ccsrc/mindrecord/include/shard_reader.h
+++ b/mindspore/ccsrc/mindrecord/include/shard_reader.h
@@ -68,23 +68,25 @@ class ShardReader {
  virtual ~ShardReader();

  /// \brief open files and initialize reader, c++ API
  /// \param[in] file_path the path of ONE file, any file in dataset is fine
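  /// \param[in] file_paths either the path of ONE file of the dataset (any file is fine) or an explicit file list
  /// \param[in] load_dataset if true, file_paths must hold a single file and the other files of the dataset are
  ///            taken from its header; if false, exactly the files in file_paths are used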
  /// \param[in] n_consumer number of threads when reading
  /// \param[in] selected_columns column list to be populated
  /// \param[in] operators operators applied to data, operator type is shuffle, sample or category
  /// \param[in] block_reader block-reader mode if true, otherwise row-reader mode
  /// \return MSRStatus the status of MSRStatus
  MSRStatus Open(const std::string &file_path, int n_consumer = 4,
  MSRStatus Open(const std::vector<std::string> &file_paths, bool load_dataset, int n_consumer = 4,
                 const std::vector<std::string> &selected_columns = {},
                 const std::vector<std::shared_ptr<ShardOperator>> &operators = {}, const bool &block_reader = false);

  /// \brief open files and initialize reader, python API
  /// \param[in] file_path the path of ONE file, any file in dataset is fine
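  /// \param[in] file_paths either the path of ONE file of the dataset (any file is fine) or an explicit file list
  /// \param[in] load_dataset if true, file_paths must hold a single file and the other files of the dataset are
  ///            taken from its header; if false, exactly the files in file_paths are used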
  /// \param[in] n_consumer number of threads when reading
  /// \param[in] selected_columns column list to be populated
  /// \param[in] operators operators applied to data, operator type is shuffle, sample or category
  /// \return MSRStatus the status of MSRStatus
  MSRStatus OpenPy(const std::string &file_path, const int &n_consumer = 4,
  MSRStatus OpenPy(const std::vector<std::string> &file_paths, bool load_dataset, const int &n_consumer = 4,
                   const std::vector<std::string> &selected_columns = {},
                   const std::vector<std::shared_ptr<ShardOperator>> &operators = {});

@@ -114,11 +116,13 @@ class ShardReader {
  int GetShardCount() const;

  /// \brief get the number of rows in database
  /// \param[in] file_path the path of ONE file, any file in dataset is fine
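  /// \param[in] file_paths either the path of ONE file of the dataset (any file is fine) or an explicit file list
  /// \param[in] load_dataset if true, file_paths must hold a single file and the other files of the dataset are
  ///            taken from its header; if false, exactly the files in file_paths are used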
  /// \param[in] op smart pointer refer to ShardCategory or ShardSample object
  /// \param[out] count # of rows
  /// \return MSRStatus the status of MSRStatus
  MSRStatus CountTotalRows(const std::string &file_path, const std::shared_ptr<ShardOperator> &op, int64_t *count);
  MSRStatus CountTotalRows(const std::vector<std::string> &file_paths, bool load_dataset,
                           const std::shared_ptr<ShardOperator> &op, int64_t *count);

  /// \brief shuffle task with incremental seed
  /// \return void
@@ -220,7 +224,7 @@ class ShardReader {
                               std::vector<std::vector<json>> &column_values);

  /// \brief initialize reader
  MSRStatus Init(const std::string &file_path);
  MSRStatus Init(const std::vector<std::string> &file_paths, bool load_dataset);

  /// \brief validate column list
  MSRStatus CheckColumnList(const std::vector<std::string> &selected_columns);
@@ -292,8 +296,9 @@ class ShardReader {
  void GetClassesInShard(sqlite3 *db, int shard_id, const std::string sql, std::set<std::string> &categories);

  /// \brief get number of classes
  int64_t GetNumClasses(const std::string &file_path, const std::string &category_field);
  int64_t GetNumClasses(const std::string &category_field);

  std::pair<MSRStatus, std::vector<std::string>> GetMeta(const std::string &file_path, json &meta_data);
  /// \brief get exactly blob fields data by indices
  std::vector<uint8_t> ExtractBlobFieldBySelectColumns(std::vector<uint8_t> &blob_fields_bytes,
                                                       std::vector<uint32_t> &ordered_selected_columns_index);
--- a/mindspore/ccsrc/mindrecord/io/shard_index_generator.cc
+++ b/mindspore/ccsrc/mindrecord/io/shard_index_generator.cc
@@ -36,9 +36,23 @@ ShardIndexGenerator::ShardIndexGenerator(const std::string &file_path, bool appe
      write_success_(true) {}

 MSRStatus ShardIndexGenerator::Build() {
  auto ret = ShardHeader::BuildSingleHeader(file_path_);
  if (ret.first != SUCCESS) {
    return FAILED;
  }
  auto json_header = ret.second;

  auto ret2 = GetParentDir(file_path_);
  if (SUCCESS != ret2.first) {
    return FAILED;
  }
  std::vector<std::string> real_addresses;
  for (const auto &path : json_header["shard_addresses"]) {
    std::string abs_path = ret2.second + string(path);
    real_addresses.emplace_back(abs_path);
  }
  ShardHeader header = ShardHeader();
  if (header.Build(file_path_) != SUCCESS) {
    MS_LOG(ERROR) << "Build shard schema failed.";
  if (header.BuildDataset(real_addresses) == FAILED) {
    return FAILED;
  }
  shard_header_ = header;
--- a/mindspore/ccsrc/mindrecord/io/shard_reader.cc
+++ b/mindspore/ccsrc/mindrecord/io/shard_reader.cc
@@ -47,20 +47,55 @@ ShardReader::ShardReader() {
  block_reader_ = false;
 }

 MSRStatus ShardReader::Init(const std::string &file_path) {
 // Reads the header of one mindrecord file.
 // On success, fills meta_data with six header fields and returns {SUCCESS, <the header's "shard_addresses">}.
 // Returns {FAILED, {}} when the path is not a legal file or the single-file header cannot be built.
 std::pair<MSRStatus, std::vector<std::string>> ShardReader::GetMeta(const std::string &file_path, json &meta_data) {
  // Reject the path up front when it fails the legality check.
  if (!IsLegalFile(file_path)) {
    return {FAILED, {}};
  }

  // Build the header of this one file only; bail out if that does not succeed.
  auto header_result = ShardHeader::BuildSingleHeader(file_path);
  if (SUCCESS != header_result.first) {
    return {FAILED, {}};
  }

  // Work on a local copy of the header json.
  auto header_json = header_result.second;
  // Copy the six fields below into meta_data; "shard_addresses" is handed back separately.
  meta_data = {{"header_size", header_json["header_size"]},   {"page_size", header_json["page_size"]},
               {"version", header_json["version"]},           {"index_fields", header_json["index_fields"]},
               {"schema", header_json["schema"]},             {"blob_fields", header_json["blob_fields"]}};
  return {SUCCESS, header_json["shard_addresses"]};
 }

 MSRStatus ShardReader::Init(const std::vector<std::string> &file_paths, bool load_dataset) {
  std::string file_path = file_paths[0];
  json first_meta_data = json();
  auto ret = GetMeta(file_path, first_meta_data);
  if (ret.first != SUCCESS) {
    return FAILED;
  }
  ShardHeader sh = ShardHeader();
  if (sh.Build(file_path) == FAILED) {
  if (file_paths.size() == 1 && load_dataset == true) {
    auto ret2 = GetParentDir(file_path);
    if (SUCCESS != ret2.first) {
      return FAILED;
    }
    std::vector<std::string> real_addresses;
    for (const auto &path : ret.second) {
      std::string abs_path = ret2.second + string(path);
      real_addresses.emplace_back(abs_path);
    }
    file_paths_ = real_addresses;
  } else if (file_paths.size() >= 1 && load_dataset == false) {
    file_paths_ = file_paths;
  } else {
    MS_LOG(ERROR) << "Error in parameter file_path or load_dataset.";
    return FAILED;
  }
  shard_header_ = std::make_shared<ShardHeader>(sh);
  header_size_ = shard_header_->GetHeaderSize();
  page_size_ = shard_header_->GetPageSize();
  file_paths_ = shard_header_->GetShardAddresses();

  for (const auto &file : file_paths_) {
    json meta_data = json();
    auto ret1 = GetMeta(file, meta_data);
    if (ret1.first != SUCCESS) {
      return FAILED;
    }
    if (meta_data != first_meta_data) {
      MS_LOG(ERROR) << "Mindrecord files meta information is different.";
      return FAILED;
    }
    sqlite3 *db = nullptr;
    // sqlite3_open create a database if not found, use sqlite3_open_v2 instead of it
    int rc = sqlite3_open_v2(common::SafeCStr(file + ".db"), &db, SQLITE_OPEN_READONLY, nullptr);
@@ -91,7 +126,13 @@ MSRStatus ShardReader::Init(const std::string &file_path) {
    }
    database_paths_.push_back(db);
  }

  ShardHeader sh = ShardHeader();
  if (sh.BuildDataset(file_paths_, load_dataset) == FAILED) {
    return FAILED;
  }
  shard_header_ = std::make_shared<ShardHeader>(sh);
  header_size_ = shard_header_->GetHeaderSize();
  page_size_ = shard_header_->GetPageSize();
  num_rows_ = 0;
  auto row_group_summary = ReadRowGroupSummary();
  for (const auto &rg : row_group_summary) {
@@ -248,7 +289,6 @@ MSRStatus ShardReader::ConvertLabelToJson(const std::vector<std::vector<std::str
        fs->close();
        return FAILED;
      }

      json label_json = json::from_msgpack(label_raw);
      json tmp;
      if (!columns.empty()) {
@@ -713,15 +753,9 @@ MSRStatus ShardReader::Finish() {
  return SUCCESS;
 }

 int64_t ShardReader::GetNumClasses(const std::string &file_path, const std::string &category_field) {
  ShardHeader sh = ShardHeader();
  if (sh.Build(file_path) == FAILED) {
    return -1;
  }
  auto header = std::make_shared<ShardHeader>(sh);
  auto file_paths = header->GetShardAddresses();
  auto shard_count = file_paths.size();
  auto index_fields = header->GetFields();
 int64_t ShardReader::GetNumClasses(const std::string &category_field) {
  auto shard_count = file_paths_.size();
  auto index_fields = shard_header_->GetFields();

  std::map<std::string, int64_t> map_schema_id_fields;
  for (auto &field : index_fields) {
@@ -742,7 +776,7 @@ int64_t ShardReader::GetNumClasses(const std::string &file_path, const std::stri
  std::set<std::string> categories;
  for (int x = 0; x < shard_count; x++) {
    sqlite3 *db = nullptr;
    int rc = sqlite3_open_v2(common::SafeCStr(file_paths[x] + ".db"), &db, SQLITE_OPEN_READONLY, nullptr);
    int rc = sqlite3_open_v2(common::SafeCStr(file_paths_[x] + ".db"), &db, SQLITE_OPEN_READONLY, nullptr);
    if (SQLITE_OK != rc) {
      MS_LOG(ERROR) << "Can't open database, error: " << sqlite3_errmsg(db);
      return -1;
@@ -756,16 +790,16 @@ int64_t ShardReader::GetNumClasses(const std::string &file_path, const std::stri
  return categories.size();
 }

 MSRStatus ShardReader::CountTotalRows(const std::string &file_path, const std::shared_ptr<ShardOperator> &op,
                                      int64_t *count) {
  if (Init(file_path) == FAILED) {
 MSRStatus ShardReader::CountTotalRows(const std::vector<std::string> &file_paths, bool load_dataset,
                                      const std::shared_ptr<ShardOperator> &op, int64_t *count) {
  if (SUCCESS != Init(file_paths, load_dataset)) {
    return FAILED;
  }
  int64_t num_samples = num_rows_;
  if (std::dynamic_pointer_cast<ShardCategory>(op)) {
    auto category_op = std::dynamic_pointer_cast<ShardCategory>(op);
    std::string category_field = category_op->GetCategoryField();
    auto num_classes = GetNumClasses(file_path, category_field);
    auto num_classes = GetNumClasses(category_field);
    num_samples = category_op->GetNumSamples(num_rows_, num_classes);
  } else if (std::dynamic_pointer_cast<ShardSample>(op)) {
    num_samples = op->GetNumSamples(num_rows_, 0);
@@ -779,12 +813,13 @@ MSRStatus ShardReader::CountTotalRows(const std::string &file_path, const std::s
  return SUCCESS;
 }

 MSRStatus ShardReader::Open(const std::string &file_path, int n_consumer,
 MSRStatus ShardReader::Open(const std::vector<std::string> &file_paths, bool load_dataset, int n_consumer,
                            const std::vector<std::string> &selected_columns,
                            const std::vector<std::shared_ptr<ShardOperator>> &operators, const bool &block_reader) {
  // Open file and set header by ShardReader
  if (Init(file_path) == FAILED) {
    return FAILED;
  auto ret = Init(file_paths, load_dataset);
  if (SUCCESS != ret) {
    return ret;
  }
  auto thread_limit = GetMaxThreadNum();
  if (n_consumer > thread_limit) {
@@ -837,11 +872,11 @@ MSRStatus ShardReader::Open(const std::string &file_path, int n_consumer,
  return SUCCESS;
 }

 MSRStatus ShardReader::OpenPy(const std::string &file_path, const int &n_consumer,
 MSRStatus ShardReader::OpenPy(const std::vector<std::string> &file_paths, bool load_dataset, const int &n_consumer,
                              const std::vector<std::string> &selected_columns,
                              const std::vector<std::shared_ptr<ShardOperator>> &operators) {
  // Open file and set header by ShardReader
  if (Init(file_path) == FAILED) {
  if (SUCCESS != Init(file_paths, load_dataset)) {
    return FAILED;
  }
  // should remove blob field from selected_columns when call from python
--- a/mindspore/ccsrc/mindrecord/io/shard_writer.cc
+++ b/mindspore/ccsrc/mindrecord/io/shard_writer.cc
@@ -174,12 +174,25 @@ MSRStatus ShardWriter::OpenForAppend(const std::string &path) {
  if (!IsLegalFile(path)) {
    return FAILED;
  }
  ShardHeader sh = ShardHeader();
  if (sh.Build(path) == FAILED) {
  auto ret1 = ShardHeader::BuildSingleHeader(path);
  if (ret1.first != SUCCESS) {
    return FAILED;
  }
  shard_header_ = std::make_shared<ShardHeader>(sh);
  auto paths = shard_header_->GetShardAddresses();
  auto json_header = ret1.second;
  auto ret2 = GetParentDir(path);
  if (SUCCESS != ret2.first) {
    return FAILED;
  }
  std::vector<std::string> real_addresses;
  for (const auto &path : json_header["shard_addresses"]) {
    std::string abs_path = ret2.second + string(path);
    real_addresses.emplace_back(abs_path);
  }
  ShardHeader header = ShardHeader();
  if (header.BuildDataset(real_addresses) == FAILED) {
    return FAILED;
  }
  shard_header_ = std::make_shared<ShardHeader>(header);
  MSRStatus ret = SetHeaderSize(shard_header_->GetHeaderSize());
  if (ret == FAILED) {
    return FAILED;
@@ -188,7 +201,7 @@ MSRStatus ShardWriter::OpenForAppend(const std::string &path) {
  if (ret == FAILED) {
    return FAILED;
  }
  ret = Open(paths, true);
  ret = Open(json_header["shard_addresses"], true);
  if (ret == FAILED) {
    MS_LOG(ERROR) << "Open file failed";
    return FAILED;
--- a/mindspore/ccsrc/mindrecord/meta/shard_header.cc
+++ b/mindspore/ccsrc/mindrecord/meta/shard_header.cc
@@ -35,8 +35,9 @@ namespace mindrecord {
 std::atomic<bool> thread_status(false);
 ShardHeader::ShardHeader() : shard_count_(0), header_size_(0), page_size_(0) { index_ = std::make_shared<Index>(); }

 MSRStatus ShardHeader::InitializeHeader(const std::vector<json> &headers) {
 MSRStatus ShardHeader::InitializeHeader(const std::vector<json> &headers, bool load_dataset) {
  shard_count_ = headers.size();
  int shard_index = 0;
  bool first = true;
  for (const auto &header : headers) {
    if (first) {
@@ -54,7 +55,8 @@ MSRStatus ShardHeader::InitializeHeader(const std::vector<json> &headers) {
      header_size_ = header["header_size"].get<uint64_t>();
      page_size_ = header["page_size"].get<uint64_t>();
    }
    ParsePage(header["page"]);
    ParsePage(header["page"], shard_index, load_dataset);
    shard_index++;
  }
  return SUCCESS;
 }
@@ -136,40 +138,39 @@ std::pair<MSRStatus, json> ShardHeader::ValidateHeader(const std::string &path)
  return {SUCCESS, json_header};
 }

 MSRStatus ShardHeader::Build(const std::string &file_path) {
 std::pair<MSRStatus, json> ShardHeader::BuildSingleHeader(const std::string &file_path) {
  auto ret = ValidateHeader(file_path);
  if (SUCCESS != ret.first) {
    return FAILED;
  }
  json main_header = ret.second;
  json addresses = main_header["shard_addresses"];
  vector<string> real_addresses;
  auto ret1 = GetParentDir(file_path);
  if (SUCCESS != ret1.first) {
    return FAILED;
    return {FAILED, json()};
  }
  std::string parent_dir = ret1.second;
  json raw_header = ret.second;
  json header = {{"shard_addresses", raw_header["shard_addresses"]},
                 {"header_size", raw_header["header_size"]},
                 {"page_size", raw_header["page_size"]},
                 {"index_fields", raw_header["index_fields"]},
                 {"blob_fields", raw_header["schema"][0]["blob_fields"]},
                 {"schema", raw_header["schema"][0]["schema"]},
                 {"version", raw_header["version"]}};
  return {SUCCESS, header};
 }

  for (const auto &addr : addresses) {
    std::string absolute_path = parent_dir + string(addr);
    real_addresses.emplace_back(absolute_path);
  }
 MSRStatus ShardHeader::BuildDataset(const std::vector<std::string> &file_paths, bool load_dataset) {
  uint32_t thread_num = std::thread::hardware_concurrency();
  if (thread_num == 0) thread_num = kThreadNumber;
  uint32_t work_thread_num = 0;
  uint32_t addr_count = real_addresses.size();
  int group_num = ceil(addr_count * 1.0 / thread_num);
  uint32_t shard_count = file_paths.size();
  int group_num = ceil(shard_count * 1.0 / thread_num);
  std::vector<std::thread> thread_set(thread_num);
  std::vector<json> headers(addr_count);
  std::vector<json> headers(shard_count);
  for (uint32_t x = 0; x < thread_num; ++x) {
    int start_num = x * group_num;
    int end_num = ((x + 1) * group_num > addr_count) ? addr_count : (x + 1) * group_num;
    int end_num = ((x + 1) * group_num > shard_count) ? shard_count : (x + 1) * group_num;
    if (start_num >= end_num) {
      continue;
    }

    thread_set[x] =
      std::thread(&ShardHeader::GetHeadersOneTask, this, start_num, end_num, std::ref(headers), real_addresses);
      std::thread(&ShardHeader::GetHeadersOneTask, this, start_num, end_num, std::ref(headers), file_paths);
    work_thread_num++;
  }

@@ -180,7 +181,7 @@ MSRStatus ShardHeader::Build(const std::string &file_path) {
    thread_status = false;
    return FAILED;
  }
  if (SUCCESS != InitializeHeader(headers)) {
  if (SUCCESS != InitializeHeader(headers, load_dataset)) {
    return FAILED;
  }
  return SUCCESS;
@@ -247,7 +248,8 @@ MSRStatus ShardHeader::ParseIndexFields(const json &index_fields) {
  return SUCCESS;
 }

 void ShardHeader::ParsePage(const json &pages) {
 void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) {
  // set shard_index when load_dataset is false
  if (pages_.empty() && shard_count_ <= kMaxShardCount) {
    pages_.resize(shard_count_);
  }
@@ -267,7 +269,11 @@ void ShardHeader::ParsePage(const json &pages) {

    std::shared_ptr<Page> parsed_page = std::make_shared<Page>(page_id, shard_id, page_type, page_type_id, start_row_id,
                                                               end_row_id, row_group_ids, page_size);
    pages_[shard_id].push_back(std::move(parsed_page));
    if (load_dataset == true) {
      pages_[shard_id].push_back(std::move(parsed_page));
    } else {
      pages_[shard_index].push_back(std::move(parsed_page));
    }
  }
 }

@@ -709,7 +715,7 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) {

  std::string line;
  while (std::getline(page_in_handle, line)) {
    ParsePage(json::parse(line));
    ParsePage(json::parse(line), -1, true);
  }

  page_in_handle.close();
--- a/mindspore/ccsrc/pipeline/parse/parse.cc
+++ b/mindspore/ccsrc/pipeline/parse/parse.cc
@@ -89,6 +89,9 @@ void Parser::BuildMethodMap() {
  stmt_method_map_["FunctionDef"] = &Parser::ParseFunctionDef;
  stmt_method_map_["AugAssign"] = &Parser::ParseAugAssign;
  stmt_method_map_["Global"] = &Parser::ParseGlobal;
  stmt_method_map_["Break"] = &Parser::ParseBreak;
  stmt_method_map_["Continue"] = &Parser::ParseContinue;
  stmt_method_map_["Pass"] = &Parser::ParsePass;
  expr_method_map_["NoneType"] = &Parser::ParseNone;
  expr_method_map_["BinOp"] = &Parser::ParseBinOp;
  expr_method_map_["Name"] = &Parser::ParseName;
@@ -270,6 +273,8 @@ FunctionBlockPtr Parser::ParseStatements(FunctionBlockPtr fn_block, const py::ob
    // insert appropriate depended items for the function block if it has a return node
    if (fn_block->func_graph()->get_return() != nullptr) {
      fn_block->InsertDependItemsBeforeReturn();
      // Skip statements after 'return' (or 'break', 'continue').
      break;
    }
  }
  return fn_block;
@@ -600,9 +605,11 @@ AnfNodePtr Parser::ParseAttribute(const FunctionBlockPtr &block, const py::objec
      std::string var_name = "self.";
      std::string attr_name = node.attr("attr").cast<std::string>();
      (void)var_name.append(attr_name);
      auto obj = ast()->obj().attr(attr_name.c_str());
      auto attr_obj = ast()->obj().attr(attr_name.c_str());
      if (py::hasattr(ast()->obj(), attr_name.c_str()) &&
          (data_converter::IsCellInstance(obj) || py::hasattr(obj, PYTHON_PRIMITIVE_FLAG))) {
          (py::hasattr(attr_obj, PYTHON_PRIMITIVE_FLAG) || py::isinstance<py::int_>(attr_obj) ||
           py::isinstance<py::float_>(attr_obj) || py::isinstance<py::bool_>(attr_obj) ||
           py::isinstance<py::str>(attr_obj) || data_converter::IsCellInstance(attr_obj))) {
        return block->MakeResolveSymbol(var_name);
      } else {
        return block->ReadVariable(var_name);
@@ -944,9 +951,6 @@ FunctionBlockPtr Parser::ParseWhile(const FunctionBlockPtr &block, const py::obj
  MS_LOG(INFO) << "Parse while statement";
  TraceManager::DebugTrace(std::make_shared<TraceWhileHeader>(block->func_graph()->debug_info()));
  FunctionBlockPtr header_block = MakeFunctionBlock(*this);
  if (MsContext::GetInstance()->is_multi_graph_sink()) {
    header_block->func_graph()->set_flags(FUNC_GRAPH_FLAG_IGNORE_VALUES, true);
  }
  TraceManager::EndTrace();

  TraceManager::DebugTrace(std::make_shared<TraceWhileBody>(block->func_graph()->debug_info()));
@@ -966,13 +970,24 @@ FunctionBlockPtr Parser::ParseWhile(const FunctionBlockPtr &block, const py::obj
  body_block->Mature();
  header_block->ConditionalJump(condition_node, body_block, after_block);

  // Parse loop body statements with loop context.
  LoopContext loop_context{&loops_, header_block, nullptr};
  py::object body_node = python_adapter::GetPyObjAttr(node, "body");
  FunctionBlockPtr after_body = ParseStatements(body_block, body_node);
  if (after_body->func_graph()->get_return() == nullptr) {
    after_body->Jump(header_block, nullptr);
  }

  header_block->Mature();
  after_block->Mature();
  auto &end_block = loop_context.EndBlock();
  if (end_block) {
    // end_block exists if we encounter 'break' in loop body.
    after_block->Jump(end_block, nullptr);
    end_block->Mature();
    return end_block;
  }
  // No 'break', no end_block.
  return after_block;
 }

@@ -1049,13 +1064,24 @@ FunctionBlockPtr Parser::ParseFor(const FunctionBlockPtr &block, const py::objec
  body_block->Mature();
  header_block->ConditionalJump(cond_apply, body_block, after_block);

  // Parse loop body statements with loop context.
  LoopContext loop_context{&loops_, header_block, iter2_app};
  py::object body_node = python_adapter::GetPyObjAttr(node, "body");
  FunctionBlockPtr after_body_block = ParseStatements(body_block, body_node);
  if (after_body_block->func_graph()->get_return() == nullptr) {
    after_body_block->Jump(header_block, iter2_app);
  }

  header_block->Mature();
  after_block->Mature();
  auto &end_block = loop_context.EndBlock();
  if (end_block) {
    // end_block exists if we encounter 'break' in loop body.
    after_block->Jump(end_block, nullptr);
    end_block->Mature();
    return end_block;
  }
  // No 'break', no end_block.
  return after_block;
 }
 AnfNodePtr Parser::ParseIfExp(const FunctionBlockPtr &block, const py::object &node) {
@@ -1222,6 +1248,52 @@ FunctionBlockPtr Parser::ParseAssign(const FunctionBlockPtr &block, const py::ob
  return block;
 }

 // Handles a 'break' statement: jumps from `block` to the end block of the innermost loop and returns `block`.
 // Raises through MS_LOG(EXCEPTION) when no loop is currently on the loop stack.
 FunctionBlockPtr Parser::ParseBreak(const FunctionBlockPtr &block, const py::object &node) {
  const bool inside_loop = !loops_.empty();
  if (!inside_loop) {
    // A 'break' with no enclosing loop is an error; look up the source location for the message.
    py::list location = ast_->CallParserObjMethod(PYTHON_PARSE_GET_LOCATION, node);
    if (location.size() < 2) {
      MS_LOG(EXCEPTION) << "List size should not be less than 2.";
    }
    auto file_name = location[0].cast<std::string>();
    auto line = location[1].cast<int>();
    MS_LOG(EXCEPTION) << "Unexpected 'break' at " << file_name << ":" << line;
  }

  // The innermost loop sits on top of the stack.
  Loop &innermost = loops_.top();
  if (!innermost.end) {
    // First 'break' seen for this loop: create its end block now.
    TraceManager::DebugTrace(std::make_shared<TraceLoopEnd>(block->func_graph()->debug_info()));
    innermost.end = MakeFunctionBlock(*this);
    TraceManager::EndTrace();
  }

  // Leave the loop by jumping to its end block.
  block->Jump(innermost.end, nullptr);
  return block;
 }

 // Handles a 'continue' statement: jumps from `block` back to the header of the innermost loop, passing along
 // that loop's stored iterator node, and returns `block`.
 // Raises through MS_LOG(EXCEPTION) when no loop is currently on the loop stack.
 FunctionBlockPtr Parser::ParseContinue(const FunctionBlockPtr &block, const py::object &node) {
  const bool inside_loop = !loops_.empty();
  if (!inside_loop) {
    // A 'continue' with no enclosing loop is an error; look up the source location for the message.
    py::list location = ast_->CallParserObjMethod(PYTHON_PARSE_GET_LOCATION, node);
    if (location.size() < 2) {
      MS_LOG(EXCEPTION) << "List size should not be less than 2.";
    }
    auto file_name = location[0].cast<std::string>();
    auto line = location[1].cast<int>();
    MS_LOG(EXCEPTION) << "Unexpected 'continue' at " << file_name << ":" << line;
  }

  // The innermost loop sits on top of the stack; re-enter it through its header block.
  Loop &innermost = loops_.top();
  block->Jump(innermost.header, innermost.iterator);
  return block;
 }

 // Handles a 'pass' statement, which produces nothing.
 FunctionBlockPtr Parser::ParsePass(const FunctionBlockPtr &block, const py::object &node) {
  // The AST node is not inspected; the incoming block is handed back as is.
  (void)node;
  return block;
 }

 void Parser::RemoveUnnecessaryPhis() {
  // merge all removable phis to one map;
  std::unordered_map<ParameterPtr, AnfNodePtr> removable_phis;
--- a/mindspore/ccsrc/pipeline/parse/parse.h
+++ b/mindspore/ccsrc/pipeline/parse/parse.h
@@ -23,6 +23,7 @@
 #include <string>
 #include <map>
 #include <set>
 #include <stack>
 #include <memory>
 #include "utils/misc.h"
 #include "ir/anf.h"
@@ -50,6 +51,33 @@ enum ParseStatusCode : int {
 class AstNodeType;
 class ParseAst;

 // Bookkeeping for one enclosing loop, consulted when 'continue' and 'break' statements are parsed.
 struct Loop {
  // Header block of the loop.
  FunctionBlockPtr header;
  // Iterator node of the loop; used by 'for' loops.
  AnfNodePtr iterator;
  // End block of the loop.
  FunctionBlockPtr end;

  // Stores the three parts as given; parameter names differ from the members to avoid shadowing.
  Loop(const FunctionBlockPtr &header_block, const AnfNodePtr &iterator_node, const FunctionBlockPtr &end_block)
      : header(header_block), iterator(iterator_node), end(end_block) {}
  ~Loop() = default;
 };

 // Scope guard for the loop stack: pushes a new Loop entry on construction and pops it on destruction.
 // The guard does not own the stack; `loops` must be non-null and must outlive the guard.
 class LoopContext {
 public:
  // Pushes a Loop built from `header` and `iterator`, with no end block yet.
  LoopContext(std::stack<Loop> *loops, const FunctionBlockPtr &header, const AnfNodePtr &iterator) : loops_(loops) {
    loops_->emplace(header, iterator, nullptr);
  }
  // Not copyable: a copy would pop the same stack entry a second time when it is destroyed.
  LoopContext(const LoopContext &) = delete;
  LoopContext &operator=(const LoopContext &) = delete;
  // Pops the top entry of the stack.
  ~LoopContext() { loops_->pop(); }
  // End block of the Loop currently on top of the stack; null until one has been assigned.
  const FunctionBlockPtr &EndBlock() const { return loops_->top().end; }

 private:
  // Borrowed pointer to the loop stack this guard pushes to and pops from.
  std::stack<Loop> *loops_;
 };

 // Parser to parse python function
 class Parser {
 public:
@@ -86,6 +114,12 @@ class Parser {
  FunctionBlockPtr ParseGlobal(const FunctionBlockPtr &block, const py::object &node);
  // process assign statement
  FunctionBlockPtr ParseAssign(const FunctionBlockPtr &block, const py::object &node);
  // process break statement
  FunctionBlockPtr ParseBreak(const FunctionBlockPtr &block, const py::object &node);
  // process continue statement
  FunctionBlockPtr ParseContinue(const FunctionBlockPtr &block, const py::object &node);
  // process pass statement
  FunctionBlockPtr ParsePass(const FunctionBlockPtr &block, const py::object &node);
  // process the expr and slice node method list
  AnfNodePtr ParseBinOp(const FunctionBlockPtr &block, const py::object &node);
  // process a variable name
@@ -216,6 +250,8 @@ class Parser {
  std::map<std::string, pStmtFunc> stmt_method_map_;
  // define the function map to parse ast expression
  std::map<std::string, pExprFunc> expr_method_map_;
  // Save current loops to support 'continue', 'break' statement.
  std::stack<Loop> loops_;
 };

 // AST node type define code to ast
--- a/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/ascend_backend_optimization.cc
@@ -54,6 +54,7 @@
 #include "pre_activate/pass/optimize_dependence.h"
 #include "pre_activate/pass/erase_visit_attr.h"
 #include "pre_activate/ascend/format_type/insert_cast.h"
 #include "pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.h"
 #include "pre_activate/pass/eliminate_redundant_op.h"
 #include "pre_activate/pass/common_subexpression_elimination.h"
 #include "pre_activate/ascend/format_type/merge_cast_to_op.h"
@@ -172,6 +173,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap
  mixed_precision_pm->AddPass(std::make_shared<MergeCastToOp>());
  mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
  mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
  mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
  optimizer->AddPassManager(mixed_precision_pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
@@ -274,6 +276,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
  auto other_pm = std::make_shared<PassManager>("other_pm");
  other_pm->AddPass(std::make_shared<AllReduceFusion>());
  other_pm->AddPass(std::make_shared<AllGatherFusion>());
  other_pm->AddPass(std::make_shared<BroadcastFusion>());
  other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
  other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
  other_pm->AddPass(std::make_shared<BufferFusion>());
--- a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.cc
@@ -268,6 +268,7 @@ AnfNodePtr AddCastOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr
  }
  AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cast.get());
  AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, cast.get());
  AnfAlgo::SetNodeAttr(kIsBackendCast, MakeValue(true), cast);
  return cast;
 }

--- a/mindspore/ccsrc/pre_activate/ascend/ascend_helper.h
+++ b/mindspore/ccsrc/pre_activate/ascend/ascend_helper.h
@@ -30,10 +30,6 @@ class KernelSelect {
  KernelSelect() = default;
  virtual ~KernelSelect() = default;
  virtual void SelectKernel(const CNodePtr &cnode) { device::ascend::SelectKernelInfo(cnode); }
  virtual bool CheckKernelAccuracySupported(const CNodePtr &kernel_node,
                                            const kernel::KernelBuildInfoPtr &new_kernel_build_info) {
    return device::ascend::CheckKernelAccuracySupported(kernel_node, new_kernel_build_info);
  }
 };
 using KernelSelectPtr = std::shared_ptr<KernelSelect>;

@@ -41,8 +37,13 @@ class SupportedChecker {
 public:
  SupportedChecker() = default;
  virtual ~SupportedChecker() = default;
  virtual bool CheckSupported(const AnfNodePtr &anf_node, const kernel::KernelBuildInfoPtr &select_kernel_build_info) {
    return kernel::CheckSupported(anf_node, select_kernel_build_info);
  virtual bool CheckAiCoreSupported(const AnfNodePtr &anf_node,
                                    const kernel::KernelBuildInfoPtr &select_kernel_build_info) {
    return kernel::IsSupportedByAiCore(anf_node, select_kernel_build_info);
  }
  virtual bool CheckAiCpuSupported(const AnfNodePtr &anf_node,
                                   const kernel::KernelBuildInfoPtr &select_kernel_build_info) {
    return kernel::IsSupportedByAiCpu(anf_node, select_kernel_build_info);
  }
 };
 using SupportedCheckerPtr = std::shared_ptr<SupportedChecker>;
--- a/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.cc
@@ -0,0 +1,54 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.h"
 #include <memory>
 #include "session/anf_runtime_algorithm.h"
 #include "kernel/kernel_build_info.h"
 #include "kernel/kernel_query.h"
 namespace mindspore {
 namespace opt {
 // Builds the pattern for this pass: a two-element VectorRef made of one Var followed by one SeqVar.
 const BaseRef ConvertUnSupportNodeToAICPU::DefinePattern() const {
  VarPtr head_var = std::make_shared<Var>();
  VarPtr rest_vars = std::make_shared<SeqVar>();
  return VectorRef({head_var, rest_vars});
 }

 // Switches a TransData or Cast node to the AICPU kernel type when its selected kernel is not supported by
 // AiCore but is supported by AiCpu.
 // Returns nullptr when `node` is null, is not a CNode, or is neither TransData nor Cast; otherwise returns `node`.
 // Raises through MS_LOG(EXCEPTION) when the kernel is supported by neither AiCore nor AiCpu.
 const AnfNodePtr ConvertUnSupportNodeToAICPU::Process(const mindspore::FuncGraphPtr &,
                                                      const mindspore::AnfNodePtr &node,
                                                      const mindspore::EquivPtr &) const {
  if (node == nullptr || !node->isa<CNode>()) {
    return nullptr;
  }
  // Only TransData and Cast nodes are handled. The two comparisons have to be joined with '&&': joined with '||'
  // the condition is true for every name, which made the pass return nullptr for every node.
  auto node_name = AnfAlgo::GetCNodeName(node);
  if (node_name != prim::KPrimTransData->name() && node_name != prim::kPrimCast->name()) {
    return nullptr;
  }
  auto kernel_builder_info = AnfAlgo::GetSelectKernelBuildInfo(node);
  if (supported_checker_->CheckAiCoreSupported(node, kernel_builder_info)) {
    // AiCore can run the selected kernel; nothing to change.
    return node;
  } else if (supported_checker_->CheckAiCpuSupported(node, kernel_builder_info)) {
    // Rebuild the kernel build info with the AICPU kernel type and attach it to the node.
    auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(kernel_builder_info);
    builder->SetKernelType(AICPU_KERNEL);
    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), node.get());
  } else {
    MS_LOG(EXCEPTION) << " kernel " << kernel_builder_info->ToString() << "is not supported in AiCPU & AiCore : node ["
                      << node->DebugString() << "]";
  }
  return node;
 }
 }  // namespace opt
 }  // namespace mindspore
--- a/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.h
+++ b/mindspore/ccsrc/pre_activate/ascend/format_type/convert_unsupported_transnode_to_aicpu.h
@@ -0,0 +1,37 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <memory>
 #include "pre_activate/common/optimizer.h"
 #include "pre_activate/ascend/ascend_helper.h"
 #ifndef MINDSPORE_CONVERT_UNSUPPORTED_NODE_TO_AICPU_H
 #define MINDSPORE_CONVERT_UNSUPPORTED_NODE_TO_AICPU_H
 namespace mindspore {
 namespace opt {
 // Pattern pass registered under the name "convert_unsupported_node_to_aicpu".
 // DefinePattern and Process are declared here and defined in the matching .cc file.
 class ConvertUnSupportNodeToAICPU : public PatternProcessPass {
 public:
  // `multigraph` is forwarded to PatternProcessPass; a default SupportedChecker is created for the pass.
  explicit ConvertUnSupportNodeToAICPU(bool multigraph = true)
      : PatternProcessPass("convert_unsupported_node_to_aicpu", multigraph),
        supported_checker_(std::make_shared<SupportedChecker>()) {}
  ~ConvertUnSupportNodeToAICPU() override = default;
  // Pattern of the nodes this pass is applied to.
  const BaseRef DefinePattern() const override;
  // Handles one matched node; the function graph and equiv arguments are left unnamed in this declaration.
  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

 private:
  // Checker used to ask whether a node's kernel is supported by AiCore / AiCpu.
  SupportedCheckerPtr supported_checker_;
 };
 }  // namespace opt
 }  // namespace mindspore
 #endif  // MINDSPORE_CONVERT_UNSUPPORTED_NODE_TO_AICPU_H
--- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.h
+++ b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_cast_for_runop.h
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_DEVICE_OPTIMIZER_FORMAT_TYPE_PASS_INSERT_CAST_FOR_RUNOP_H_
 #define MINDSPORE_CCSRC_DEVICE_OPTIMIZER_FORMAT_TYPE_PASS_INSERT_CAST_FOR_RUNOP_H_
 #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_CAST_FOR_RUNOP_H_
 #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_CAST_FOR_RUNOP_H_
 #include <string>

 #include "pre_activate/common/optimizer.h"
@@ -32,4 +32,4 @@ class RunOpInsertCast : public PatternProcessPass {
 }  // namespace opt
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_DEVICE_OPTIMIZER_FORMAT_TYPE_PASS_INSERT_CAST_FOR_RUNOP_H_
 #endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_CAST_FOR_RUNOP_H_
--- a/mindspore/ccsrc/pre_activate/ascend/format_type/insert_transdata_for_runop.h
+++ b/mindspore/ccsrc/pre_activate/ascend/format_type/insert_transdata_for_runop.h
@@ -14,8 +14,8 @@
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_DEVICE_OPTIMIZER_FORMAT_TYPE_PASS_INSERT_TRANSDATA_FOR_RUNOP_H_
 #define MINDSPORE_CCSRC_DEVICE_OPTIMIZER_FORMAT_TYPE_PASS_INSERT_TRANSDATA_FOR_RUNOP_H_
 #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_TRANSDATA_FOR_RUNOP_H_
 #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_TRANSDATA_FOR_RUNOP_H_

 #include <string>
 #include <utility>
@@ -41,4 +41,4 @@ class RunOpInsertTransData : public PatternProcessPass {
 }  // namespace opt
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_DEVICE_OPTIMIZER_FORMAT_TYPE_PASS_INSERT_TRANSDATA_FOR_RUNOP_H_
 #endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_INSERT_TRANSDATA_FOR_RUNOP_H_
--- a/mindspore/ccsrc/pre_activate/ascend/ir_fission/topk_split.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/ir_fission/topk_split.cc
@@ -128,7 +128,7 @@ const AnfNodePtr TopKSplit::Process(const FuncGraphPtr &func_graph, const AnfNod
  auto indices_const = CreateValueNode(new_cnode);
  new_cnode->add_input(indices_const);
  MS_EXCEPTION_IF_NULL(supported_checker_);
  if (!supported_checker_->CheckSupported(new_cnode, CreateKernelBuildInfo())) {
  if (!supported_checker_->CheckAiCoreSupported(new_cnode, CreateKernelBuildInfo())) {
    return nullptr;
  }

--- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.cc
+++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.cc
@@ -53,7 +53,7 @@ const AnfNodePtr TransposeTransDataFusion::Process(const FuncGraphPtr &func_grap
  new_transdata_builder->SetProcessor(transdata_kernel_build_info->processor());

  auto new_fusion_transdata = std::make_shared<Primitive>(kTransDataOpName);
  if (kernel_select_->CheckKernelAccuracySupported(transdata_cnode, new_transdata_builder->Build())) {
  if (supported_checker_->CheckAiCoreSupported(transdata_cnode, new_transdata_builder->Build())) {
    std::vector<AnfNodePtr> inputs = {NewValueNode(new_fusion_transdata),
                                      utils::cast<AnfNodePtr>((*equiv)[input_varptr_])};
    auto new_node = func_graph->NewCNode(inputs);
--- a/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.h
+++ b/mindspore/ccsrc/pre_activate/ascend/ir_fusion/transpose_transdata_fusion.h
@@ -34,7 +34,7 @@ class TransposeTransDataFusion : public PatternProcessPass {
  explicit TransposeTransDataFusion(bool multigraph = true)
      : PatternProcessPass("transpose_transdata_fusion", multigraph) {
    input_varptr_ = std::make_shared<Var>();
    kernel_select_ = std::make_shared<KernelSelect>();
    supported_checker_ = std::make_shared<SupportedChecker>();
  }
  ~TransposeTransDataFusion() override = default;
  const BaseRef DefinePattern() const override;
@@ -42,7 +42,9 @@ class TransposeTransDataFusion : public PatternProcessPass {

 private:
  VarPtr input_varptr_;
  KernelSelectPtr kernel_select_;

 private:
  SupportedCheckerPtr supported_checker_;
 };
 }  // namespace opt
 }  // namespace mindspore
--- a/mindspore/ccsrc/pre_activate/pass/communication_op_fusion.h
+++ b/mindspore/ccsrc/pre_activate/pass/communication_op_fusion.h
@@ -62,6 +62,12 @@ class AllGatherFusion : public CommunicationOpFusion {
  explicit AllGatherFusion(size_t groups = 1) : CommunicationOpFusion("all_gather_fusion", kAllGatherOpName, groups) {}
  ~AllGatherFusion() override = default;
 };

 // CommunicationOpFusion specialisation for Broadcast operators.
 // It adds no state or behaviour of its own: the constructor only forwards the
 // fixed name "broadcast_fusion", the op name kBroadcastOpName and the caller's
 // `groups` value (default 1) to the CommunicationOpFusion base class.
 class BroadcastFusion : public CommunicationOpFusion {
 public:
  // `groups` is passed to the base class unchanged; `explicit` prevents an
  // implicit conversion from size_t.
  explicit BroadcastFusion(size_t groups = 1) : CommunicationOpFusion("broadcast_fusion", kBroadcastOpName, groups) {}
  ~BroadcastFusion() override = default;
 };
 }  // namespace opt
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_PASS_COMMUNICATION_OP_FUSION_H_
--- a/mindspore/ccsrc/session/ascend_session.cc
+++ b/mindspore/ccsrc/session/ascend_session.cc
@@ -329,9 +329,9 @@ void AscendSession::SelectKernel(const KernelGraph &kernel_graph) const {
  size_t reduce_precision_count = 0;
  for (const auto &cnode : kernel_graph.execution_order()) {
    auto status = device::ascend::SelectKernelInfo(cnode);
    if (status == kStatusRaisePrecision) {
    if (status == device::ascend::kStatusRaisePrecision) {
      raise_precision_count++;
    } else if (status == kStatusReducePrecision) {
    } else if (status == device::ascend::kStatusReducePrecision) {
      reduce_precision_count++;
    }
    MS_LOG(INFO) << "Select ApplyKernel: " << cnode->DebugString();
--- a/mindspore/ccsrc/session/kernel_graph.cc
+++ b/mindspore/ccsrc/session/kernel_graph.cc
@@ -27,6 +27,8 @@
 namespace mindspore {
 namespace session {
 namespace {
 constexpr auto kIsFeatureMapOutput = "IsFeatureMapOutput";
 constexpr auto kIsFeatureMapInputList = "IsFeatureMapInputList";
 void PushNoVisitedNode(const AnfNodePtr &node, std::queue<AnfNodePtr> *que,
                       std::unordered_set<AnfNodePtr> *visited_nodes) {
  MS_EXCEPTION_IF_NULL(que);
@@ -180,11 +182,24 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) {
  cnode->set_abstract(std::make_shared<abstract::AbstractNone>());
  // create kernel_info from new parameter
  auto kernel_info = std::make_shared<device::KernelInfo>();
  std::vector<size_t> feature_map_input_indexs;
  // if the node only has the primitive(such as getNext) or the node's input has a feature map input
  // then the node's output is a feature map output
  if (inputs.size() == 1 || std::any_of(inputs.begin() + 1, inputs.end(),
                                        [&](const AnfNodePtr &node) { return AnfAlgo::IsFeatureMapOutput(node); })) {
  for (size_t index = 1; index < inputs.size(); ++index) {
    auto node = inputs[index];
    if (AnfAlgo::IsFeatureMapOutput(node)) {
      feature_map_input_indexs.push_back(index);
    }
  }
  if (AnfAlgo::GetCNodeName(cnode) == prim::kPrimCast->name()) {
    AnfAlgo::SetNodeAttr(kIsBackendCast, MakeValue(false), cnode);
  }
  if (inputs.size() == 1 || !feature_map_input_indexs.empty()) {
    kernel_info->SetFeatureMapFlag(true);
    AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(true), cnode);
    AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode);
  } else {
    AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(false), cnode);
  }
  cnode->set_kernel_info(kernel_info);
  AnfAlgo::SetGraphId(graph_id_, cnode.get());
--- a/mindspore/ccsrc/utils/context/ms_context.cc
+++ b/mindspore/ccsrc/utils/context/ms_context.cc
@@ -366,7 +366,11 @@ void MsContext::GetGeOptions(std::map<std::string, std::string> *ge_options) con
  }

  // Enable auto mixed precision according to the context options
  (*ge_options)["ge.exec.auto_mix_precision"] = std::to_string(auto_mixed_precision_flag_);
  if (auto_mixed_precision_flag_) {
    (*ge_options)["ge.exec.precision_mode"] = "allow_mix_precision";
  } else {
    (*ge_options)["ge.exec.precision_mode"] = "allow_fp32_to_fp16";
  }
  // Disable the global variable acc, only enable it while adding training graph in pipeline
  (*ge_options)["ge.exec.variable_acc"] = "0";
 #endif
--- a/mindspore/ccsrc/utils/utils.h
+++ b/mindspore/ccsrc/utils/utils.h
@@ -142,6 +142,7 @@ constexpr auto kLabelGotoOpName = "LabelGoto";

 // attr key name
 constexpr auto kAttrInputNames = "input_names";
 constexpr auto kIsBackendCast = "is_backed_cast";
 constexpr auto kAttrOutputNames = "output_names";
 constexpr auto kAttrVisited = "visited";
 constexpr auto kAttrShape = "shape";
@@ -201,10 +202,6 @@ constexpr auto kControlDependBehindIndex = 2;
 // index define of depend
 constexpr auto kRealInputIndexInDepend = 1;
 constexpr auto kDependAttachNodeIndex = 2;
 // status of kernel select result
 const int kStatusReducePrecision = -1;
 const int kStatusRaisePrecision = 1;
 const int kStatusAllMatched = 0;
 // format
 constexpr auto kOpFormat_DEFAULT = "DefaultFormat";
 constexpr auto kOpFormat_NC1KHKWHWC0 = "NC1KHKWHWC0";
@@ -218,18 +215,11 @@ constexpr auto kOpFormat_FRAC_NZ = "FRACTAL_NZ";
 constexpr auto kOpFormat_C1HWNCoC0 = "C1HWNCoC0";
 constexpr auto kOpFormat_NC1HWC0_C04 = "NC1HWC0_C04";
 constexpr auto kOpFormat_FRACTAL_Z_C04 = "FRACTAL_Z_C04";
 const std::set<std::string> k1DSupportFormat = {kOpFormat_DEFAULT,  kOpFormat_NCHW,        kOpFormat_NHWC,
                                                kOpFormat_FRAC_Z,   kOpFormat_NC1KHKWHWC0, kOpFormat_NC1HWC0,
                                                kOpFormat_C1HWNCoC0};

 const std::set<std::string> k2DSupportFormat = {kOpFormat_DEFAULT, kOpFormat_NCHW, kOpFormat_NHWC, kOpFormat_FRAC_Z,
                                                kOpFormat_NC1KHKWHWC0};
 const std::set<std::string> k3DSupportFormat = {kOpFormat_DEFAULT, kOpFormat_NC1KHKWHWC0};
 const std::set<std::string> k4DSupportFormat = k1DSupportFormat;
 const std::vector<std::set<std::string>> kShapeSupportFormatMap = {k1DSupportFormat, k2DSupportFormat, k3DSupportFormat,
                                                                   k4DSupportFormat};
 const std::set<std::string> kOpFormatList = {kOpFormat_DEFAULT, kOpFormat_NC1KHKWHWC0, kOpFormat_ND,
                                             kOpFormat_NCHW,    kOpFormat_NHWC,        kOpFormat_HWCN,
                                             kOpFormat_NC1HWC0, kOpFormat_FRAC_Z,      kOpFormat_C1HWNCoC0,
                                             kOpFormat_FRAC_NZ, kOpFormat_NC1HWC0_C04, kOpFormat_FRACTAL_Z_C04};
 const std::set<std::string> kDefaultCompatibleFormat = {kOpFormat_ND, kOpFormat_NCHW, kOpFormat_NHWC, kOpFormat_HWCN};

 const std::set<std::string> kOptOperatorSet = {
  kMomentumOpName,       kApplyMomentumOpName,        kApplyAdadeltaOpName,
  kApplyAdagradOpName,   kApplyAdagradDAName,         kApplyAdamOpName,
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -2189,7 +2189,7 @@ class MindDataset(SourceDataset):
    A source dataset that reads from shard files and database.

    Args:
        dataset_file (str): one of file names in dataset.
        dataset_file (str, list[str]): One of file names or file list in dataset.
        columns_list (list[str], optional): List of columns to be read (default=None).
        num_parallel_workers (int, optional): The number of readers (default=None).
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
@@ -2214,6 +2214,10 @@ class MindDataset(SourceDataset):
                 shuffle=None, num_shards=None, shard_id=None,
                 block_reader=False, sampler=None):
        super().__init__(num_parallel_workers)
        if isinstance(dataset_file, list):
            self.load_dataset = False
        else:
            self.load_dataset = True
        self.dataset_file = dataset_file
        self.columns_list = columns_list
        self.global_shuffle = shuffle
@@ -2256,6 +2260,7 @@ class MindDataset(SourceDataset):
    def get_args(self):
        args = super().get_args()
        args["dataset_file"] = self.dataset_file
        args["load_dataset"] = self.load_dataset
        args["columns_list"] = self.columns_list
        args["global_shuffle"] = self.global_shuffle
        args["partitions"] = self.partitions
@@ -2272,8 +2277,11 @@ class MindDataset(SourceDataset):
        Return:
            Number, number of batches.
        """

        num_rows = MindRecordOp.get_num_rows(self.dataset_file, self.sampler)
        if self.load_dataset:
            dataset_file = [self.dataset_file]
        else:
            dataset_file = self.dataset_file
        num_rows = MindRecordOp.get_num_rows(dataset_file, self.load_dataset, self.sampler)
        if self.partitions is not None and self.partitions[0] > 0:
            if num_rows % self.partitions[0] == 0:
                num_rows = num_rows // self.partitions[0]
@@ -2294,11 +2302,11 @@ def _iter_fn(dataset, num_samples):
            except StopIteration:
                return
            # convert output tensors to ndarrays
            yield tuple([np.array(x) for x in val])
            yield tuple([np.array(x, copy=False) for x in val])
    else:
        for val in dataset:
            # convert output tensors to ndarrays
            yield tuple([np.array(x) for x in val])
            yield tuple([np.array(x, copy=False) for x in val])


 def _generator_fn(generator, num_samples):
@@ -2332,12 +2340,12 @@ def _py_sampler_fn(sampler, num_samples, dataset):
                return
            val = dataset[idx]
            # convert output tensors to ndarrays
            yield tuple([np.array(x) for x in val])
            yield tuple([np.array(x, copy=False) for x in val])
    else:
        for i in sampler:
            val = dataset[i]
            # convert output tensors to ndarrays
            yield tuple([np.array(x) for x in val])
            yield tuple([np.array(x, copy=False) for x in val])


 def _cpp_sampler_fn(sampler, dataset):
@@ -2348,7 +2356,7 @@ def _cpp_sampler_fn(sampler, dataset):
    for i in indices:
        val = dataset[i]
        # convert output tensors to ndarrays
        yield tuple([np.array(x) for x in val])
        yield tuple([np.array(x, copy=False) for x in val])


 def _cpp_sampler_fn_mp(sampler, dataset, num_worker):
@@ -2437,7 +2445,7 @@ def _sampler_fn_mp(indices, dataset, num_worker):
        # Set eoe event once all indices are sent
        if idx_cursor == len(indices) and not eoe.is_set():
            eoe.set()
        yield tuple([np.array(x) for x in result])
        yield tuple([np.array(x, copy=False) for x in result])


 def _generator_worker_loop(dataset, idx_queue, result_queue, eoe):
@@ -2549,35 +2557,35 @@ class GeneratorDataset(SourceDataset):
            when num_shards is also specified. Random accessible input is required.

    Examples:
        >>> import mindspore.dataengine as de
        >>> import mindspore.dataset as ds
        >>> # 1) Multidimensional generator function as callable input
        >>> def generator_md():
        >>>     for i in range(64):
        >>>         yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
        >>> # create multi_dimension_generator_dataset with GeneratorMD and column name "multi_dimensional_data"
        >>> multi_dimension_generator_dataset = de.GeneratorDataset(generator_md, ["multi_dimensional_data"])
        >>> multi_dimension_generator_dataset = ds.GeneratorDataset(generator_md, ["multi_dimensional_data"])
        >>> # 2) Multi-column generator function as callable input
        >>> def generator_mc(maxid = 64):
        >>>     for i in range(maxid):
        >>>         yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
        >>> # create multi_column_generator_dataset with GeneratorMC and column names "col1" and "col2"
        >>> multi_column_generator_dataset = de.GeneratorDataset(generator_mc, ["col1", "col2"])
        >>> multi_column_generator_dataset = ds.GeneratorDataset(generator_mc, ["col1", "col2"])
        >>> # 3) Iterable dataset as iterable input
        >>> class MyIterable():
        >>>     def __iter__(self):
        >>>         return # User implementation
        >>> # create iterable_generator_dataset with MyIterable object
        >>> iterable_generator_dataset = de.GeneratorDataset(MyIterable(), ["col1"])
        >>> iterable_generator_dataset = ds.GeneratorDataset(MyIterable(), ["col1"])
        >>> # 4) Random accessible dataset as Random accessible input
        >>> class MyRA():
        >>>     def __getitem__(self, index):
        >>>         return # User implementation
        >>> # create ra_generator_dataset with MyRA object
        >>> ra_generator_dataset = de.GeneratorDataset(MyRA(), ["col1"])
        >>> ra_generator_dataset = ds.GeneratorDataset(MyRA(), ["col1"])
        >>> # List/Dict/Tuple is also random accessible
        >>> list_generator = de.GeneratorDataset([(np.array(0),), (np.array(1)), (np.array(2))], ["col1"])
        >>> list_generator = ds.GeneratorDataset([(np.array(0),), (np.array(1)), (np.array(2))], ["col1"])
        >>> # 5) Built-in Sampler
        >>> my_generator = de.GeneratorDataset(my_ds, ["img", "label"], sampler=samplers.RandomSampler())
        >>> my_generator = ds.GeneratorDataset(my_ds, ["img", "label"], sampler=samplers.RandomSampler())
        >>>
    """

--- a/mindspore/dataset/engine/validators.py
+++ b/mindspore/dataset/engine/validators.py
@@ -529,8 +529,11 @@ def check_minddataset(method):
        dataset_file = param_dict.get('dataset_file')
        if dataset_file is None:
            raise ValueError("dataset_file is not provided.")
        check_dataset_file(dataset_file)

        if isinstance(dataset_file, list):
            for f in dataset_file:
                check_dataset_file(f)
        else:
            check_dataset_file(dataset_file)
        check_param_type(nreq_param_int, param_dict, int)

        check_param_type(nreq_param_list, param_dict, list)
--- a/mindspore/mindrecord/filereader.py
+++ b/mindspore/mindrecord/filereader.py
@@ -28,7 +28,7 @@ class FileReader:
    Class to read MindRecord File series.

    Args:
       file_name (str): File name of MindRecord File.
       file_name (str, list[str]): One of MindRecord File or file list.
       num_consumer(int, optional): Number of consumer threads which load data to memory (default=4).
           It should not be smaller than 1 or larger than the number of CPU.
       columns (list[str], optional): List of fields which correspond data would be read (default=None).
@@ -38,8 +38,11 @@ class FileReader:
        ParamValueError: If file_name, num_consumer or columns is invalid.
    """
    def __init__(self, file_name, num_consumer=4, columns=None, operator=None):
        check_filename(file_name)
        self._file_name = file_name
        if isinstance(file_name, list):
            for f in file_name:
                check_filename(f)
        else:
            check_filename(file_name)

        if num_consumer is not None:
            if isinstance(num_consumer, int):
--- a/mindspore/mindrecord/mindpage.py
+++ b/mindspore/mindrecord/mindpage.py
@@ -28,7 +28,7 @@ class MindPage:
    Class to read MindRecord File series in pagination.

    Args:
        file_name (str): File name of MindRecord File.
        file_name (str, list[str]): One of MindRecord File or file list.
        num_consumer(int, optional): Number of consumer threads which load data to memory (default=4).
            It should not be smaller than 1 or larger than the number of CPU.

@@ -37,8 +37,11 @@ class MindPage:
        MRMInitSegmentError: If failed to initialize ShardSegment.
    """
    def __init__(self, file_name, num_consumer=4):
        check_filename(file_name)
        self._file_name = file_name
        if isinstance(file_name, list):
            for f in file_name:
                check_filename(f)
        else:
            check_filename(file_name)

        if num_consumer is not None:
            if isinstance(num_consumer, int):
--- a/mindspore/mindrecord/shardreader.py
+++ b/mindspore/mindrecord/shardreader.py
@@ -35,7 +35,7 @@ class ShardReader:
        Open file and prepare to read MindRecord File.

        Args:
           file_name (str): File name of MindRecord File.
           file_name (str, list[str]): File names of MindRecord File.
           num_consumer (int): Number of worker threads which load data in parallel. Default: 4.
           columns (list[str]): List of fields which correspond data would be read.
           operator(int): Reserved parameter for operators. Default: None.
@@ -48,7 +48,12 @@ class ShardReader:
        """
        columns = columns if columns else []
        operator = operator if operator else []
        ret = self._reader.open(file_name, num_consumer, columns, operator)
        if isinstance(file_name, list):
            load_dataset = False
        else:
            load_dataset = True
            file_name = [file_name]
        ret = self._reader.open(file_name, load_dataset, num_consumer, columns, operator)
        if ret != ms.MSRStatus.SUCCESS:
            logger.error("Failed to open {}.".format(file_name))
            raise MRMOpenError
--- a/mindspore/mindrecord/shardsegment.py
+++ b/mindspore/mindrecord/shardsegment.py
@@ -40,7 +40,7 @@ class ShardSegment:
        Initialize the ShardSegment.

        Args:
            file_name (str): File name of MindRecord File.
            file_name (str, list[str]): File names of MindRecord File.
            num_consumer (int): Number of worker threads which load data in parallel. Default: 4.
            columns (list[str]): List of fields which correspond data would be read.
            operator(int): Reserved parameter for operators. Default: None.
@@ -53,7 +53,12 @@ class ShardSegment:
        """
        self._columns = columns if columns else []
        operator = operator if operator else []
        ret = self._segment.open(file_name, num_consumer, self._columns, operator)
        if isinstance(file_name, list):
            load_dataset = False
        else:
            load_dataset = True
            file_name = [file_name]
        ret = self._segment.open(file_name, load_dataset, num_consumer, self._columns, operator)
        if ret != SUCCESS:
            logger.error("Failed to open {}.".format(file_name))
            raise MRMOpenError
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@@ -60,7 +60,7 @@ class Cell:
        self._cells = OrderedDict()
        self.training = False
        self.pynative = False
        self._param_perfix = ''
        self._param_prefix = ''
        self._auto_prefix = auto_prefix
        self._scope = None
        self._phase = 'train'
@@ -85,22 +85,22 @@ class Cell:
        return self._cell_init_args

    @property
    def param_perfix(self):
    def param_prefix(self):
        """
        Param perfix is the prfix of curent cell's direct child parameter.
        Param prefix is the prefix of current cell's direct child parameter.
        """
        return self._param_perfix
        return self._param_prefix

    def update_cell_prefix(self):
        """
        Update the all child cells' self.param_prefix.

        After invoked, can get all the cell's children's name perfix by '_param_perfix'.
        After invoked, can get all the cell's children's name prefix by '_param_prefix'.
        """
        cells = self.cells_and_names()

        for cell_name, cell in cells:
            cell._param_perfix = cell_name
            cell._param_prefix = cell_name

    @cell_init_args.setter
    def cell_init_args(self, value):
--- a/mindspore/nn/layer/basic.py
+++ b/mindspore/nn/layer/basic.py
@@ -398,22 +398,22 @@ class Pad(Cell):
            paddings are int type. For `D` th dimension of input, paddings[D, 0] indicates how many sizes to be
            extended ahead of the `D` th dimension of the input tensor, and paddings[D, 1] indicates how many sizes to
            be extended behind of the `D` th dimension of the input tensor.
        mode (string): Specifies padding mode. The optional values are "CONSTANT", "REFLECT", "SYMMETRIC".
        mode (str): Specifies padding mode. The optional values are "CONSTANT", "REFLECT", "SYMMETRIC".
            Default: "CONSTANT".

    Inputs:
        - ** input_x** (Tensor) - The input tensor.
        - **input_x** (Tensor) - The input tensor.

    Outputs:
        Tensor, the tensor after padding.

        - If `mode` is "CONSTANT", it fill the edge with 0, regardless of the values of the `input_x`.
        - If `mode` is "CONSTANT", it fills the edge with 0, regardless of the values of the `input_x`.
          If the `input_x` is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the
          Outputs is [[0,0,0,0,0,0,0],[0,0,1,2,3,0,0],[0,0,4,5,6,0,0],[0,0,7,8,9,0,0],[0,0,0,0,0,0,0]].
        - If 'mode` is "REFLECT", it uses a way of symmetrical copying throught the axis of symmetry to fill in,
          symmetry. If the `input_x` is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the
        - If `mode` is "REFLECT", it uses a way of symmetrical copying through the axis of symmetry to fill in.
          If the `input_x` is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the
          Outputs is [[6,5,4,5,6,5,4],[3,2,1,2,3,2,1],[6,5,4,5,6,5,4],[9,8,7,8,9,8,7],[6,5,4,5,6,5,4]].
        - If 'mode' is "SYMMETRIC", the filling method is similar to the "REFLECT". It is also copied
        - If `mode` is "SYMMETRIC", the filling method is similar to the "REFLECT". It is also copied
          according to the symmetry axis, except that it includes the symmetry axis. If the `input_x`
          is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the Outputs is
          [[2,1,1,2,3,3,2],[2,1,1,2,3,3,2],[5,4,4,5,6,6,5],[8,7,7,8,9,9,8],[8,7,7,8,9,9,8]].
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@@ -191,6 +191,8 @@ class Conv2dBatchNormQuant(Cell):
                 stride,
                 pad_mode,
                 padding=0,
                 dilation=1,
                 group=1,
                 eps=1e-5,
                 momentum=0.9,
                 weight_init=None,
@@ -198,7 +200,6 @@ class Conv2dBatchNormQuant(Cell):
                 gamma_init=None,
                 mean_init=None,
                 var_init=None,
                 group=1,
                 quant_delay=0,
                 freeze_bn=100000,
                 fake=True,
--- a/mindspore/nn/loss/loss.py
+++ b/mindspore/nn/loss/loss.py
@@ -18,6 +18,8 @@ from mindspore.common.tensor import Tensor
 from mindspore.ops import operations as P
 from mindspore.ops import functional as F
 from mindspore.nn.cell import Cell
 from mindspore._checkparam import Validator as validator
 from mindspore._checkparam import Rel
 from ... import context


@@ -215,6 +217,8 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
        sparse (bool): Specifies whether labels use sparse format or not. Default: False.
        reduction (Union[str, None]): Type of reduction to apply to loss. Support 'sum' or 'mean' If None,
            do not reduction. Default: None.
        smooth_factor (float): Label smoothing factor. It is an optional input. Default: 0.
        num_classes (int): The number of classes in the task. It is an optional input. Default: 2.

    Inputs:
        - **logits** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`.
@@ -235,14 +239,20 @@ class SoftmaxCrossEntropyWithLogits(_Loss):
    def __init__(self,
                 is_grad=True,
                 sparse=False,
                 reduction=None):
                 reduction=None,
                 smooth_factor=0,
                 num_classes=2):
        super(SoftmaxCrossEntropyWithLogits, self).__init__(reduction)
        self.is_grad = is_grad
        self.sparse = sparse
        validator.check_integer("num_classes", num_classes, 1, Rel.GT, self.cls_name)
        validator.check_number_range("smooth_factor", smooth_factor, 0, 1, Rel.INC_BOTH, self.cls_name)
        self.smooth_factor = smooth_factor
        self.num_classes = num_classes
        self.softmax_cross_entropy = P.SoftmaxCrossEntropyWithLogits()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.on_value = Tensor(1.0 - self.smooth_factor, mstype.float32)
        self.off_value = Tensor(1.0 * self.smooth_factor / (self.num_classes - 1), mstype.float32)
        self.is_cpugpu = context.get_context('device_target') in ["CPU", "GPU"]

        if self.is_cpugpu:
--- a/mindspore/nn/optim/momentum.py
+++ b/mindspore/nn/optim/momentum.py
@@ -17,6 +17,7 @@ from mindspore.ops import functional as F, composite as C, operations as P
 from mindspore.common.parameter import Parameter
 from mindspore.common.tensor import Tensor
 import mindspore.common.dtype as mstype
 from mindspore._checkparam import check_bool
 from .optimizer import Optimizer

 momentum_opt = C.MultitypeFuncGraph("momentum_opt")
@@ -67,6 +68,7 @@ class Momentum(Optimizer):
        momentum (float): Hyperparameter of type float, means momentum for the moving average.
        weight_decay (float): Weight decay (L2 penalty). Default: 0.0.
        loss_scale (float): A floating point value for the loss scale. Default: 1.0.
        use_nesterov (bool): Enable Nesterov momentum. Default: False.

    Inputs:
        - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.
@@ -95,15 +97,16 @@ class Momentum(Optimizer):
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
    """
    def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0):
    def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0, use_nesterov=False):
        super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
        self.params = self.parameters
        self.use_nesterov = check_bool(use_nesterov)
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

    def construct(self, gradients):
        params = self.params
--- a/mindspore/nn/optim/sgd.py
+++ b/mindspore/nn/optim/sgd.py
@@ -136,7 +136,6 @@ class SGD(Optimizer):
        params = self.parameters
        accum = self.accum
        stat = self.stat
        gradients = self.decay_weight(gradients)
        gradients = self.scale_grad(gradients)
        lr = self.get_lr()
        if self.is_group_lr:
--- a/mindspore/ops/_op_impl/akg/gpu/cast.py
+++ b/mindspore/ops/_op_impl/akg/gpu/cast.py
@@ -22,6 +22,8 @@ cast_op_info = AkgRegOp("Cast") \
    .attr("dst_type", "required", "str") \
    .dtype_format(DataType.F16_Default, DataType.F32_Default) \
    .dtype_format(DataType.F32_Default, DataType.F16_Default) \
    .dtype_format(DataType.I32_Default, DataType.F32_Default) \
    .dtype_format(DataType.BOOL_Default, DataType.F32_Default) \
    .get_op_info()


--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -1792,8 +1792,8 @@ class LayerNorm(Primitive):

        - **output_x** (Tensor) - The normalized input, has the same type and shape as the `input_x`.
          The shape is :math:`(N, C)`.
        - **updated_gamma** (Tensor) - Tensor of shape :math:`(C,)`.
        - **updated_beta** (Tensor) - Tensor of shape :math:`(C,)`.
        - **mean** (Tensor) - Tensor of shape :math:`(C,)`.
        - **variance** (Tensor) - Tensor of shape :math:`(C,)`.

    Examples:
        >>> input_x = Tensor(np.array([[1, 2, 3], [1, 2, 3]]), mindspore.float32)
@@ -2320,7 +2320,7 @@ class MirrorPad(PrimitiveWithInfer):
    Pads the input tensor according to the paddings and mode.

    Args:
        mode (string): Specifies padding mode. The optional values are "REFLECT", "SYMMETRIC".
        mode (str): Specifies padding mode. The optional values are "REFLECT", "SYMMETRIC".
            Default: "REFLECT".

    Inputs:
@@ -2334,10 +2334,10 @@ class MirrorPad(PrimitiveWithInfer):
    Outputs:
        Tensor, the tensor after padding.

        - If 'mode` is "REFLECT", it uses a way of symmetrical copying throught the axis of symmetry to fill in,
          symmetry. If the `input_x` is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the
        - If `mode` is "REFLECT", it uses a way of symmetrical copying through the axis of symmetry to fill in.
          If the `input_x` is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the
          Outputs is [[6,5,4,5,6,5,4],[3,2,1,2,3,2,1],[6,5,4,5,6,5,4],[9,8,7,8,9,8,7],[6,5,4,5,6,5,4]].
        - If 'mode' is "SYMMETRIC", the filling method is similar to the "REFLECT". It is also copied
        - If `mode` is "SYMMETRIC", the filling method is similar to the "REFLECT". It is also copied
          according to the symmetry axis, except that it includes the symmetry axis. If the `input_x`
          is [[1,2,3],[4,5,6],[7,8,9]] and `paddings` is [[1,1],[2,2]], then the Outputs is
          [[2,1,1,2,3,3,2],[2,1,1,2,3,3,2],[5,4,4,5,6,6,5],[8,7,7,8,9,9,8],[8,7,7,8,9,9,8]].
--- a/tests/perf_test/bert/test_bert_train.py
+++ b/tests/perf_test/bert/test_bert_train.py
@@ -18,13 +18,13 @@
 # pylint: disable=missing-docstring, arguments-differ, W0612

 import os

 import mindspore.common.dtype as mstype
 import mindspore.context as context
 from mindspore import Tensor
 from mindspore.nn.optim import AdamWeightDecayDynamicLR
 from mindspore.model_zoo.Bert_NEZHA import BertConfig, BertNetworkWithLoss, BertTrainOneStepCell, \
    BertTrainOneStepWithLossScaleCell
 from mindspore.nn.wrap.loss_scale import FixedLossScaleUpdateCell
 from mindspore.nn.optim import AdamWeightDecayDynamicLR
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from ...dataset_mock import MindData
 from ...ops_common import nn, np, batch_tuple_tensor, build_construct_graph
--- a/tests/perf_test/mindrecord/imagenet/imagenet_to_mindrecord.py
+++ b/tests/perf_test/mindrecord/imagenet/imagenet_to_mindrecord.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ============================================================================
 """use ImageNetToMR tool generate mindrecord"""
 import os
 from mindspore.mindrecord import ImageNetToMR

 IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt"
@@ -21,6 +20,7 @@ IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images"
 MINDRECORD_FILE = "./imagenet.mindrecord"
 PARTITION_NUMBER = 16


 def imagenet_to_mindrecord():
    imagenet_transformer = ImageNetToMR(IMAGENET_MAP_FILE,
                                        IMAGENET_IMAGE_DIR,
@@ -28,5 +28,6 @@ def imagenet_to_mindrecord():
                                        PARTITION_NUMBER)
    imagenet_transformer.transform()


 if __name__ == '__main__':
    imagenet_to_mindrecord()
--- a/tests/perf_test/mindrecord/imagenet/imagenet_to_tfrecord.py
+++ b/tests/perf_test/mindrecord/imagenet/imagenet_to_tfrecord.py
@@ -15,6 +15,7 @@
 """generate tfrecord"""
 import collections
 import os

 import tensorflow as tf

 IMAGENET_MAP_FILE = "../../../ut/data/mindrecord/testImageNetDataWhole/labels_map.txt"
@@ -22,6 +23,7 @@ IMAGENET_IMAGE_DIR = "../../../ut/data/mindrecord/testImageNetDataWhole/images"
 TFRECORD_FILE = "./imagenet.tfrecord"
 PARTITION_NUMBER = 16


 def get_imagenet_filename_label_pic(map_file, image_dir):
    """
    Get data from imagenet.
@@ -69,18 +71,22 @@ def get_imagenet_filename_label_pic(map_file, image_dir):
                continue
            yield str(file_name), int(label), image_bytes


 def create_int_feature(values):
    """Wrap `values` as the only element of an int64-list ``tf.train.Feature``."""
    int64_list = tf.train.Int64List(value=[values])
    return tf.train.Feature(int64_list=int64_list)


 def create_string_feature(values):
    """Encode `values` as UTF-8 and wrap it as the only element of a bytes-list ``tf.train.Feature``."""
    encoded = bytes(values, encoding='utf-8')
    bytes_list = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=bytes_list)


 def create_bytes_feature(values):
    """Wrap `values` as the only element of a bytes-list ``tf.train.Feature``."""
    bytes_list = tf.train.BytesList(value=[values])
    return tf.train.Feature(bytes_list=bytes_list)


 def imagenet_to_tfrecord():
    writers = []
    for i in range(PARTITION_NUMBER):
@@ -109,5 +115,6 @@ def imagenet_to_tfrecord():

    print("Write {} total examples".format(total_written))


 if __name__ == '__main__':
    imagenet_to_tfrecord()
--- a/tests/perf_test/mindrecord/imagenet/perf_read_imagenet.py
+++ b/tests/perf_test/mindrecord/imagenet/perf_read_imagenet.py
@@ -14,17 +14,20 @@
 # ============================================================================
 """test dataset performance about mindspore.MindDataset, mindspore.TFRecordDataset, tf.data.TFRecordDataset"""
 import time
 import mindspore.dataset as ds
 from mindspore.mindrecord import FileReader

 import tensorflow as tf

 import mindspore.dataset as ds
 from mindspore.mindrecord import FileReader

 print_step = 5000


 def print_log(count):
    """Print a progress line whenever `count` is a multiple of the module-level `print_step`."""
    if count % print_step != 0:
        return
    print("Read {} rows ...".format(count))


 def use_filereader(mindrecord):
    start = time.time()
    columns_list = ["data", "label"]
@@ -38,6 +41,7 @@ def use_filereader(mindrecord):
    end = time.time()
    print("Read by FileReader - total rows: {}, cost time: {}s".format(num_iter, end - start))


 def use_minddataset(mindrecord):
    start = time.time()
    columns_list = ["data", "label"]
@@ -51,6 +55,7 @@ def use_minddataset(mindrecord):
    end = time.time()
    print("Read by MindDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))


 def use_tfrecorddataset(tfrecord):
    start = time.time()
    columns_list = ["data", "label"]
@@ -66,8 +71,10 @@ def use_tfrecorddataset(tfrecord):
    end = time.time()
    print("Read by TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))


 def use_tensorflow_tfrecorddataset(tfrecord):
    start = time.time()

    def _parse_record(example_photo):
        features = {
            'file_name': tf.io.FixedLenFeature([], tf.string),
@@ -87,6 +94,7 @@ def use_tensorflow_tfrecorddataset(tfrecord):
    end = time.time()
    print("Read by TensorFlow TFRecordDataset - total rows: {}, cost time: {}s".format(num_iter, end - start))


 if __name__ == '__main__':
    # use MindDataset
    mindrecord = './imagenet.mindrecord00'
--- a/tests/perf_test/test_lenet.py
+++ b/tests/perf_test/test_lenet.py
@@ -18,15 +18,14 @@
 import numpy as np

 import mindspore.nn as nn
 from mindspore.common.api import _executor
 import mindspore.ops.composite as C
 from mindspore import Tensor
 from mindspore.model_zoo.lenet import LeNet
 from mindspore import context
 import mindspore.ops.composite as C
 from mindspore.common.api import _executor
 from mindspore.model_zoo.lenet import LeNet

 context.set_context(mode=context.GRAPH_MODE)


 batch_size = 1
 channel = 1
 height = 32
@@ -36,6 +35,7 @@ num_class = 10

 class LeNetGrad(nn.Cell):
    """Backward of LeNet"""

    def __init__(self, network):
        super(LeNetGrad, self).__init__()
        self.grad_op = C.grad_all_with_sens
--- a/tests/perf_test/test_resnet_infer.py
+++ b/tests/perf_test/test_resnet_infer.py
@@ -17,10 +17,11 @@

 import numpy as np

 from mindspore.common.api import _executor
 from mindspore import Tensor
 from mindspore.common.api import _executor
 from .resnet_example import resnet50


 def test_compile():
    net = resnet50()
    inp = Tensor(np.ones([1, 3, 224, 224]).astype(np.float32))
--- a/tests/perf_test/test_resnet_pynative.py
+++ b/tests/perf_test/test_resnet_pynative.py
@@ -20,9 +20,9 @@
 import numpy as np

 from mindspore import Tensor
 from ..train_step_wrap import train_step_without_opt
 from .resnet_example import resnet50
 from ..vm_impl import *
 from ..train_step_wrap import train_step_without_opt


 def test_resnet50_pynative():
    net = train_step_without_opt(resnet50())
--- a/tests/perf_test/test_resnet_train.py
+++ b/tests/perf_test/test_resnet_train.py
@@ -17,13 +17,15 @@

 import numpy as np

 from mindspore.common.api import _executor
 import mindspore.context as context
 from mindspore import Tensor
 from ..train_step_wrap import train_step_with_loss_warp
 from mindspore.common.api import _executor
 from .resnet_example import resnet50
 from ..train_step_wrap import train_step_with_loss_warp

 context.set_context(mode=context.GRAPH_MODE)


 def test_train_step():
    net = train_step_with_loss_warp(resnet50())
    net.set_train()
--- a/tests/st/control/test_cont_break.py
+++ b/tests/st/control/test_cont_break.py
@@ -0,0 +1,162 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
 """ test_cont_break """
 import pytest
 import numpy as np
 from mindspore.nn import Cell
 from mindspore import Tensor, Model, context

 def run_test(netclass, count, dev):
    """Compare a cell's direct Python result with its `Model.predict` result.

    Args:
        netclass: Cell class; it is instantiated with no arguments.
        count (int): Number of random (2, 3) float32 inputs to compare.
        dev (str): Value passed as `device_target` to `context.set_context`.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=dev)
    cell = netclass()
    graph_model = Model(cell)
    for _ in range(count):
        sample = np.random.randn(2, 3).astype(np.float32)
        sample_tensor = Tensor(sample)
        # Reference value: call construct directly on the numpy array.
        expected = cell.construct(sample)
        # Value under test: the same cell executed through Model.predict.
        predicted = graph_model.predict(sample_tensor)
        np.testing.assert_array_almost_equal(expected, predicted.asnumpy(), decimal=3)

 class for_loop_with_break(Cell):
    """Control-flow fixture: a `for` loop that exits early through `break`.

    Evaluated as plain Python, `x` is doubled on iterations 0-5, then tripled
    on iteration 6, where the loop exits (overall factor 192).
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        for i in range(8):
            if i > 5:
                x *= 3
                break
            x = x * 2
            # Trailing no-op statement in the loop body.
            pass
        return x

 class for_loop_with_continue(Cell):
    """Control-flow fixture: a `for` loop that skips the tail of its body through `continue`.

    Evaluated as plain Python, `x` is doubled on iterations 0-5 and tripled on
    iterations 6 and 7 (overall factor 576).
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        for i in range(8):
            if i > 5:
                x *= 3
                # Skips the doubling below for this iteration.
                continue
            x = x * 2
        return x

 class for_loop_with_cont_break(Cell):
    """Control-flow fixture: a `for` loop containing both `continue` and `break`.

    Evaluated as plain Python, iterations 0-2 are skipped, `x` is doubled on
    iterations 3-5, then tripled on iteration 6, where the loop exits
    (overall factor 24).
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        for i in range(8):
            if i < 3:
                # In plain Python this rebinds `i` for the current iteration only;
                # the next iteration takes its value from range(8) again.
                i *= 2
                continue
            if i > 5:
                x *= 3
                break
                # Unreachable: follows `break`.
                # NOTE(review): presumably kept on purpose to exercise dead code
                # after `break` -- confirm before removing.
                x *= 2
            x = x * 2
            # Trailing no-op statement in the loop body.
            pass
        return x

 class for_nested_loop_with_break(Cell):
    """Control-flow fixture: `break` inside the inner of two nested `for` loops.

    Evaluated as plain Python, each of the 3 outer iterations multiplies `x`
    by 1.5 four times (j = 0..3) and then by 2 (j = 4) before the inner loop
    exits.
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        for i in range(3):
            for j in range(5):
                if j > 3:
                    x *= 2
                    # Leaves only the inner loop; the outer loop continues.
                    break
                x = x * 1.5
        return x

 class while_with_break(Cell):
    """Control-flow fixture: a `while` loop that exits early through `break`.

    Evaluated as plain Python, `x` is multiplied by 1.5 for i = 0..3, then by 2
    when i == 4, where the loop exits.
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        i = 0
        while i < 5:
            if i > 3:
                x *= 2
                break
            x = x * 1.5
            i += 1
        return x

 class while_with_continue(Cell):
    """Control-flow fixture: a `while` loop that uses `continue`.

    Evaluated as plain Python, `x` is multiplied by 1.5 for i = 0..3, then by 2
    when i == 4; the loop condition then fails.
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        i = 0
        while i < 5:
            if i > 3:
                x *= 2
                # The counter is advanced before `continue`, so the loop still terminates.
                i += 1
                continue
            x = x * 1.5
            i += 1
        return x

 class while_for_nested(Cell):
    """Control-flow fixture: a `for` loop with `break` nested in a `while` loop with `continue`.

    Evaluated as plain Python, `x` is multiplied by 1.5 for i = 0..3; when
    i == 4 the inner loop doubles `x` twice (j = 0, 1) and breaks at j == 2.
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        i = 0
        while i < 5:
            if i > 3:
                for j in range(3):
                    if j > 1:
                        # Leaves only the inner `for`; execution resumes at `i += 1`.
                        break
                    x *= 2
                i += 1
                continue
            x = x * 1.5
            i += 1
        return x

 class pass_branch(Cell):
    """Control-flow fixture: an `if` branch whose body is only `pass`.

    Evaluated as plain Python, `x` is multiplied by 1.5 for i = 0..3 and left
    unchanged when i == 4.
    """

    def __init__(self):
        super().__init__()

    def construct(self, x):
        i = 0
        while i < 5:
            if i > 3:
                # Branch that does nothing.
                pass
            else:
                x = x * 1.5
            i += 1
        return x

@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
 def test_cont_break():
    count = 20
    dev = 'CPU'
    run_test(for_loop_with_break, count, dev)
    run_test(for_loop_with_continue, count, dev)
    run_test(for_loop_with_cont_break, count, dev)
    run_test(for_nested_loop_with_break, count, dev)
    run_test(while_with_break, count, dev)
    run_test(while_with_continue, count, dev)
    run_test(while_for_nested, count, dev)
    run_test(pass_branch, count, dev)

--- a/tests/st/ops/gpu/test_broadcast_op.py
+++ b/tests/st/ops/gpu/test_broadcast_op.py
@@ -50,6 +50,19 @@ def test_nobroadcast():
    output_np = np.power(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.RealDiv()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np / x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Mul()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np * x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)



@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@@ -80,6 +93,17 @@ def test_broadcast():
    output_np = np.power(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.RealDiv()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np / x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Mul()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np * x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@@ -109,3 +133,15 @@ def test_broadcast_diff_dims():
    output_ms = P.Pow()(Tensor(x1_np), Tensor(x2_np))
    output_np = np.power(x1_np, x2_np)
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.RealDiv()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np / x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Mul()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np * x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)

    output_ms = P.Sub()(Tensor(x1_np), Tensor(x2_np))
    output_np = x1_np - x2_np
    assert np.allclose(output_ms.asnumpy(), output_np)
--- a/tests/st/ops/gpu/test_cast_op.py
+++ b/tests/st/ops/gpu/test_cast_op.py
@@ -49,3 +49,21 @@ def test_cast():
    assert (type0 == 'float16')
    type1 = output[1].asnumpy().dtype
    assert (type1 == 'float32')


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_cast1():
    x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int32))
    t0 = mstype.float32
    x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool))
    t1 = mstype.float32

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    net = Net()
    output = net(x0, t0, x1, t1)
    type0 = output[0].asnumpy().dtype
    assert (type0 == 'float32')
    type1 = output[1].asnumpy().dtype
    assert (type1 == 'float32')
--- a/tests/st/ops/gpu/test_softmax_op.py
+++ b/tests/st/ops/gpu/test_softmax_op.py
@@ -1,64 +1,195 @@
 # Copyright 2019 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================

 import pytest
 import numpy as np
 from mindspore import Tensor
 from mindspore.ops import operations as P
 import mindspore.nn as nn
 import mindspore.context as context

 class NetSoftmax(nn.Cell):
    def __init__(self):
        super(NetSoftmax, self).__init__()
        axis = -2
        self.softmax1 = P.Softmax()
        self.softmax2 = P.Softmax(axis)

    def construct(self, x):
        return self.softmax1(x), self.softmax2(x)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_softmax():
    x = Tensor(np.array([[0.1, 0.3, 0.6, -0.3],
                         [0.2, -0.6, 0.8, 0.6],
                         [0.6, -1.2, 0.4, 0.6]]).astype(np.float32))
    expect1 = np.ones(3)
    expect2 = np.ones(4)
    error1 = expect1 * 1.0e-6
    error2 = expect2 * 1.0e-6

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    Softmax = NetSoftmax()
    output = Softmax(x)
    outputSum1 = output[0].asnumpy().sum(axis=1)
    outputSum2 = output[1].asnumpy().sum(axis=0)
    diff1 = np.abs(outputSum1 - expect1)
    diff2 = np.abs(outputSum2 - expect2)
    assert np.all(diff1 < error1)
    assert np.all(diff2 < error2)

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    Softmax = NetSoftmax()
    output = Softmax(x)
    outputSum1 = output[0].asnumpy().sum(axis=1)
    outputSum2 = output[1].asnumpy().sum(axis=0)
    diff1 = np.abs(outputSum1 - expect1)
    diff2 = np.abs(outputSum2 - expect2)
    assert np.all(diff1 < error1)
    assert np.all(diff2 < error2)
 # Copyright 2019 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================

 import pytest
 import numpy as np
 from mindspore import Tensor
 from mindspore.ops import operations as P
 from mindspore.ops import composite as C
 import mindspore.nn as nn
 import mindspore.context as context

 class NetSoftmax(nn.Cell):
    """Cell that applies Softmax twice to the same input: default axis and axis -2."""

    def __init__(self):
        super(NetSoftmax, self).__init__()
        self.softmax1 = P.Softmax()
        self.softmax2 = P.Softmax(-2)

    def construct(self, x):
        out_default = self.softmax1(x)
        out_axis = self.softmax2(x)
        return out_default, out_axis


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_softmax():
    x = Tensor(np.array([[0.1, 0.3, 0.6, -0.3],
                         [0.2, -0.6, 0.8, 0.6],
                         [0.6, -1.2, 0.4, 0.6]]).astype(np.float32))
    expect1 = np.ones(3)
    expect2 = np.ones(4)
    error1 = expect1 * 1.0e-6
    error2 = expect2 * 1.0e-6

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    Softmax = NetSoftmax()
    output = Softmax(x)
    outputSum1 = output[0].asnumpy().sum(axis=1)
    outputSum2 = output[1].asnumpy().sum(axis=0)
    diff1 = np.abs(outputSum1 - expect1)
    diff2 = np.abs(outputSum2 - expect2)
    assert np.all(diff1 < error1)
    assert np.all(diff2 < error2)

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    Softmax = NetSoftmax()
    output = Softmax(x)
    outputSum1 = output[0].asnumpy().sum(axis=1)
    outputSum2 = output[1].asnumpy().sum(axis=0)
    diff1 = np.abs(outputSum1 - expect1)
    diff2 = np.abs(outputSum2 - expect2)
    assert np.all(diff1 < error1)
    assert np.all(diff2 < error2)


 class Net(nn.Cell):
    """Cell wrapping a single Softmax primitive built with default arguments."""

    def __init__(self):
        super(Net, self).__init__()
        self.softmax1 = P.Softmax()

    def construct(self, x):
        result = self.softmax1(x)
        return result

 class Grad(nn.Cell):
    """Cell that applies a `C.GradOperation` (get_all=True, sens_param=True) to a wrapped network."""

    def __init__(self, network):
        super(Grad, self).__init__()
        self.grad = C.GradOperation(name="get_all", get_all=True, sens_param=True)
        self.network = network

    def construct(self, input_data, sens):
        grad_fn = self.grad(self.network)
        return grad_fn(input_data, sens)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_softmax_4d():
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    x = np.array([[[[ 2.7866030e-01,  8.5578346e-01, -2.7546784e-01, -8.5833269e-01,  1.5753637e-01],
                    [-4.5145524e-01,  1.5590921e-01, -6.1947298e-01, -6.3499230e-01, -1.0625143e+00],
                    [-6.8716180e-01, -3.5565588e-01,  9.9680430e-01, -3.5519487e-01,  5.2122700e-01],
                    [-9.8125875e-01,  9.0505141e-01,  6.5961617e-01,  6.5950197e-01,  1.0319239e+00]],
                   [[-7.6588345e-01, -1.6929083e-01,  9.4459933e-01, -8.3931917e-01,  1.4916732e+00],
                    [ 8.1874236e-02, -1.9288104e-02,  7.3255712e-01, -1.4598954e-01,  1.1225560e+00],
                    [ 2.7356184e-01,  1.2557162e-01,  1.3796539e+00,  1.0073920e-01,  7.9203087e-01],
                    [-3.6947381e-01,  4.7919992e-01,  2.2421131e+00, -8.3911163e-01,  1.0814662e+00]],
                   [[-2.5838584e-01,  2.0765430e-01, -1.9366746e-01,  6.7511219e-01, -3.7492469e-01],
                    [ 4.4170797e-01, -9.9537361e-01, -3.5100895e-01, -7.8317386e-01,  1.1672008e-02],
                    [ 1.6037937e+00, -1.7059358e+00, -9.3724984e-01, -1.5016698e+00, -2.7605603e-02],
                    [ 1.6392696e-01,  1.0074581e+00, -2.7704465e+00,  8.1361882e-02,  7.9730105e-01]]],
                  [[[ 2.9516423e-01,  4.6354745e-02,  1.7318316e-01,  1.5894413e+00, -1.2769363e+00],
                    [ 2.8939021e-01, -3.8801813e-01, -1.3376296e+00, -4.9808905e-01, -3.2318991e-02],
                    [-1.1740140e+00, -1.1140432e+00, -1.4198960e-01,  5.8953021e-02, -3.6763316e-01],
                    [ 1.8660797e+00, -5.8705074e-01,  6.8757606e-01, -4.0573463e-01, -7.1130061e-01]],
                   [[ 2.6170531e-01,  5.4814044e-02,  1.3891056e-01,  3.4492522e-02, -1.0920379e-01],
                    [ 1.1420644e-01,  1.6939731e-01, -1.0413316e+00, -1.4040415e-01, -3.3280477e-01],
                    [-3.0776244e-01,  1.0526397e+00,  2.9497927e-01,  1.1266683e+00,  8.4419928e-02],
                    [-2.1593940e+00, -1.0187222e+00,  1.7475771e+00, -3.5802367e-01, -1.2900480e+00]],
                   [[ 3.2892069e-01, -1.6604670e+00, -5.7856506e-01,  5.8143520e-01,  5.9596705e-01],
                    [-1.5992336e-01, -5.9647644e-01,  1.2957820e+00, -1.0650631e-01,  7.0879894e-01],
                    [ 4.1372257e-01,  3.6408889e-01, -6.3091749e-01,  1.0573713e+00,  1.0981073e+00],
                    [-1.9162457e-01,  3.6392561e-05, -1.8338780e-01,  1.7549801e+00, -9.3534666e-01]]]]).astype(np.float32)

    dy = np.array([[[[ 2.98213929e-01,  3.10518718e+00, -1.64306939e-01, -7.33681679e-01,  5.23136854e-02],
                     [-3.47142726e-01, -1.52662742e+00,  5.26977003e-01,  5.29672280e-02, -4.34386432e-01],
                     [ 1.34674394e+00,  1.69386661e+00,  3.17139983e-01,  5.77129781e-01,  1.25290680e+00],
                     [-1.71099675e+00, -1.62872851e+00, -7.89083183e-01,  8.64615321e-01, -1.74364686e+00]],
                    [[ 1.11915946e+00, -7.06878662e-01, -6.71557069e-01, -4.50884640e-01,  2.95763493e-01],
                     [-7.64747679e-01,  1.62951392e-03, -2.84069944e-02,  7.55402744e-01, -1.02387452e+00],
                     [-5.92088878e-01,  4.47980821e-01,  4.50127304e-01, -3.99038166e-01, -5.24561822e-01],
                     [ 1.92535609e-01,  2.44671494e-01, -8.70469391e-01, -8.30129832e-02, -4.04477213e-03]],
                    [[-1.94159836e-01, -8.50215256e-01, -1.01224804e+00,  2.64235616e-01,  5.34391068e-02],
                     [-6.71353936e-01,  3.73690695e-01,  4.48037744e-01, -2.84973383e-01, -2.80129910e+00],
                     [ 6.69475198e-01,  2.08404279e+00,  4.49459851e-01,  2.50908136e+00,  9.80683088e-01],
                     [ 1.18290365e+00, -1.28790128e+00, -1.70202863e+00, -1.37078688e-01,  9.53227460e-01]]],
                   [[[-6.44128084e-01,  1.37707603e+00, -8.60912442e-01, -3.83467346e-01,  6.68365955e-01],
                     [-3.32795471e-01,  3.05202007e-01,  2.20850635e+00,  6.93960607e-01, -1.94968760e-01],
                     [-3.35764170e-01,  1.10562348e+00, -1.13264215e+00, -1.08296621e+00, -6.53923571e-01],
                     [-4.64974046e-01,  8.83257568e-01, -1.70353889e+00, -4.48120385e-01, -1.76938546e+00]],
                    [[-3.80976290e-01, -1.49393475e+00, -8.51393223e-01, -1.49780405e+00, -1.24160886e-01],
                     [-7.18508661e-02,  2.44543999e-01,  3.29225749e-01,  7.09274471e-01, -9.26648498e-01],
                     [ 6.67312503e-01, -1.08737612e+00, -9.63039994e-01, -3.22715081e-02, -4.03802067e-01],
                     [-5.97982287e-01, -1.40739769e-01,  2.80631828e+00,  5.72278857e-01,  2.05998325e+00]],
                    [[ 3.46207246e-02,  7.34213948e-01,  1.45563519e+00,  1.02045703e+00,  1.40984225e+00],
                     [ 4.14457440e-01, -8.74118507e-01, -4.21902031e-01,  7.87168801e-01, -1.48280108e+00],
                     [ 1.42688036e+00, -2.02695489e+00,  9.26816165e-01,  9.37691629e-01,  7.85577714e-01],
                     [-6.59893751e-01,  1.14681525e-02, -5.79456389e-01, -1.65206456e+00,  4.37116653e-01]]]]).astype(np.float32)

    expect_x = np.array([[[[0.21919312,  0.3903627,  0.12594244, 0.07031325, 0.19418849],
                           [0.19778392,  0.36304963, 0.16719443, 0.1646197,  0.10735231],
                           [0.07986113,  0.11125171, 0.43020225, 0.11130301, 0.26738194],
                           [0.03936873,  0.25963634, 0.20313013, 0.20310691, 0.29475793]],
                          [[0.05308856,  0.09640461, 0.29366633, 0.04932966, 0.50751084],
                           [0.13426398,  0.12134594, 0.2573638,  0.10690536, 0.38012096],
                           [0.13503104,  0.11645612, 0.40813455, 0.11359984, 0.22677852],
                           [0.04576753,  0.10693795, 0.6233836,  0.02861518, 0.19529575]],
                          [[0.14096586,  0.2246532,  0.15039064, 0.35853124, 0.12545899],
                           [0.37957698,  0.09019516, 0.17180163, 0.11151683, 0.2469094 ],
                           [0.7375885,   0.0269412,  0.05811028, 0.03304673, 0.14431332],
                           [0.16174863,  0.37599453, 0.00859921, 0.1489303,  0.3047274 ]]],
                         [[[0.15335402,  0.11957449, 0.13574363, 0.55949026, 0.03183762],
                           [0.34669915,  0.17609946, 0.06813136, 0.15774474, 0.2513253 ],
                           [0.09487908,  0.10074313, 0.26630113, 0.32556766, 0.21250896],
                           [0.6357843,   0.05469263, 0.19565557, 0.0655652,  0.0483023 ]],
                          [[0.23898226, 0.19431841, 0.21136671, 0.19040942, 0.16492325],
                           [0.2641041,   0.27909,    0.08316323, 0.20473833, 0.16890427],
                           [0.08062991,  0.3142761,  0.14732064, 0.33842432, 0.11934903],
                           [0.01604616,  0.05020634, 0.79826504, 0.09720672, 0.03827571]],
                          [[0.24191543, 0.03308899, 0.09762195, 0.31140763, 0.31596598],
                           [0.10669514,  0.06895282, 0.45745608, 0.11254943, 0.25434658],
                           [0.16156755,  0.15374413, 0.05684244, 0.3075298,  0.32031605],
                           [0.09346025,  0.11320464, 0.09423324, 0.65467626, 0.04442552]]]]).astype(np.float32)

    expect_dx = np.array([[[[-0.20103945,  0.737705  , -0.17376284, -0.1370458 , -0.22585672],
                            [ 0.04461281, -0.34632078,  0.18386088,  0.10299816, 0.01484894],
                            [ 0.04113413,  0.09592049, -0.22135337, -0.02833145, 0.11263024],
                            [-0.0284293 , -0.1661311 ,  0.04058228,  0.37645525, -0.22247711]],
                           [[ 0.06355994, -0.06061868, -0.17428297, -0.01839012,  0.1897318 ],
                            [-0.04652473,  0.05094835,  0.10032654,  0.12546772, -0.23021786],
                            [-0.07882182,  0.05314343,  0.18712361, -0.04438123, -0.11706398],
                            [ 0.03219109,  0.08079126, -0.22419631,  0.01224192,  0.09897206]],
                           [[ 0.01057316, -0.1305348 , -0.11175273,  0.19124077,  0.04047358],
                            [ 0.07448982,  0.11195826,  0.2260284 ,  0.06497248, -0.47744888],
                            [-0.09664576,  0.03458005, -0.02039931,  0.05646288,  0.02600216],
                            [ 0.1973966 , -0.47014874, -0.01431374, -0.01483214,  0.30189803]]],
                          [[[-0.06132338,  0.19386888, -0.08370841, -0.07789247,  0.02905542],
                            [-0.16714299,  0.0274538 ,  0.14029635,  0.08591694, -0.08652411],
                            [ 0.03585254,  0.18327834, -0.11158065, -0.12024056,  0.01269035],
                            [ 0.14654502,  0.0863447 , -0.19723451,  0.01621746, -0.05187264]],
                           [[ 0.11614501, -0.12182987,  0.00329342, -0.12011584,  0.12250728],
                            [-0.03623635,  0.05001016,  0.02194443,  0.13183522, -0.16755345],
                            [ 0.09322704, -0.18807998, -0.06984743,  0.15454148,  0.01015892],
                            [-0.04743218, -0.12545264,  0.35787603, -0.1735842 , -0.01140684]],
                           [[-0.21854429, -0.00674347,  0.05053139,  0.02567403,  0.14908233],
                            [ 0.09731252, -0.02596174,  0.03463032,  0.14460044, -0.2505815 ],
                            [ 0.1478814 , -0.3902862 ,  0.02360253,  0.13103928,  0.087763  ],
                            [ 0.04834083,  0.13455458,  0.05632052, -0.3109298 ,  0.07171366]]]]).astype(np.float32)
    y = Net()(Tensor(x))
    assert np.allclose(y.asnumpy(), expect_x)

    dx = Grad(Net())(Tensor(x), Tensor(dy))
    assert np.allclose(dx[0].asnumpy(), expect_dx)
--- a/tests/train_step_wrap.py
+++ b/tests/train_step_wrap.py
@@ -16,15 +16,15 @@
 train step wrap
 """
 import mindspore.nn as nn
 from mindspore.ops import functional as F
 from mindspore import ParameterTuple
 from mindspore.ops import composite as C
 from mindspore.ops import operations as P
 from mindspore import Parameter, ParameterTuple


 class TrainStepWrap(nn.Cell):
    """
    TrainStepWrap definition
    """

    def __init__(self, network):
        super(TrainStepWrap, self).__init__()
        self.network = network
@@ -39,10 +39,12 @@ class TrainStepWrap(nn.Cell):
        grads = self.grad(self.network, weights)(x, label)
        return self.optimizer(grads)


 class NetWithLossClass(nn.Cell):
    """
    NetWithLossClass definition
    """

    def __init__(self, network):
        super(NetWithLossClass, self).__init__(auto_prefix=False)
        self.loss = nn.SoftmaxCrossEntropyWithLogits()
@@ -61,6 +63,7 @@ class TrainStepWrap2(nn.Cell):
    """
    TrainStepWrap2 definition
    """

    def __init__(self, network, sens):
        super(TrainStepWrap2, self).__init__()
        self.network = network
@@ -76,13 +79,16 @@ class TrainStepWrap2(nn.Cell):
        grads = self.grad(self.network, weights)(x, self.sens)
        return self.optimizer(grads)


 def train_step_with_sens(network, sens):
    """Return a `TrainStepWrap2` built from `network` and `sens`."""
    wrapped = TrainStepWrap2(network, sens)
    return wrapped


 class TrainStepWrapWithoutOpt(nn.Cell):
    """
    TrainStepWrapWithoutOpt definition
    """

    def __init__(self, network):
        super(TrainStepWrapWithoutOpt, self).__init__()
        self.network = network
@@ -93,5 +99,6 @@ class TrainStepWrapWithoutOpt(nn.Cell):
        grads = self.grad(self.network, self.weights)(x, label)
        return grads


 def train_step_without_opt(network):
    """Wrap `network` in `NetWithLossClass`, then in `TrainStepWrapWithoutOpt`."""
    net_with_loss = NetWithLossClass(network)
    return TrainStepWrapWithoutOpt(net_with_loss)
--- a/tests/ut/cpp/dataset/mind_record_op_test.cc
+++ b/tests/ut/cpp/dataset/mind_record_op_test.cc
@@ -62,7 +62,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordBasic) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetColumnsToLoad(column_list);
@@ -132,7 +133,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordSample) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetColumnsToLoad(column_list)
@@ -203,7 +205,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordShuffle) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetColumnsToLoad(column_list)
@@ -277,7 +280,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordCategory) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetColumnsToLoad(column_list)
@@ -345,7 +349,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordRepeat) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetColumnsToLoad(column_list);
@@ -426,7 +431,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordBlockReaderRepeat) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetBlockReader()
@@ -507,7 +513,8 @@ TEST_F(MindDataTestMindRecordOp, TestMindRecordInvalidColumnList) {

  std::shared_ptr<MindRecordOp> my_mindrecord_op;
  MindRecordOp::Builder builder;
  builder.SetDatasetFile(mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0")
  builder.SetDatasetFile({mindrecord_root_path_ + "/testMindDataSet/testImageNetData/imagenet.mindrecord0"})
      .SetLoadDataset(true)
      .SetRowsPerBuffer(3)
      .SetNumMindRecordWorkers(4)
      .SetColumnsToLoad(column_list);
--- a/tests/ut/cpp/device/ascend_kernel_select_test.cc
+++ b/tests/ut/cpp/device/ascend_kernel_select_test.cc
@@ -1,345 +0,0 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "mindspore/ccsrc/device/ascend/kernel_select_ascend.h"
 #include "common/common_test.h"
 #include "session/kernel_graph.h"
 #include "kernel/kernel.h"
 #include "session/anf_runtime_algorithm.h"
 #include "utils/utils.h"
 #include "operator/ops.h"
 #include "mindspore/ccsrc/device/kernel_info.h"
 #include "mindspore/ccsrc/kernel/kernel_build_info.h"
 #include <vector>
 namespace mindspore {
 namespace device {
 namespace ascend {
 namespace {
 using KernelInfo = device::KernelInfo;
 using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder;
 using KernelBuildInfo = kernel::KernelBuildInfo;
 using KernelGraph = session::KernelGraph;
 using KernelBuildInfoPtr = std::shared_ptr<KernelBuildInfo>;
 using KernelBuilderPtr = std::shared_ptr<KernelBuildInfoBuilder>;
 using Shape = std::vector<size_t>;
 using ShapeList = std::vector<Shape>;
 // Indices into the per-candidate match-count vector filled by UpdateCurMatchCounts.
 // PriorityChooseItem compares these vectors starting at index 0, so a lower enumerator has higher priority.
 enum MatchCountPriority {
  MATCH_COUNT_PRIORITY_BEGIN = 0,
  MATCH_FORMAT_COUNT = MATCH_COUNT_PRIORITY_BEGIN,  // inputs whose format equals the previous node's output format
  MATCH_DTYPE_COUNT,         // inputs whose device dtype equals the previous node's output device dtype
  MATCH_NZ_FORMAT_COUNT,     // inputs declared with kOpFormat_FRAC_NZ
  MATCH_5D_FORMAT_COUNT,     // inputs declared with kOpFormat_NC1HWC0
  MATCH_OUTPUT_DTYPE_COUNT,  // outputs whose device dtype equals the inferred output dtype
  MATCH_COUNT_PRIORITY_END   // number of counters; used as the minimum size of the count vector
 };

 // Format names that IsShapeMatchFormat recognises; any format outside this set makes it raise an ArgumentError.
 const std::set<std::string> kOpFormatList = {
  kOpFormat_DEFAULT, kOpFormat_NC1KHKWHWC0, kOpFormat_ND,     kOpFormat_NCHW,      kOpFormat_NHWC,
  kOpFormat_HWCN,    kOpFormat_NC1HWC0,     kOpFormat_FRAC_Z, kOpFormat_C1HWNCoC0, kOpFormat_FRAC_NZ};

 // Decide whether a tensor with the given shape may be stored in the given format.
 // Raises ArgumentError when the format is not listed in kOpFormatList.
 bool IsShapeMatchFormat(const std::vector<size_t> &shape, const std::string &format) {
  if (kOpFormatList.count(format) == 0) {
    MS_EXCEPTION(ArgumentError) << "got the unknow format " << format;
  }
  // The default format accepts every shape, and an empty shape (a scalar) is accepted by every format.
  if (format == kOpFormat_DEFAULT || shape.empty()) {
    return true;
  }
  const size_t rank = shape.size();
  if (rank > kShapeSupportFormatMap.size()) {
    return false;
  }
  // FRAC_NZ is accepted only when neither of the two innermost dimensions is a multiple of 16.
  if (rank >= 2 && format == kOpFormat_FRAC_NZ) {
    const bool last_unaligned = shape[rank - 1] % 16 != 0;
    return last_unaligned && shape[rank - 2] % 16 != 0;
  }
  // Otherwise the format must be listed among the formats supported for this rank.
  const auto &formats_for_rank = kShapeSupportFormatMap[rank - 1];
  return formats_for_rank.find(format) != formats_for_rank.end();
 }

 // Check that every output and input shape of kernel_node is compatible with the format that
 // kernel_build_info declares for it. Outputs are checked before inputs.
 bool IsValidKernelInfo(const std::shared_ptr<CNode> &kernel_node, const kernel::KernelBuildInfo &kernel_build_info) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // A shape passes when it matches the format; a zero-sized dimension in a matching shape is reported as an error.
  auto shape_fits_format = [](const std::vector<size_t> &dims, const std::string &fmt) -> bool {
    if (!IsShapeMatchFormat(dims, fmt)) {
      return false;
    }
    for (size_t pos = 0; pos < dims.size(); ++pos) {
      if (dims[pos] == 0) {
        MS_EXCEPTION(ArgumentError) << "dimension size of the tensor shape should be a positive integer, but got ["
                                    << dims[pos] << "]";
      }
    }
    return true;
  };
  for (size_t out_idx = 0; out_idx < kernel_build_info.GetOutputNum(); ++out_idx) {
    auto out_shape = AnfAlgo::GetOutputInferShape(kernel_node, out_idx);
    const bool out_ok = shape_fits_format(out_shape, kernel_build_info.GetOutputFormat(out_idx));
    if (!out_ok) {
      return false;
    }
  }
  for (size_t in_idx = 0; in_idx < kernel_build_info.GetInputNum(); ++in_idx) {
    auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, in_idx);
    const bool in_ok = shape_fits_format(in_shape, kernel_build_info.GetInputFormat(in_idx));
    if (!in_ok) {
      return false;
    }
  }
  return true;
 }

 // Return true when the device data types declared by kernel_build_info agree with the data types
 // known for cnode's inputs and outputs. Inputs whose reference type is kTypeUnknown are not compared.
 bool MatchInferOutputDataType(const CNodePtr &cnode, const kernel::KernelBuildInfo &kernel_build_info) {
  MS_EXCEPTION_IF_NULL(cnode);
  // Inputs: input(0) of the cnode is skipped, so kernel input i corresponds to cnode->input(i + 1).
  for (size_t in_idx = 0; in_idx < kernel_build_info.GetInputNum(); ++in_idx) {
    AnfNodePtr input_node = cnode->input(in_idx + 1);
    MS_EXCEPTION_IF_NULL(input_node);
    const bool is_weight =
      input_node->isa<Parameter>() && AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>());
    // A weight is compared against its own device data type, any other input against the inferred type.
    TypeId reference_type;
    if (is_weight) {
      reference_type = AnfAlgo::GetOutputDeviceDataType(input_node, 0);
    } else {
      reference_type = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, in_idx);
    }
    if (reference_type != kTypeUnknown && kernel_build_info.GetInputDeviceType(in_idx) != reference_type) {
      return false;
    }
  }
  // Outputs: every declared device type must equal the inferred output type.
  for (size_t out_idx = 0; out_idx < kernel_build_info.GetOutputNum(); ++out_idx) {
    const auto inferred_type = AnfAlgo::GetOutputInferDataType(cnode, out_idx);
    if (kernel_build_info.GetOutputDeviceType(out_idx) != inferred_type) {
      return false;
    }
  }
  return true;
 }

 /**
 * compare too vector by priority,select a better vector,like compare too num,first compare highest num location,if
 * equal then next num location
 * example:[3,1,1,1] > [2,2,2,2] > [2,2,1,2] > [2,1,1,3]
 */
 bool PriorityChooseItem(const std::vector<int> &cur_item, std::vector<int> *best_item) {
  MS_EXCEPTION_IF_NULL(best_item);
  if (cur_item.size() != best_item->size()) {
    MS_LOG(ERROR) << "item size should be same!";
    return false;
  }
  // Update the best_item by comparing the cur_item and best_item
  for (size_t i = 0; i < cur_item.size(); i++) {
    if (cur_item[i] > best_item->at(i)) {
      *best_item = cur_item;
      return true;
    } else if (cur_item[i] == best_item->at(i)) {
      continue;
    } else {
      return false;
    }
  }
  return false;
 }

 // Accumulate, for one candidate kernel_build_info, the counters indexed by MatchCountPriority into
 // *cur_kernelinfo_match_counts. The vector must hold at least MATCH_COUNT_PRIORITY_END elements.
 void UpdateCurMatchCounts(const kernel::KernelBuildInfo &kernel_build_info, const std::shared_ptr<CNode> &kernel_node,
                          std::vector<int> *const cur_kernelinfo_match_counts) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  MS_EXCEPTION_IF_NULL(cur_kernelinfo_match_counts);
  if (cur_kernelinfo_match_counts->size() < MATCH_COUNT_PRIORITY_END) {
    MS_EXCEPTION(ArgumentError) << "Out of range cur_kernelinfo_match_counts " << MATCH_COUNT_PRIORITY_END;
  }
  auto &counts = *cur_kernelinfo_match_counts;
  for (size_t in_idx = 0; in_idx < AnfAlgo::GetInputTensorNum(kernel_node); ++in_idx) {
    AnfNodePtr input_node = kernel_node->input(in_idx + 1);
    MS_EXCEPTION_IF_NULL(input_node);
    // A weight parameter whose device data type is still unknown does not take part in the scoring.
    if (input_node->isa<Parameter>()) {
      auto weight_candidate = input_node->cast<ParameterPtr>();
      const bool skip_input = AnfAlgo::IsParameterWeight(weight_candidate) &&
                              AnfAlgo::GetOutputDeviceDataType(weight_candidate, 0) == kTypeUnknown;
      if (skip_input) {
        continue;
      }
    }
    const auto &input_format = kernel_build_info.GetInputFormat(in_idx);
    if (input_format == AnfAlgo::GetPrevNodeOutputFormat(kernel_node, in_idx)) {
      ++counts[MATCH_FORMAT_COUNT];
    }
    if (kernel_build_info.GetInputDeviceType(in_idx) ==
        AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, in_idx)) {
      ++counts[MATCH_DTYPE_COUNT];
    }
    if (input_format == kOpFormat_FRAC_NZ) {
      ++counts[MATCH_NZ_FORMAT_COUNT];
    }
    if (input_format == kOpFormat_NC1HWC0) {
      ++counts[MATCH_5D_FORMAT_COUNT];
    }
  }

  // Count the outputs whose declared device dtype equals the dtype inferred for the node.
  for (size_t out_idx = 0; out_idx < AnfAlgo::GetOutputTensorNum(kernel_node); ++out_idx) {
    const auto inferred_type = AnfAlgo::GetOutputInferDataType(kernel_node, out_idx);
    if (kernel_build_info.GetOutputDeviceType(out_idx) == inferred_type) {
      ++counts[MATCH_OUTPUT_DTYPE_COUNT];
    }
  }
 }

 // Apply the settings shared by every builder in this test: OPAQUE fusion type, AUTO_DIFF_KERNEL kernel type
 // and AICORE processor. A null builder is reported through MS_EXCEPTION_IF_NULL instead of being dereferenced.
 void SetKernelBuildInfo(KernelBuilderPtr builder) {
  MS_EXCEPTION_IF_NULL(builder);
  builder->SetFusionType(kernel::OPAQUE);
  builder->SetKernelType(AUTO_DIFF_KERNEL);
  builder->SetProcessor(kernel::AICORE);
 }

 void test_select(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list) {
  std::vector<int> most_match_counts = {-1, -1, -1, -1, -1};
  int selected_index = -1;
  for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
    std::vector<int> cur_kernel_info_match_counts = {0, 0, 0, 0, 0};
    if (!IsValidKernelInfo(kernel_node, *(kernel_info_list[info_index]))) {
      continue;
    }
    if (!MatchInferOutputDataType(kernel_node, *(kernel_info_list[info_index]))) {
      continue;
    }
    std::shared_ptr<kernel::KernelBuildInfo> kernel_info_ptr = kernel_info_list[info_index];
    UpdateCurMatchCounts(*kernel_info_ptr, kernel_node, &cur_kernel_info_match_counts);
    // Currently the selection policy is the match format count first, and then is datatype counts.
    if (PriorityChooseItem(cur_kernel_info_match_counts, &most_match_counts)) {
      selected_index = SizeToInt(info_index);
    }
  }
  if (selected_index == -1) {
    MS_EXCEPTION(NotExistsError) << "" << kernel_node->DebugString() << " Cannot find valid kernel Info !";
  }
  auto index = IntToSize(selected_index);
  if (index >= kernel_info_list.size()) {
    MS_EXCEPTION(ArgumentError) << "index outof range";
  }
  std::shared_ptr<kernel::KernelBuildInfo> selected_kernel_info_ptr = kernel_info_list[index];
  MS_EXCEPTION_IF_NULL(selected_kernel_info_ptr);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info_ptr, kernel_node.get());
 }

 void SetParentAbstract(std::vector<AnfNodePtr> parent_list, std::vector<std::vector<size_t>> shapes,
                       std::vector<TypeId> types) {
  for (const auto &node : parent_list) {
    AnfAlgo::SetOutputInferTypeAndShape(types, shapes, node.get());
  }
 }
 }  // namespace
 // Test fixture for the kernel-selection tests in this file; it needs no per-test set-up or tear-down.
 class AscendKernelSelctTest : public UT::Common {
 public:
  AscendKernelSelctTest() = default;
  // Intentionally empty: the tests build all their state locally.
  void SetUp() override {}
  void TearDown() override {}
 };

 // Build a tiny graph (a, b -> c) and check which candidate kernel info test_select attaches to c
 // in two scenarios that differ in the formats produced by a and b and in c's inferred shape.
 TEST_F(AscendKernelSelctTest, TestSelect) {
  std::vector<KernelBuilderPtr> build_list;
  std::vector<TypeId> type_list = {kNumberTypeFloat32};
  // Five builders are created; only build_list[0..3] are used below.
  for (size_t i = 0; i <= 4; ++i) {
    build_list.push_back(std::make_shared<KernelBuildInfoBuilder>());
    SetKernelBuildInfo(build_list[i]);
    build_list[i]->SetInputsDeviceType(type_list);
    build_list[i]->SetOutputsDeviceType(type_list);
  }

  std::vector<std::string> nd_fmt = {kOpFormat_DEFAULT};
  std::vector<std::string> nz_fmt = {kOpFormat_FRAC_NZ};
  auto anf_graph = std::make_shared<KernelGraph>();

  // Innermost dimensions that are multiples of 16 must not be given format NZ (see IsShapeMatchFormat).
  Shape nd_shapes = {2, 32, 224, 224};

  // Innermost dimensions that are not multiples of 16, so format NZ is allowed for this shape.
  Shape nz_shapes = {3, 3, 5, 5};
  auto add_value = NewValueNode(prim::kPrimTensorAdd);
  auto a_node = anf_graph->NewCNode(std::vector<AnfNodePtr>{add_value});
  auto b_node = anf_graph->NewCNode(std::vector<AnfNodePtr>{add_value});
  std::vector<AnfNodePtr> parent_list = {add_value, a_node, b_node};

  auto c_node = anf_graph->NewCNode(parent_list);

  // Scenario 1:
  // a   b
  //  \ /
  //   c
  // a & b:  kernel_info:{output_format:{nz},dtype:{kNumberTypeFloat32}}
  //     infer_dtype:{kNumberTypeFloat32},infer_shape:{{3, 3, 5, 5}}
  // c:  infer_dtype:{kNumberTypeFloat32},infer_shape:{{2, 32, 224, 224}}

  // set a & b's info (parent_list also contains the add_value node itself)
  SetParentAbstract(parent_list, ShapeList{nz_shapes}, type_list);
  // set abstract c
  AnfAlgo::SetOutputInferTypeAndShape(type_list, ShapeList{nd_shapes}, c_node.get());
  // set format of kernel info: builders 0/1 describe a and b, builders 2/3 are the candidates for c
  build_list[0]->SetOutputsFormat(nz_fmt);
  build_list[1]->SetOutputsFormat(nz_fmt);

  build_list[2]->SetInputsFormat(std::vector<std::string>{nd_fmt[0], nd_fmt[0]});
  build_list[3]->SetInputsFormat(std::vector<std::string>{nz_fmt[0], nz_fmt[0]});
  build_list[2]->SetInputsDeviceType(std::vector<TypeId>{kNumberTypeFloat32, kNumberTypeFloat32});
  build_list[3]->SetInputsDeviceType(std::vector<TypeId>{kNumberTypeFloat32, kNumberTypeFloat32});
  build_list[2]->SetOutputsFormat(nd_fmt);
  build_list[3]->SetOutputsFormat(nz_fmt);
  std::vector<KernelBuildInfoPtr> select_info_list;
  // set select info list
  select_info_list.emplace_back(build_list[2]->Build());
  select_info_list.emplace_back(build_list[3]->Build());

  // set device info for a & b
  AnfAlgo::SetSelectKernelBuildInfo(build_list[0]->Build(), a_node.get());
  AnfAlgo::SetSelectKernelBuildInfo(build_list[1]->Build(), b_node.get());

  // The NZ candidate declares an NZ output, which c's 224x224 inner dimensions rule out, so the
  // default-format candidate is expected to be chosen.
  test_select(c_node, select_info_list);
  EXPECT_EQ(AnfAlgo::GetInputFormat(c_node, 0), kOpFormat_DEFAULT);
  EXPECT_EQ(AnfAlgo::GetInputFormat(c_node, 1), kOpFormat_DEFAULT);

  // Scenario 2:
  // a   b
  //  \ /
  //   c
  // a: kernel_info:{output_format:{5d},dtype:{kNumberTypeFloat32}}
  //    infer_dtype:{kNumberTypeFloat32},infer_shape:{{3, 3, 5, 5}}
  // b:  kernel_info:{output_format:{nz},dtype:{kNumberTypeFloat32}}
  //     infer_dtype:{kNumberTypeFloat32},infer_shape:{{3, 3, 5, 5}}
  // c:  infer_dtype:{kNumberTypeFloat32},infer_shape:{{3, 3, 5, 5}}

  // set a & b's info
  SetParentAbstract(parent_list, ShapeList{nz_shapes}, type_list);
  // set abstract c
  AnfAlgo::SetOutputInferTypeAndShape(type_list, ShapeList{nz_shapes}, c_node.get());
  // set format of kernel info
  build_list[0]->SetOutputsFormat(std::vector<std::string>{kOpFormat_NC1HWC0});
  build_list[1]->SetOutputsFormat(nz_fmt);

  build_list[2]->SetInputsFormat(std::vector<std::string>{kOpFormat_NC1HWC0, nd_fmt[0]});
  build_list[3]->SetInputsFormat(std::vector<std::string>{nd_fmt[0], nz_fmt[0]});
  build_list[2]->SetInputsDeviceType(std::vector<TypeId>{kNumberTypeFloat32, kNumberTypeFloat32});
  build_list[3]->SetInputsDeviceType(std::vector<TypeId>{kNumberTypeFloat32, kNumberTypeFloat32});
  build_list[2]->SetOutputsFormat(nd_fmt);
  build_list[3]->SetOutputsFormat(nz_fmt);
  // set select info list
  // NOTE(review): select_info_list is not cleared, so it now holds four entries (two from scenario 1).
  // Whether the first two still describe the scenario-1 formats depends on whether Build() returns a
  // snapshot or the builder's live object -- TODO confirm that the expectations below do not rely on this.
  select_info_list.emplace_back(build_list[2]->Build());
  select_info_list.emplace_back(build_list[3]->Build());

  // set device info for a & b
  AnfAlgo::SetSelectKernelBuildInfo(build_list[0]->Build(), a_node.get());
  AnfAlgo::SetSelectKernelBuildInfo(build_list[1]->Build(), b_node.get());

  // The candidate with inputs {default, nz} is expected to be chosen.
  test_select(c_node, select_info_list);
  EXPECT_EQ(AnfAlgo::GetInputFormat(c_node, 0), kOpFormat_DEFAULT);
  EXPECT_EQ(AnfAlgo::GetInputFormat(c_node, 1), kOpFormat_FRAC_NZ);
 }
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore
--- a/tests/ut/cpp/mindrecord/ut_shard_operator_test.cc
+++ b/tests/ut/cpp/mindrecord/ut_shard_operator_test.cc
@@ -63,7 +63,7 @@ TEST_F(TestShardOperator, TestShardSampleBasic) {
  std::vector<std::shared_ptr<ShardOperator>> ops;
  ops.push_back(std::make_shared<ShardSample>(kSampleCount));
  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -89,7 +89,7 @@ TEST_F(TestShardOperator, TestShardSampleWrongNumber) {
  ops.push_back(std::make_shared<ShardSample>(kNum, kDen));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -115,7 +115,7 @@ TEST_F(TestShardOperator, TestShardSampleRatio) {
  ops.push_back(std::make_shared<ShardSample>(kNum, kDen));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -144,7 +144,7 @@ TEST_F(TestShardOperator, TestShardSamplePartition) {
  ASSERT_TRUE(partitions.second == 2);

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -168,7 +168,7 @@ TEST_F(TestShardOperator, TestShardPkSamplerBasic) {
  ops.push_back(std::make_shared<ShardPkSample>("label", 2));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name},true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -193,7 +193,7 @@ TEST_F(TestShardOperator, TestShardPkSamplerNumClass) {
  ops.push_back(std::make_shared<ShardPkSample>("label", 2, 3, 0));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name},true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -223,7 +223,7 @@ TEST_F(TestShardOperator, TestShardCategory) {
  ops.push_back(std::make_shared<ShardCategory>(categories));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -254,7 +254,7 @@ TEST_F(TestShardOperator, TestShardShuffle) {
  ops.push_back(std::make_shared<ShardShuffle>(1));

  ShardReader dataset;
  dataset.Open(file_name, 16, column_list, ops);
  dataset.Open({file_name}, true, 16, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -279,7 +279,7 @@ TEST_F(TestShardOperator, TestShardSampleShuffle) {
  ops.push_back(std::make_shared<ShardShuffle>(1));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -306,7 +306,7 @@ TEST_F(TestShardOperator, TestShardShuffleSample) {
  ops.push_back(std::make_shared<ShardSample>(kSampleSize));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true,  4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -333,7 +333,7 @@ TEST_F(TestShardOperator, TestShardSampleShuffleSample) {
  ops.push_back(std::make_shared<ShardSample>(35));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -357,11 +357,11 @@ TEST_F(TestShardOperator, TestShardShuffleCompare) {
  ops.push_back(std::make_shared<ShardShuffle>(1));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true,  4, column_list, ops);
  dataset.Launch();

  ShardReader compare_dataset;
  compare_dataset.Open(file_name, 4, column_list);
  compare_dataset.Open({file_name},true, 4, column_list);
  compare_dataset.Launch();

  int i = 0;
@@ -396,7 +396,7 @@ TEST_F(TestShardOperator, TestShardCategoryShuffle1) {
  ops.push_back(std::make_shared<ShardShuffle>(21));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -430,7 +430,7 @@ TEST_F(TestShardOperator, TestShardCategoryShuffle2) {
  ops.push_back(std::make_shared<ShardCategory>(categories));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -464,7 +464,7 @@ TEST_F(TestShardOperator, TestShardCategorySample) {
  ops.push_back(std::make_shared<ShardCategory>(categories));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name},true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
@@ -502,7 +502,7 @@ TEST_F(TestShardOperator, TestShardCategorySampleShuffle) {
  ops.push_back(std::make_shared<ShardShuffle>(100));

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  int i = 0;
--- a/tests/ut/cpp/mindrecord/ut_shard_reader_test.cc
+++ b/tests/ut/cpp/mindrecord/ut_shard_reader_test.cc
@@ -55,7 +55,7 @@ TEST_F(TestShardReader, TestShardReaderGeneral) {
  auto column_list = std::vector<std::string>{"file_name"};

  ShardReader dataset;
  dataset.Open(file_name, 4, column_list);
  dataset.Open({file_name}, true, 4, column_list);
  dataset.Launch();

  while (true) {
@@ -78,7 +78,7 @@ TEST_F(TestShardReader, TestShardReaderSample) {
  std::vector<std::shared_ptr<ShardOperator>> ops;
  ops.push_back(std::make_shared<ShardSample>(17));
  ShardReader dataset;
  dataset.Open(file_name, 4, column_list, ops);
  dataset.Open({file_name}, true, 4, column_list, ops);
  dataset.Launch();

  while (true) {
@@ -103,7 +103,7 @@ TEST_F(TestShardReader, TestShardReaderBlock) {
  ops.push_back(std::make_shared<ShardSample>(3));
  ShardReader dataset;
  const bool kBlockReader = true;
  dataset.Open(file_name, 4, column_list, ops, kBlockReader);
  dataset.Open({file_name}, true, 4, column_list, ops, kBlockReader);
  dataset.Launch();

  while (true) {
@@ -123,7 +123,7 @@ TEST_F(TestShardReader, TestShardReaderEasy) {
  MS_LOG(INFO) << FormatInfo("Test read imageNet");
  std::string file_name = "./imagenet.shard01";
  ShardReader dataset;
  dataset.Open(file_name);
  dataset.Open({file_name}, true);
  dataset.Launch();

  while (true) {
@@ -143,7 +143,7 @@ TEST_F(TestShardReader, TestShardReaderColumnNotInIndex) {
  std::string file_name = "./imagenet.shard01";
  auto column_list = std::vector<std::string>{"label"};
  ShardReader dataset;
  MSRStatus ret = dataset.Open(file_name, 4, column_list);
  MSRStatus ret = dataset.Open({file_name}, true,  4, column_list);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

@@ -164,7 +164,7 @@ TEST_F(TestShardReader, TestShardReaderColumnNotInSchema) {
  std::string file_name = "./imagenet.shard01";
  auto column_list = std::vector<std::string>{"file_namex"};
  ShardReader dataset;
  MSRStatus ret = dataset.Open(file_name, 4, column_list);
  MSRStatus ret = dataset.Open({file_name}, true, 4, column_list);
  ASSERT_EQ(ret, ILLEGAL_COLUMN_LIST);
 }

@@ -172,7 +172,7 @@ TEST_F(TestShardReader, TestShardVersion) {
  MS_LOG(INFO) << FormatInfo("Test shard version");
  std::string file_name = "./imagenet.shard01";
  ShardReader dataset;
  MSRStatus ret = dataset.Open(file_name, 4);
  MSRStatus ret = dataset.Open({file_name}, true,  4);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

@@ -195,7 +195,7 @@ TEST_F(TestShardReader, TestShardReaderDir) {
  auto column_list = std::vector<std::string>{"file_name"};

  ShardReader dataset;
  MSRStatus ret = dataset.Open(file_name, 4, column_list);
  MSRStatus ret = dataset.Open({file_name}, true,  4, column_list);
  ASSERT_EQ(ret, FAILED);
 }

@@ -205,7 +205,7 @@ TEST_F(TestShardReader, TestShardReaderConsumer) {
  auto column_list = std::vector<std::string>{"file_name"};

  ShardReader dataset;
  dataset.Open(file_name, -481565535, column_list);
  dataset.Open({file_name}, true,  -481565535, column_list);
  dataset.Launch();

  while (true) {
--- a/tests/ut/cpp/mindrecord/ut_shard_segment_test.cc
+++ b/tests/ut/cpp/mindrecord/ut_shard_segment_test.cc
@@ -59,7 +59,7 @@ TEST_F(TestShardSegment, TestShardSegment) {
  std::string file_name = "./imagenet.shard01";

  ShardSegment dataset;
  dataset.Open(file_name, 4);
  dataset.Open({file_name}, true, 4);

  auto x = dataset.GetCategoryFields();
  for (const auto &fields : x.second) {
@@ -97,7 +97,7 @@ TEST_F(TestShardSegment, TestReadAtPageByNameOfCategoryName) {
  std::string file_name = "./imagenet.shard01";

  ShardSegment dataset;
  dataset.Open(file_name, 4);
  dataset.Open({file_name}, true, 4);

  auto x = dataset.GetCategoryFields();
  for (const auto &fields : x.second) {
@@ -121,7 +121,7 @@ TEST_F(TestShardSegment, TestReadAtPageByIdOfCategoryId) {
  std::string file_name = "./imagenet.shard01";

  ShardSegment dataset;
  dataset.Open(file_name, 4);
  dataset.Open({file_name}, true,  4);

  auto x = dataset.GetCategoryFields();
  for (const auto &fields : x.second) {
@@ -143,7 +143,7 @@ TEST_F(TestShardSegment, TestReadAtPageByIdOfPageNo) {
  std::string file_name = "./imagenet.shard01";

  ShardSegment dataset;
  dataset.Open(file_name, 4);
  dataset.Open({file_name}, true, 4);

  auto x = dataset.GetCategoryFields();
  for (const auto &fields : x.second) {
@@ -165,7 +165,7 @@ TEST_F(TestShardSegment, TestReadAtPageByIdOfPageRows) {
  std::string file_name = "./imagenet.shard01";

  ShardSegment dataset;
  dataset.Open(file_name, 4);
  dataset.Open({file_name}, true, 4);

  auto x = dataset.GetCategoryFields();
  for (const auto &fields : x.second) {
--- a/tests/ut/cpp/mindrecord/ut_shard_writer_test.cc
+++ b/tests/ut/cpp/mindrecord/ut_shard_writer_test.cc
@@ -60,7 +60,7 @@ TEST_F(TestShardWriter, TestShardWriterOneSample) {
  std::string filename = "./OneSample.shard01";

  ShardReader dataset;
  MSRStatus ret = dataset.Open(filename, 4);
  MSRStatus ret = dataset.Open({filename}, true, 4);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

@@ -756,7 +756,7 @@ TEST_F(TestShardWriter, TestShardReaderStringAndNumberColumnInIndex) {
  filename = "./imagenet.shard01";
  auto column_list = std::vector<std::string>{"label", "file_name", "data"};
  ShardReader dataset;
  MSRStatus ret = dataset.Open(filename, 4, column_list);
  MSRStatus ret = dataset.Open({filename}, true, 4, column_list);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

@@ -842,7 +842,7 @@ TEST_F(TestShardWriter, TestShardNoBlob) {
  filename = "./imagenet.shard01";
  auto column_list = std::vector<std::string>{"label", "file_name"};
  ShardReader dataset;
  MSRStatus ret = dataset.Open(filename, 4, column_list);
  MSRStatus ret = dataset.Open({filename}, true, 4, column_list);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

@@ -936,7 +936,7 @@ TEST_F(TestShardWriter, TestShardReaderStringAndNumberNotColumnInIndex) {
  filename = "./imagenet.shard01";
  auto column_list = std::vector<std::string>{"label", "data"};
  ShardReader dataset;
  MSRStatus ret = dataset.Open(filename, 4, column_list);
  MSRStatus ret = dataset.Open({filename}, true, 4, column_list);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

@@ -1043,7 +1043,7 @@ TEST_F(TestShardWriter, TestShardWriter10Sample40Shard) {

  filename = "./TenSampleFortyShard.shard01";
  ShardReader dataset;
  MSRStatus ret = dataset.Open(filename, 4);
  MSRStatus ret = dataset.Open({filename}, true, 4);
  ASSERT_EQ(ret, SUCCESS);
  dataset.Launch();

--- a/tests/ut/cpp/pre_activate/ascend/ir_fission/topk_split_test.cc
+++ b/tests/ut/cpp/pre_activate/ascend/ir_fission/topk_split_test.cc
@@ -39,7 +39,7 @@ class MockSupportedChecker : public SupportedChecker {
 public:
  MockSupportedChecker() = default;
  ~MockSupportedChecker() override = default;
  bool CheckSupported(const AnfNodePtr &anf_node, const kernel::KernelBuildInfoPtr &select_kernel_build_info) override {
  bool CheckAiCoreSupported(const AnfNodePtr &anf_node, const kernel::KernelBuildInfoPtr &select_kernel_build_info) override {
    return true;
  }
 };  // namespace opt
--- a/tests/ut/cpp/pre_activate/ascend/ir_fusion/transpose_transdata_fusion_test.cc
+++ b/tests/ut/cpp/pre_activate/ascend/ir_fusion/transpose_transdata_fusion_test.cc
@@ -37,6 +37,15 @@ class TestHWTransposeTransdataFusion : public BackendCommon {
  UT::PyFuncGraphFetcher get_py_fun_;
 };

 // Test double for SupportedChecker that reports every queried node / build info pair as supported.
 class MockSupportedChecker : public SupportedChecker {
 public:
  MockSupportedChecker() = default;
  ~MockSupportedChecker() override = default;
  // Always returns true; both arguments are ignored.
  bool CheckAiCoreSupported(const AnfNodePtr &anf_node, const kernel::KernelBuildInfoPtr &select_kernel_build_info) override {
    return true;
  }
 };

 class MockInsertTransOpKernelSelectTrans4Dto5D : public KernelSelect {
 public:
  MockInsertTransOpKernelSelectTrans4Dto5D() = default;
@@ -60,37 +69,6 @@ class MockInsertTransOpKernelSelectTrans4Dto5D : public KernelSelect {
  }
 };

 // Test double for KernelSelect: a new build info is accepted only when it equals the single expected
 // build info (NCHW float16 input, default-format float16 output, AUTO_DIFF_KERNEL, OPAQUE, AICORE).
 class MockTransposeTransdataFusionKernelSelect : public KernelSelect {
 public:
  MockTransposeTransdataFusionKernelSelect() = default;
  ~MockTransposeTransdataFusionKernelSelect() override = default;
  bool CheckKernelAccuracySupported(const CNodePtr &kernel_node,
                                    const kernel::KernelBuildInfoPtr &new_kernel_build_info) override {
    // Describe the only build info this mock considers supported.
    kernel::KernelBuildInfo::KernelBuildInfoBuilder expected_builder;
    expected_builder.SetInputsFormat({kOpFormat_NCHW});
    expected_builder.SetOutputsFormat({kOpFormat_DEFAULT});
    expected_builder.SetInputsDeviceType({kNumberTypeFloat16});
    expected_builder.SetOutputsDeviceType({kNumberTypeFloat16});
    expected_builder.SetKernelType(KernelType::AUTO_DIFF_KERNEL);
    expected_builder.SetFusionType(kernel::FusionType::OPAQUE);
    expected_builder.SetProcessor(kernel::Processor::AICORE);
    std::vector<std::shared_ptr<kernel::KernelBuildInfo>> supported_infos;
    supported_infos.push_back(expected_builder.Build());
    MS_LOG(INFO) << "transpose transdata fusion success";
    MS_LOG(INFO) << "new transdata build info input format:" << new_kernel_build_info->GetInputFormat(0)
                 << ",outputformat:" << new_kernel_build_info->GetOutputFormat(0)
                 << ",kerneltype:" << new_kernel_build_info->kernel_type()
                 << ",fusiontype:" << new_kernel_build_info->fusion_type()
                 << ",process:" << new_kernel_build_info->processor();
    // Accept the request when any supported info compares equal to it.
    for (const auto &supported : supported_infos) {
      MS_EXCEPTION_IF_NULL(supported);
      if (*supported == *new_kernel_build_info) {
        return true;
      }
    }
    return false;
  }
 };

 TEST_F(TestHWTransposeTransdataFusion, test_transpose_transdata_fusion) {
  /*
   * def before(input0, input1):
@@ -128,7 +106,7 @@ TEST_F(TestHWTransposeTransdataFusion, test_transpose_transdata_fusion) {
  insert_trans_op_pass->kernel_select_ = std::make_shared<MockInsertTransOpKernelSelectTrans4Dto5D>();
  pm->AddPass(insert_trans_op_pass);
  auto transpose_transdata_pass = std::make_shared<opt::TransposeTransDataFusion>();
  transpose_transdata_pass->kernel_select_ = std::make_shared<MockTransposeTransdataFusionKernelSelect>();
  transpose_transdata_pass->supported_checker_ = std::make_shared<MockSupportedChecker>();
  pm->AddPass(transpose_transdata_pass);
  optimizer->AddPassManager(pm);
  FuncGraphPtr new_graph = optimizer->Optimize(kg);
--- a/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
+++ b/tests/ut/cpp/pre_activate/common/ir_fusion/allreduce_fusion_test.cc
--- a/tests/ut/python/dataset/test_minddataset.py
+++ b/tests/ut/python/dataset/test_minddataset.py
@@ -32,6 +32,8 @@ from mindspore.mindrecord import FileWriter

 # Number of partition files; the tests build names as CV_FILE_NAME + "<index>"
 # for index in range(FILES_NUM) and also pass it to FileWriter.
 FILES_NUM = 4
 # Base name of the CV MindRecord files read by the tests below.
 CV_FILE_NAME = "../data/mindrecord/imagenet.mindrecord"
 # Extra MindRecord files that individual tests write themselves and delete afterwards.
 CV1_FILE_NAME = "../data/mindrecord/imagenet1.mindrecord"
 CV2_FILE_NAME = "../data/mindrecord/imagenet2.mindrecord"
 # Directory handed to get_data() to load the raw CV samples.
 CV_DIR_NAME = "../data/mindrecord/testImageNetData"
 NLP_FILE_NAME = "../data/mindrecord/aclImdb.mindrecord"
 NLP_FILE_POS = "../data/mindrecord/testAclImdbData/pos"
@@ -111,7 +113,6 @@ def test_cv_minddataset_writer_tutorial():
        os.remove("{}".format(x))
        os.remove("{}.db".format(x))


 def test_cv_minddataset_partition_tutorial(add_and_remove_cv_file):
    """tutorial for cv minddataset."""
    columns_list = ["data", "file_name", "label"]
@@ -247,6 +248,126 @@ def test_cv_minddataset_blockreader_some_field_not_in_index_tutorial(add_and_rem
    assert num_iter == 20


 def test_cv_minddataset_reader_file_list(add_and_remove_cv_file):
    """Open MindDataset from an explicit list of file names.

    The list holds CV_FILE_NAME + "<index>" for every index in range(FILES_NUM).
    Both the reported dataset size and the number of iterated rows must be 10.
    """
    shard_files = []
    for shard_id in range(FILES_NUM):
        shard_files.append(CV_FILE_NAME + str(shard_id))
    reader = ds.MindDataset(shard_files, ["data", "file_name", "label"], 4)
    assert reader.get_dataset_size() == 10

    rows_seen = 0
    for row in reader.create_dict_iterator():
        image = row["data"]
        logger.info("-------------- cv reader basic: {} ------------------------".format(rows_seen))
        logger.info("-------------- len(item[data]): {} ------------------------".format(len(image)))
        logger.info("-------------- item[data]: {} -----------------------------".format(image))
        logger.info("-------------- item[file_name]: {} ------------------------".format(row["file_name"]))
        logger.info("-------------- item[label]: {} ----------------------------".format(row["label"]))
        rows_seen = rows_seen + 1
    assert rows_seen == 10

 def test_cv_minddataset_reader_one_partition(add_and_remove_cv_file):
    """Open MindDataset from a one-element file list (CV_FILE_NAME + "0").

    Both the reported dataset size and the number of iterated rows must be
    strictly below 10.
    """
    single_shard = [CV_FILE_NAME + "0"]
    reader = ds.MindDataset(single_shard, ["data", "file_name", "label"], 4)
    assert data_set_size_below(reader.get_dataset_size(), 10) if False else reader.get_dataset_size() < 10

    rows_seen = 0
    for row in reader.create_dict_iterator():
        image = row["data"]
        logger.info("-------------- cv reader basic: {} ------------------------".format(rows_seen))
        logger.info("-------------- len(item[data]): {} ------------------------".format(len(image)))
        logger.info("-------------- item[data]: {} -----------------------------".format(image))
        logger.info("-------------- item[file_name]: {} ------------------------".format(row["file_name"]))
        logger.info("-------------- item[label]: {} ----------------------------".format(row["label"]))
        rows_seen = rows_seen + 1
    assert rows_seen < 10

 def test_cv_minddataset_reader_two_dataset(add_and_remove_cv_file):
    """Read the fixture CV files together with two freshly written MindRecord files.

    Writes CV1_FILE_NAME and CV2_FILE_NAME (one file each) from the samples in
    CV_DIR_NAME, then opens them in a single MindDataset together with the
    FILES_NUM files named CV_FILE_NAME + "<index>". Both the reported dataset
    size and the number of iterated rows must be 30.
    """
    extra_files = [(CV1_FILE_NAME, "CV1_schema"), (CV2_FILE_NAME, "CV2_schema")]

    def remove_extra_files():
        # Delete each extra MindRecord file and its ".db" companion when present.
        for file_name, _ in extra_files:
            for path in (file_name, "{}.db".format(file_name)):
                if os.path.exists(path):
                    os.remove(path)

    # Start from a clean state in case an earlier run left files behind.
    remove_extra_files()
    try:
        for file_name, schema_name in extra_files:
            writer = FileWriter(file_name, 1)
            data = get_data(CV_DIR_NAME)
            cv_schema_json = {"id": {"type": "int32"},
                              "file_name": {"type": "string"},
                              "label": {"type": "int32"},
                              "data": {"type": "bytes"}}
            writer.add_schema(cv_schema_json, schema_name)
            writer.add_index(["file_name", "label"])
            writer.write_raw_data(data)
            writer.commit()

        columns_list = ["data", "file_name", "label"]
        num_readers = 4
        file_list = [CV_FILE_NAME + str(x) for x in range(FILES_NUM)] + [CV1_FILE_NAME, CV2_FILE_NAME]
        data_set = ds.MindDataset(file_list, columns_list, num_readers)
        assert data_set.get_dataset_size() == 30
        num_iter = 0
        for item in data_set.create_dict_iterator():
            logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
            logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
            logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
            logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
            logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
            num_iter += 1
        assert num_iter == 30
    finally:
        # Runs even when a write or an assertion above fails, so the extra
        # files never leak into later tests.
        remove_extra_files()
        
 def test_cv_minddataset_reader_two_dataset_partition(add_and_remove_cv_file):
    """Read a file list that mixes partitions produced by two different writes.

    Writes CV1_FILE_NAME as FILES_NUM partition files, then opens
    CV_FILE_NAME + "0"/"1" together with CV1_FILE_NAME + "2"/"3" in one
    MindDataset. Both the reported dataset size and the number of iterated
    rows must be strictly below 20.
    """
    paths = ["{}{}".format(CV1_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]

    def remove_partition_files():
        # Delete every CV1 partition file and its ".db" companion when present.
        for x in paths:
            for path in ("{}".format(x), "{}.db".format(x)):
                if os.path.exists(path):
                    os.remove(path)

    # Start from a clean state in case an earlier run left files behind.
    remove_partition_files()
    try:
        writer = FileWriter(CV1_FILE_NAME, FILES_NUM)
        data = get_data(CV_DIR_NAME)
        cv_schema_json = {"id": {"type": "int32"},
                          "file_name": {"type": "string"},
                          "label": {"type": "int32"},
                          "data": {"type": "bytes"}}
        writer.add_schema(cv_schema_json, "CV1_schema")
        writer.add_index(["file_name", "label"])
        writer.write_raw_data(data)
        writer.commit()

        columns_list = ["data", "file_name", "label"]
        num_readers = 4
        file_list = [CV_FILE_NAME + str(x) for x in range(2)] + [CV1_FILE_NAME + str(x) for x in range(2, 4)]
        data_set = ds.MindDataset(file_list, columns_list, num_readers)
        assert data_set.get_dataset_size() < 20
        num_iter = 0
        for item in data_set.create_dict_iterator():
            logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
            logger.info("-------------- len(item[data]): {} ------------------------".format(len(item["data"])))
            logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
            logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
            logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
            num_iter += 1
        assert num_iter < 20
    finally:
        # Existence-checked so that a failure while writing is not masked by a
        # FileNotFoundError raised from the cleanup itself.
        remove_partition_files()


 def test_cv_minddataset_reader_basic_tutorial(add_and_remove_cv_file):
    """tutorial for cv minderdataset."""
    columns_list = ["data", "file_name", "label"]
--- a/tests/ut/python/dataset/test_minddataset_exception.py
+++ b/tests/ut/python/dataset/test_minddataset_exception.py
@@ -22,6 +22,7 @@ import mindspore.dataset as ds
 from mindspore.mindrecord import FileWriter

 # MindRecord file that the tests in this module read and delete again.
 CV_FILE_NAME = "./imagenet.mindrecord"
 # Second MindRecord file, written by the create_diff_* helpers below.
 CV1_FILE_NAME = "./imagenet1.mindrecord"


 def create_cv_mindrecord(files_num):
@@ -37,6 +38,31 @@ def create_cv_mindrecord(files_num):
    writer.commit()


 def create_diff_schema_cv_mindrecord(files_num):
    """Write CV1_FILE_NAME with a schema whose string field is named "file_name_1".

    Any existing CV1_FILE_NAME and its ".db" companion are removed first. One
    sample is written, with "file_name_1" and "label" registered as index fields.

    Args:
        files_num: passed to FileWriter as its second argument.
    """
    for stale_path in (CV1_FILE_NAME, "{}.db".format(CV1_FILE_NAME)):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    record_writer = FileWriter(CV1_FILE_NAME, files_num)
    schema = {
        "file_name_1": {"type": "string"},
        "label": {"type": "int32"},
        "data": {"type": "bytes"},
    }
    sample = {
        "file_name_1": "001.jpg",
        "label": 43,
        "data": bytes('0xffsafdafda', encoding='utf-8'),
    }
    record_writer.add_schema(schema, "img_schema")
    record_writer.add_index(["file_name_1", "label"])
    record_writer.write_raw_data([sample])
    record_writer.commit()

 def create_diff_page_size_cv_mindrecord(files_num):
    """Write CV1_FILE_NAME after setting the writer's page size to 1 << 26 bytes.

    Any existing CV1_FILE_NAME and its ".db" companion are removed first. One
    sample is written, with "file_name" and "label" registered as index fields.

    Args:
        files_num: passed to FileWriter as its second argument.
    """
    for stale_path in (CV1_FILE_NAME, "{}.db".format(CV1_FILE_NAME)):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    record_writer = FileWriter(CV1_FILE_NAME, files_num)
    page_size = 1 << 26  # 64 MiB
    record_writer.set_page_size(page_size)
    schema = {
        "file_name": {"type": "string"},
        "label": {"type": "int32"},
        "data": {"type": "bytes"},
    }
    sample = {
        "file_name": "001.jpg",
        "label": 43,
        "data": bytes('0xffsafdafda', encoding='utf-8'),
    }
    record_writer.add_schema(schema, "img_schema")
    record_writer.add_index(["file_name", "label"])
    record_writer.write_raw_data([sample])
    record_writer.commit()

 def test_cv_lack_json():
    """tutorial for cv minderdataset."""
    create_cv_mindrecord(1)
@@ -111,3 +137,34 @@ def test_cv_minddataset_pk_sample_exclusive_shuffle():
    os.remove(CV_FILE_NAME)
    os.remove("{}.db".format(CV_FILE_NAME))

 def test_cv_minddataset_reader_different_schema():
    """Opening two MindRecord files with different schemas must fail.

    CV_FILE_NAME is written by create_cv_mindrecord and CV1_FILE_NAME by
    create_diff_schema_cv_mindrecord. Building and iterating a MindDataset over
    both is expected to raise an exception matching "MindRecordOp init failed".
    """
    try:
        create_cv_mindrecord(1)
        create_diff_schema_cv_mindrecord(1)
        columns_list = ["data", "label"]
        num_readers = 4
        with pytest.raises(Exception, match="MindRecordOp init failed"):
            data_set = ds.MindDataset([CV_FILE_NAME, CV1_FILE_NAME], columns_list,
                                      num_readers)
            # Drain the iterator: the failure may only surface once the
            # pipeline actually runs.
            for _ in data_set.create_dict_iterator():
                pass
    finally:
        # Runs even when the expected exception is not raised or setup fails,
        # so the generated files never leak into later tests.
        for file_name in (CV_FILE_NAME, CV1_FILE_NAME):
            for path in (file_name, "{}.db".format(file_name)):
                if os.path.exists(path):
                    os.remove(path)

 def test_cv_minddataset_reader_different_page_size():
    """Opening two MindRecord files written with different page sizes must fail.

    CV_FILE_NAME is written by create_cv_mindrecord and CV1_FILE_NAME by
    create_diff_page_size_cv_mindrecord. Building and iterating a MindDataset
    over both is expected to raise an exception matching
    "MindRecordOp init failed".
    """
    try:
        create_cv_mindrecord(1)
        create_diff_page_size_cv_mindrecord(1)
        columns_list = ["data", "label"]
        num_readers = 4
        with pytest.raises(Exception, match="MindRecordOp init failed"):
            data_set = ds.MindDataset([CV_FILE_NAME, CV1_FILE_NAME], columns_list,
                                      num_readers)
            # Drain the iterator: the failure may only surface once the
            # pipeline actually runs.
            for _ in data_set.create_dict_iterator():
                pass
    finally:
        # Runs even when the expected exception is not raised or setup fails,
        # so the generated files never leak into later tests.
        for file_name in (CV_FILE_NAME, CV1_FILE_NAME):
            for path in (file_name, "{}.db".format(file_name)):
                if os.path.exists(path):
                    os.remove(path)
--- a/tests/ut/python/dtype/test_dictionary.py
+++ b/tests/ut/python/dtype/test_dictionary.py
@@ -28,6 +28,7 @@ context.set_context(mode=context.GRAPH_MODE)
 def Xtest_arg_dict():
    class DictNet(Cell):
        """DictNet definition"""

        def __init__(self):
            super(DictNet, self).__init__()
            self.max = P.Maximum()
@@ -48,6 +49,7 @@ def Xtest_arg_dict():
 def test_const_dict():
    class DictNet(Cell):
        """DictNet1 definition"""

        def __init__(self):
            super(DictNet, self).__init__()
            self.max = P.Maximum()
@@ -58,6 +60,7 @@ def test_const_dict():
            a = self.max(self.dictionary["x"], self.dictionary["y"])
            b = self.min(self.dictionary["x"], self.dictionary["y"])
            return a + b

    net = DictNet()
    net()

@@ -65,6 +68,7 @@ def test_const_dict():
 def test_dict_set_or_get_item():
    class DictNet(Cell):
        """DictNet1 definition"""

        def __init__(self):
            super(DictNet, self).__init__()
            self.dict_ = {"x": 1, "y": 2}
@@ -91,6 +95,7 @@ def test_dict_set_or_get_item():
 def test_dict_set_or_get_item_2():
    class DictNet(Cell):
        """DictNet1 definition"""

        def __init__(self):
            super(DictNet, self).__init__()

@@ -117,6 +122,7 @@ def test_dict_set_or_get_item_2():
 def test_dict_set_or_get_item_3():
    class DictNet(Cell):
        """DictNet1 definition"""

        def __init__(self):
            super(DictNet, self).__init__()
            self.dict_ = {"x": Tensor(np.ones([2, 2, 3], np.float32)), "y": 1}
@@ -130,5 +136,3 @@ def test_dict_set_or_get_item_3():

    net = DictNet()
    assert net() == Tensor(np.ones([4, 2, 3], np.float32))


--- a/tests/ut/python/dtype/test_hypermap.py
+++ b/tests/ut/python/dtype/test_hypermap.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ============================================================================
 import numpy as np
 import pytest

 from mindspore import Tensor, context
 from mindspore.nn import Cell
--- a/tests/ut/python/exec/init.py
+++ b/tests/ut/python/exec/init.py
@@ -15,6 +15,7 @@
 """setup for pytest"""
 import mindspore.context as context


 # pylint: disable=unused-argument
 def setup_module(module):
    context.set_context(mode=context.GRAPH_MODE)
--- a/tests/ut/python/exec/resnet_example.py
+++ b/tests/ut/python/exec/resnet_example.py
@@ -16,6 +16,7 @@
 resnet50 example
 """
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.ops import operations as P
--- a/tests/ut/python/exec/test_AssignAdd.py
+++ b/tests/ut/python/exec/test_AssignAdd.py
@@ -16,19 +16,21 @@
 test assign add
 """
 import numpy as np

 import mindspore as ms
 import mindspore.context as context
 import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.common.initializer import initializer
 from mindspore import Tensor, Parameter
 import mindspore as ms
 from mindspore.common.initializer import initializer
 from mindspore.ops import operations as P
 from ..ut_filter import non_graph_engine
 from mindspore.common.api import _executor
 import mindspore.context as context
 import pytest

 context.set_context(mode=context.GRAPH_MODE)


 class Net(nn.Cell):
    """Net definition"""

    def __init__(self):
        super(Net, self).__init__()
        self.AssignAdd = P.AssignAdd()
@@ -39,18 +41,19 @@ class Net(nn.Cell):
        out = self.AssignAdd(self.inputdata, x)
        return out


@non_graph_engine
 def test_AssignAdd_1():
    """test AssignAdd 1"""
    import mindspore.context as context
    context.set_context(mode=context.GRAPH_MODE)
    net = Net()
    x = Tensor(np.ones([1]).astype(np.int64)*100)
    x = Tensor(np.ones([1]).astype(np.int64) * 100)

    print("MyPrintResult dataX:", x)
    result = net(x)
    print("MyPrintResult data::", result)
    expect = np.ones([1]).astype(np.int64)*101
    expect = np.ones([1]).astype(np.int64) * 101
    diff = result.asnumpy() - expect

    print("MyPrintExpect:", expect)
@@ -58,18 +61,19 @@ def test_AssignAdd_1():
    error = np.ones(shape=[1]) * 1.0e-3
    assert np.all(diff < error)


@non_graph_engine
 def test_AssignAdd_2():
    """test AssignAdd 2"""
    import mindspore.context as context
    context.set_context(mode=context.GRAPH_MODE)
    net = Net()
    x = Tensor(np.ones([1]).astype(np.int64)*102)
    x = Tensor(np.ones([1]).astype(np.int64) * 102)

    print("MyPrintResult dataX:", x)
    result = net(x)
    print("MyPrintResult data::", result.asnumpy())
    expect = np.ones([1]).astype(np.int64)*103
    expect = np.ones([1]).astype(np.int64) * 103
    diff = result.asnumpy() - expect

    print("MyPrintExpect:", expect)
@@ -77,8 +81,10 @@ def test_AssignAdd_2():
    error = np.ones(shape=[1]) * 1.0e-3
    assert np.all(diff < error)


 class AssignAddNet(nn.Cell):
    """Net definition"""

    def __init__(self):
        super(AssignAddNet, self).__init__()
        self.AssignAdd = P.AssignAdd()
@@ -89,9 +95,10 @@ class AssignAddNet(nn.Cell):
        z1 = self.AssignAdd(self.inputdata, self.one)
        return z1


@non_graph_engine
 def test_assignadd_scalar_cast():
    net = AssignAddNet()
    x = Tensor(np.ones([1]).astype(np.int64)*102)
    #_executor.compile(net, 1)
    x = Tensor(np.ones([1]).astype(np.int64) * 102)
    # _executor.compile(net, 1)
    result = net(x)
--- a/tests/ut/python/exec/test_activation.py
+++ b/tests/ut/python/exec/test_activation.py
@@ -14,6 +14,7 @@
 # ============================================================================
 """ test Activations """
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor
 from ..ut_filter import non_graph_engine
--- a/tests/ut/python/exec/test_assign_sub.py
+++ b/tests/ut/python/exec/test_assign_sub.py
@@ -16,15 +16,17 @@
 test assign sub
 """
 import numpy as np

 import mindspore.context as context
 import mindspore.nn as nn
 import mindspore.ops.operations as P
 from mindspore import Tensor
 import mindspore.context as context
 from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter

 context.set_context(mode=context.GRAPH_MODE)


 class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
--- a/tests/ut/python/exec/test_batchnorm.py
+++ b/tests/ut/python/exec/test_batchnorm.py
@@ -14,6 +14,7 @@
 # ============================================================================
 """ut for batchnorm layer"""
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor
 from ..ut_filter import non_graph_engine
--- a/tests/ut/python/exec/test_bias_add.py
+++ b/tests/ut/python/exec/test_bias_add.py
@@ -14,14 +14,17 @@
 # ============================================================================
 """ test BiasAdd """
 import numpy as np

 import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore.common.initializer import initializer
 from mindspore import Tensor, Parameter
 from mindspore.common.initializer import initializer
 from mindspore.ops import operations as P
 from ..ut_filter import non_graph_engine


 class Net(nn.Cell):
    """Net definition"""

    def __init__(self,
                 output_channels,
                 bias_init='zeros',
--- a/tests/ut/python/exec/test_conv.py
+++ b/tests/ut/python/exec/test_conv.py
@@ -14,6 +14,7 @@
 # ============================================================================
 """test conv"""
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor
 from ..ut_filter import non_graph_engine
@@ -25,6 +26,7 @@ out_channels = 64

 class Net(nn.Cell):
    """Net definition"""

    def __init__(self,
                 cin,
                 cout,
@@ -70,6 +72,7 @@ def test_compile2():
    output = net(input_data)
    print(output.asnumpy())


@non_graph_engine
 def test_compile3():
    net = Net(3, 1, (3, 3), weight_init='ONES')
--- a/tests/ut/python/exec/test_dense.py
+++ b/tests/ut/python/exec/test_dense.py
@@ -14,12 +14,15 @@
 # ============================================================================
 """ test Dense """
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor
 from ..ut_filter import non_graph_engine


 class Net(nn.Cell):
    """Net definition"""

    def __init__(self,
                 input_channels,
                 output_channels,
--- a/tests/ut/python/exec/test_eval.py
+++ b/tests/ut/python/exec/test_eval.py
@@ -14,11 +14,12 @@
 # ============================================================================
 """test eval"""
 import numpy as np

 import mindspore as ms
 import mindspore.nn as nn
 from mindspore.common.api import _executor
 from mindspore import Tensor
 from mindspore import context
 from mindspore.common.api import _executor
 from ..ut_filter import non_graph_engine


--- a/tests/ut/python/exec/test_flatten.py
+++ b/tests/ut/python/exec/test_flatten.py
@@ -16,8 +16,8 @@
 import numpy as np

 import mindspore.nn as nn
 from mindspore.ops import operations as P
 from mindspore import Tensor
 from mindspore.ops import operations as P
 from ..ut_filter import non_graph_engine


--- a/tests/ut/python/exec/test_pooling.py
+++ b/tests/ut/python/exec/test_pooling.py
@@ -15,12 +15,12 @@
 """
 test pooling api
 """
 import numpy as np
 import mindspore.nn as nn
 from mindspore import Tensor


 class MaxNet(nn.Cell):
    """MaxNet definition"""

    def __init__(self,
                 kernel_size,
                 stride=None):
--- a/tests/ut/python/exec/test_softmax.py
+++ b/tests/ut/python/exec/test_softmax.py
@@ -16,9 +16,11 @@
 test softmax api
 """
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor


 class Net(nn.Cell):
    def __init__(self, dim):
        super(Net, self).__init__()
--- a/tests/ut/python/exec/test_tensor_add.py
+++ b/tests/ut/python/exec/test_tensor_add.py
@@ -14,10 +14,12 @@
 # ============================================================================
 """ test TensorAdd """
 import numpy as np

 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.ops import operations as P


 class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()