| @@ -68,7 +68,6 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, | |||
| MS_LOG(EXCEPTION) << "anf[" << anf->DebugString() << "] is not a parameter"; | |||
| } | |||
| auto valid_inputs = graph->MutableValidInputs(); | |||
| MS_EXCEPTION_IF_NULL(valid_inputs); | |||
| auto graph_inputs = graph->MutableInputs(); | |||
| MS_EXCEPTION_IF_NULL(graph_inputs); | |||
| TraceManager::DebugTrace(std::make_shared<TraceCopy>(anf->debug_info())); | |||
| @@ -113,7 +113,10 @@ class RunOpsInGraphTask : public Task { | |||
| class RunOpTask : public Task { | |||
| public: | |||
| RunOpTask() { type_ = kRunOp; } | |||
| ~RunOpTask() override = default; | |||
| ~RunOpTask() override { | |||
| op_run_info_ = nullptr; | |||
| input_tensors_ = nullptr; | |||
| } | |||
| void Run() override; | |||
| OpRunInfo *op_run_info_{nullptr}; | |||
| GraphInfo graph_info_; | |||
| @@ -26,7 +26,7 @@ class Executor; | |||
| class ExecutorManager { | |||
| public: | |||
| static ExecutorManager &Instance() { | |||
| static ExecutorManager instance; | |||
| static ExecutorManager instance{}; | |||
| return instance; | |||
| } | |||
| std::shared_ptr<Executor> GetExecutor(const std::string &device_name, uint32_t device_id); | |||
| @@ -349,8 +349,8 @@ class KernelGraph : public FuncGraph { | |||
| bool HasPostGraph() const { return !post_graphs_.empty(); } | |||
| void IncPreGraphFinishedCount() { pre_graph_finished_count_++; } | |||
| void IncPostGraphFinishedCount() { post_graph_finished_count_++; } | |||
| void IncPreGraphFinishedCount() { ++pre_graph_finished_count_; } | |||
| void IncPostGraphFinishedCount() { ++post_graph_finished_count_; } | |||
| void ResetGraphRunningStatus() { | |||
| first_step_ = false; | |||
| post_graph_finished_count_ = 0; | |||
| @@ -232,13 +232,24 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||
| virtual void UnifyMindIR(const KernelGraphPtr &graph); | |||
| virtual void FinalOptimize(const KernelGraphPtr &graph) const; | |||
| virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; } | |||
| virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; } | |||
| virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr>) { return kInvalidGraphId; } | |||
| virtual void BuildGraphImpl(GraphId) {} | |||
| virtual void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, | |||
| const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {} | |||
| const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| MS_EXCEPTION_IF_NULL(outputs); | |||
| MS_LOG(INFO) << "Call default PreExecuteGraph with input size: " << inputs.size(); | |||
| } | |||
| virtual void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, | |||
| const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {} | |||
| virtual void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {} | |||
| const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| MS_EXCEPTION_IF_NULL(outputs); | |||
| MS_LOG(INFO) << "Call default PostExecuteGraph with input size: " << inputs.size(); | |||
| } | |||
| virtual void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); } | |||
| void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs); | |||
| virtual KernelGraphPtr BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info, | |||
| const std::vector<tensor::TensorPtr> &input_tensors, | |||
| @@ -275,7 +286,11 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> { | |||
| virtual void ExecuteAllTaskInQueue() {} | |||
| virtual void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, | |||
| const std::vector<tensor::TensorPtr> &inputs_const) const {} | |||
| const std::vector<tensor::TensorPtr> &inputs_const) const { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| MS_LOG(INFO) << "Call default LoadInputData with input size: " << inputs_const.size(); | |||
| } | |||
| void UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_graph, VectorRef *const outputs, | |||
| const std::vector<tensor::TensorPtr> &input_tensors, | |||
| std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) const; | |||
| @@ -33,7 +33,7 @@ | |||
| namespace mindspore { | |||
| namespace common { | |||
| enum Status { FAIL = -1, SUCCESS = 0 }; | |||
| using Task = std::function<int()>; | |||
| using Task = std::function<Status()>; | |||
| struct ThreadContext { | |||
| std::mutex mutex; | |||
| @@ -48,7 +48,7 @@ class ThreadPool { | |||
| ThreadPool &operator=(const ThreadPool &) = delete; | |||
| static ThreadPool &GetInstance(); | |||
| bool SyncRun(const std::vector<Task> &tasks); | |||
| size_t GetSyncRunThreadNum() { return max_thread_num_; } | |||
| size_t GetSyncRunThreadNum() const { return max_thread_num_; } | |||
| void ClearThreadPool(); | |||
| private: | |||
| @@ -172,7 +172,7 @@ struct Address { | |||
| }; | |||
| using AddressPtr = std::shared_ptr<Address>; | |||
| using AddressPtrList = std::vector<AddressPtr>; | |||
| using StreamType = void *; | |||
| // The memory info of kernel launch. | |||
| struct KernelLaunchInfo { | |||
| AddressPtrList inputs_; | |||
| @@ -215,8 +215,8 @@ class KernelMod { | |||
| const std::vector<AddressPtr> &GetInputsAddr() { return inputs_addr_; } | |||
| const std::vector<AddressPtr> &GetWorkSpacesAddr() { return workspaces_addr_; } | |||
| const std::vector<AddressPtr> &GetOutputsAddr() { return outputs_addr_; } | |||
| void SetStream(void *stream) { stream_ = stream; } | |||
| void *GetStream() const { return stream_; } | |||
| void set_stream(StreamType stream) { stream_ = stream; } | |||
| StreamType stream() const { return stream_; } | |||
| void SetAtomicCleanNodes(const std::vector<CNodePtr> &atomic_clean_node); | |||
| protected: | |||
| @@ -226,7 +226,7 @@ class KernelMod { | |||
| std::string unique_name_; | |||
| std::string fullname_; | |||
| bool is_monad_{false}; | |||
| void *stream_{nullptr}; | |||
| StreamType stream_{nullptr}; | |||
| AnfNodeWeakPtr anf_node_; | |||
| std::map<uint32_t, tensor::TensorPtr> depend_tensor_map_; | |||
| std::vector<CNodeWeakPtr> atomic_clean_nodes_; | |||
| @@ -765,9 +765,9 @@ void AscendKernelRuntime::SetKernelModStream(const std::vector<CNodePtr> &kernel | |||
| MS_LOG(EXCEPTION) << "create communication stream failed, ret:" << ret; | |||
| } | |||
| stream_id_map_[stream_id] = stream; | |||
| ascend_kernel_mod->SetStream(stream); | |||
| ascend_kernel_mod->set_stream(stream); | |||
| } else { | |||
| ascend_kernel_mod->SetStream(iter->second); | |||
| ascend_kernel_mod->set_stream(iter->second); | |||
| } | |||
| if (stream_id > 0) { | |||
| last_kernel[stream_id_map_[stream_id]] = i; | |||
| @@ -112,7 +112,7 @@ void ProfilingReporter::ReportStepPoint(const std::vector<std::shared_ptr<StepPo | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| // This function should report all tags; the data will be saved to ts_track.data.<device_id>.slice_<index> | |||
| // The first step index is set to 1, kept consistent with GE | |||
| rtProfilerTraceEx(1, graph_id_, point->tag(), kernel_mod->GetStream()); | |||
| rtProfilerTraceEx(1, graph_id_, point->tag(), kernel_mod->stream()); | |||
| MS_LOG(INFO) << "Report step point, graph id: " << graph_id_ << ", op name: " << point->op_name() | |||
| << ", stream id: " << GetStreamId(op_name) << ", task id: " << GetTaskId(op_name) | |||
| @@ -765,7 +765,7 @@ void *AscendDeviceContext::GetKernelStream(const CNodePtr &node) const { | |||
| if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) { | |||
| return compute_stream_; | |||
| } else { | |||
| auto stream = kernel_mod->GetStream(); | |||
| auto stream = kernel_mod->stream(); | |||
| if (stream == nullptr) { | |||
| stream = compute_stream_; | |||
| MS_LOG(INFO) << "Assign default compute stream for node " << node->fullname_with_scope(); | |||
| @@ -15,7 +15,6 @@ | |||
| */ | |||
| #include "plugin/device/cpu/hal/device/cpu_memory_manager.h" | |||
| #include <memory> | |||
| #include "backend/common/session/anf_runtime_algorithm.h" | |||
| #include "utils/ms_context.h" | |||
| #include "utils/convert_utils.h" | |||
| @@ -38,8 +38,8 @@ static constexpr float kEps = 1e-30; | |||
| void FusedAdaFactorCpuKernelMod::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| NativeCpuKernelMod::InitInputOutputSize(kernel_node); | |||
| (void)workspace_size_list_.emplace_back(elem_num_ * kSizeFloat32); | |||
| (void)workspace_size_list_.emplace_back(elem_num_ / last_row_dim_size_ * kSizeFloat32); | |||
| (void)workspace_size_list_.emplace_back(elem_num_ / last_col_dim_size_ * kSizeFloat32); | |||
| (void)workspace_size_list_.emplace_back((elem_num_ / last_row_dim_size_) * kSizeFloat32); | |||
| (void)workspace_size_list_.emplace_back((elem_num_ / last_col_dim_size_) * kSizeFloat32); | |||
| } | |||
| void FusedAdaFactorCpuKernelMod::InitKernel(const CNodePtr &kernel_node) { | |||
| @@ -156,7 +156,7 @@ void FusedAdaFactorCpuKernelMod::FactorUpdate(float *update, const std::vector<A | |||
| task = [&](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; ++i) { | |||
| float row_reduce = 0; | |||
| size_t reduce_start = i / row_dim_size * last_row_col_size + i % row_dim_size; | |||
| size_t reduce_start = (i / row_dim_size) * last_row_col_size + i % row_dim_size; | |||
| for (size_t j = 0; j < col_dim_size; ++j) { | |||
| row_reduce += update[reduce_start + j * row_dim_size]; | |||
| } | |||
| @@ -157,7 +157,7 @@ void ScatterUpdateCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inpu | |||
| size_t once_compute_size = (num_units_ + max_thread_num - 1) / max_thread_num; | |||
| while (start < num_units_) { | |||
| size_t end = (start + once_compute_size) > num_units_ ? num_units_ : (start + once_compute_size); | |||
| auto task = [¶ms, start, end]() -> int { | |||
| auto task = [¶ms, start, end]() { | |||
| Compute<T>(¶ms, start, end); | |||
| return common::SUCCESS; | |||
| }; | |||
| @@ -52,10 +52,10 @@ void ComputeFtrl(MultiThreadComputeParams<T> *input_params, size_t start, size_t | |||
| float y; | |||
| if (lr_power == -0.5) { | |||
| y = std::sqrt(accum_new); | |||
| linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j]; | |||
| linear[j] += (summed_grad - (y - std::sqrt(accum[j])) / lr) * var[j]; | |||
| } else { | |||
| y = std::pow(accum_new, -lr_power); | |||
| linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j]; | |||
| linear[j] += (summed_grad - (y - std::pow(accum[j], -lr_power)) / lr) * var[j]; | |||
| } | |||
| accum[j] = accum_new; | |||
| auto x = Sign(linear[j]) * l1 - linear[j]; | |||
| @@ -147,7 +147,8 @@ void StridedSliceCpuKernelMod::InitSliceParam(const CNodePtr &kernel_node, std:: | |||
| slice_param_.num_axes_ = DIMENSION_8D; | |||
| } | |||
| int StridedSliceCpuKernelMod::RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int start_pos) { | |||
| common::Status StridedSliceCpuKernelMod::RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, | |||
| int start_pos) { | |||
| int begin_index = slice_param_.begins_[split_axis_]; | |||
| int inner_size = inner_ * data_size_; | |||
| const uint8_t *cur_in_ptr = input_addr + (start_pos * input_shape_[split_axis_] + begin_index) * inner_size; | |||
| @@ -162,7 +163,8 @@ int StridedSliceCpuKernelMod::RunTaskOnOuter(const uint8_t *input_addr, uint8_t | |||
| return common::SUCCESS; | |||
| } | |||
| int StridedSliceCpuKernelMod::RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int start_pos) { | |||
| common::Status StridedSliceCpuKernelMod::RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, | |||
| int start_pos) { | |||
| int begin_index = slice_param_.begins_[split_axis_]; | |||
| int inner_size = inner_ * data_size_; | |||
| const uint8_t *cur_in_ptr = input_addr + (start_pos * slice_param_.strides_[split_axis_] + begin_index) * inner_size; | |||
| @@ -179,7 +181,7 @@ int StridedSliceCpuKernelMod::RunTaskOnSplitAxis(const uint8_t *input_addr, uint | |||
| void StridedSliceCpuKernelMod::ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread_num) { | |||
| int thread_index = 0; | |||
| std::vector<common::Task> tasks; | |||
| std::function<int(StridedSliceCpuKernelMod *, const uint8_t *, uint8_t *, int)> execute_func; | |||
| std::function<common::Status(StridedSliceCpuKernelMod *, const uint8_t *, uint8_t *, int)> execute_func; | |||
| if (parallel_strategy_ == kOnOuter) { | |||
| execute_func = &StridedSliceCpuKernelMod::RunTaskOnOuter; | |||
| } else if (parallel_strategy_ == kOnSplitAxis) { | |||
| @@ -42,8 +42,8 @@ class StridedSliceCpuKernelMod : public NativeCpuKernelMod { | |||
| bool MatchParallelPattern(); | |||
| void InitParallelParam(); | |||
| void ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread_num); | |||
| int RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| int RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| common::Status RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| common::Status RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| void ParseMasks(const CNodePtr &kernel_node); | |||
| TypeId dtype_; | |||
| @@ -99,7 +99,7 @@ void UniqueCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs, con | |||
| params->workspace_idx_ = reinterpret_cast<IndexType *>(workspace[2]->addr); | |||
| params->output_ = reinterpret_cast<DataType *>(outputs[0]->addr); | |||
| params->inverse_idx_ = reinterpret_cast<IndexType *>(outputs[1]->addr); | |||
| params->input_size_ = static_cast<IndexType>(input_size_); | |||
| params->input_size_ = input_size_; | |||
| params->output_size_ = 0; | |||
| params->thread_num_ = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); | |||
| @@ -36,8 +36,8 @@ struct UniqueParam { | |||
| IndexType *inverse_idx_{nullptr}; | |||
| DataType *workspace_{nullptr}; | |||
| IndexType *workspace_idx_{nullptr}; | |||
| IndexType input_size_{0}; | |||
| IndexType output_size_{0}; | |||
| size_t input_size_{0}; | |||
| size_t output_size_{0}; | |||
| size_t thread_num_{0}; | |||
| bool need_sort_{true}; | |||
| }; | |||
| @@ -48,15 +48,14 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| ~UniqueCpuKernelMod() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| template <typename DataType, typename IndexType> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| protected: | |||
| size_t input_size_{0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| size_t output_size_{0}; | |||
| @@ -64,16 +63,20 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| CNodeWeakPtr node_wpt_; | |||
| template <typename DataType> | |||
| static size_t BucketId(DataType data, size_t bucket_num) { | |||
| static size_t BucketId(DataType input, size_t bucket_num) { | |||
| if (input < 0) { | |||
| input = -input; | |||
| } | |||
| size_t data = static_cast<size_t>(input); | |||
| if (bucket_num < 1) { | |||
| return static_cast<size_t>(data); | |||
| return data; | |||
| } | |||
| return static_cast<size_t>(data) % bucket_num; | |||
| return data % bucket_num; | |||
| } | |||
| template <typename DataType, typename IndexType> | |||
| static void CalculateEachBucketSize(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||
| std::vector<IndexType> *each_bucket_size) { | |||
| std::vector<size_t> *each_bucket_size) { | |||
| MS_EXCEPTION_IF_NULL(params); | |||
| MS_EXCEPTION_IF_NULL(params->input_); | |||
| MS_EXCEPTION_IF_NULL(each_bucket_size); | |||
| @@ -81,17 +84,16 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| if (params->input_size_ < 1) { | |||
| return; | |||
| } | |||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||
| for (size_t i = 0; i < params->input_size_; ++i) { | |||
| auto bucket_id = BucketId(params->input_[i], bucket_num); | |||
| each_bucket_size->at(bucket_id)++; | |||
| } | |||
| } | |||
| template <typename DataType, typename IndexType> | |||
| static void SplitAndCalculateBucketSize( | |||
| const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr, | |||
| std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr) { | |||
| static void SplitAndCalculateBucketSize(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr, | |||
| std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) { | |||
| MS_EXCEPTION_IF_NULL(params); | |||
| MS_EXCEPTION_IF_NULL(params->input_); | |||
| MS_EXCEPTION_IF_NULL(segments_ptr); | |||
| @@ -99,21 +101,21 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| auto &segments = *segments_ptr; | |||
| auto &segment_bucket_sizes = *segment_bucket_sizes_ptr; | |||
| IndexType input_size = params->input_size_; | |||
| size_t input_size = params->input_size_; | |||
| size_t thread_num = params->thread_num_; | |||
| if (thread_num < 1) { | |||
| MS_LOG(EXCEPTION) << "For 'Unique', thread num should be greater than 0, but got " << thread_num; | |||
| } | |||
| IndexType thread_data_size = input_size / thread_num; | |||
| size_t thread_data_size = input_size / thread_num; | |||
| size_t left_data_size = input_size % thread_num; | |||
| segments.reserve(thread_num); | |||
| segment_bucket_sizes.reserve(thread_num); | |||
| IndexType current_offset = 0; | |||
| size_t current_offset = 0; | |||
| std::vector<common::Task> tasks; | |||
| tasks.reserve(thread_num); | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| (void)segment_bucket_sizes.emplace_back(std::make_shared<std::vector<IndexType>>(thread_num, 0)); | |||
| IndexType data_size = thread_data_size; | |||
| (void)segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(thread_num, 0)); | |||
| size_t data_size = thread_data_size; | |||
| if (i < left_data_size) { | |||
| data_size += 1; | |||
| } | |||
| @@ -132,18 +134,17 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| } | |||
| template <typename DataType, typename IndexType> | |||
| static void SegmentToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> &segment, | |||
| IndexType segment_offset, | |||
| static void SegmentToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> &segment, size_t segment_offset, | |||
| const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) { | |||
| MS_LOG(DEBUG) << "Start"; | |||
| MS_EXCEPTION_IF_NULL(segment); | |||
| MS_EXCEPTION_IF_NULL(segment->input_); | |||
| std::vector<IndexType> bucket_data_num(segment->thread_num_, 0); | |||
| std::vector<size_t> bucket_data_num(segment->thread_num_, 0); | |||
| auto bucket_size = buckets.size(); | |||
| if (segment->input_size_ < 1) { | |||
| return; | |||
| } | |||
| for (IndexType i = 0; i < segment->input_size_; ++i) { | |||
| for (size_t i = 0; i < segment->input_size_; ++i) { | |||
| DataType data = segment->input_[i]; | |||
| auto bucket_id = BucketId(data, segment->thread_num_); | |||
| auto bucket_index = bucket_data_num[bucket_id]; | |||
| @@ -160,7 +161,7 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| continue; | |||
| } | |||
| bucket->input_[bucket_index] = data; | |||
| bucket->workspace_idx_[bucket_index] = segment_offset + i; | |||
| bucket->workspace_idx_[bucket_index] = static_cast<IndexType>(segment_offset + i); | |||
| bucket_data_num[bucket_id]++; | |||
| } | |||
| MS_LOG(DEBUG) << "End"; | |||
| @@ -169,7 +170,7 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| template <typename DataType, typename IndexType> | |||
| static void GatherSegmentsToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> ¶ms, | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr, | |||
| std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr, | |||
| std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr, | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *buckets_ptr) { | |||
| MS_LOG(DEBUG) << "Start"; | |||
| MS_EXCEPTION_IF_NULL(params); | |||
| @@ -186,14 +187,14 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| auto &buckets = *buckets_ptr; | |||
| auto thread_num = segments.size(); | |||
| buckets.reserve(thread_num); | |||
| std::vector<IndexType> bucket_data_size(thread_num, 0); | |||
| std::vector<size_t> bucket_data_size(thread_num, 0); | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| for (size_t j = 0; j < thread_num; ++j) { | |||
| bucket_data_size[j] += segment_bucket_sizes[i]->at(j); | |||
| } | |||
| } | |||
| IndexType current_offset = 0; | |||
| size_t current_offset = 0; | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>(); | |||
| bucket->input_ = params->output_ + current_offset; | |||
| @@ -205,7 +206,7 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| current_offset += bucket_data_size[i]; | |||
| (void)buckets.emplace_back(bucket); | |||
| } | |||
| std::vector<IndexType> tmp_bucket_data_size(thread_num, 0); | |||
| std::vector<size_t> tmp_bucket_data_size(thread_num, 0); | |||
| std::vector<std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>>> thread_buckets; | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> local_buckets; | |||
| @@ -251,14 +252,14 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| if (params->input_size_ < 1) { | |||
| return; | |||
| } | |||
| if (params->need_sort_) { | |||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||
| input_idx[i] = i; | |||
| if (params->need_sort_ && !std::is_same<DataType, float>::value) { | |||
| for (size_t i = 0; i < params->input_size_; ++i) { | |||
| input_idx[i] = static_cast<IndexType>(i); | |||
| } | |||
| std::sort(input_idx, input_idx + params->input_size_, | |||
| [&](IndexType left, IndexType right) { return input[left] < input[right]; }); | |||
| [&](size_t left, size_t right) { return input[left] < input[right]; }); | |||
| DataType last = input[0]; | |||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||
| for (size_t i = 0; i < params->input_size_; ++i) { | |||
| auto curr = input[input_idx[i]]; | |||
| if (i == 0 || curr != last) { | |||
| if (i != 0) { | |||
| @@ -271,11 +272,11 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| inverse_idx[input_idx[i]] = j; | |||
| } | |||
| } | |||
| params->output_size_ = j + 1; | |||
| params->output_size_ = static_cast<size_t>(j + 1); | |||
| } else { | |||
| std::unordered_map<DataType, IndexType> uniq; | |||
| uniq.reserve(params->input_size_); | |||
| for (IndexType i = 0; i < params->input_size_; ++i) { | |||
| for (size_t i = 0; i < params->input_size_; ++i) { | |||
| auto it = uniq.emplace(input[i], j); | |||
| inverse_idx[i] = it.first->second; | |||
| if (it.second) { | |||
| @@ -285,7 +286,7 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| for (const auto &it : uniq) { | |||
| output[it.second] = it.first; | |||
| } | |||
| params->output_size_ = j; | |||
| params->output_size_ = static_cast<size_t>(j); | |||
| } | |||
| MS_LOG(DEBUG) << "End"; | |||
| } | |||
| @@ -310,7 +311,7 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| template <typename DataType, typename IndexType> | |||
| static void TransformBucketReverseIndices(const std::shared_ptr<UniqueParam<DataType, IndexType>> &bucket, | |||
| const std::shared_ptr<UniqueParam<DataType, IndexType>> &result, | |||
| IndexType offset) { | |||
| size_t offset) { | |||
| MS_EXCEPTION_IF_NULL(bucket); | |||
| MS_EXCEPTION_IF_NULL(bucket->inverse_idx_); | |||
| MS_EXCEPTION_IF_NULL(bucket->workspace_idx_); | |||
| @@ -319,10 +320,14 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| if (bucket->input_size_ < 1) { | |||
| return; | |||
| } | |||
| for (IndexType i = 0; i < bucket->input_size_; ++i) { | |||
| for (size_t i = 0; i < bucket->input_size_; ++i) { | |||
| auto origin_idx = bucket->workspace_idx_[i]; | |||
| if (origin_idx >= 0 && origin_idx < result->input_size_) { | |||
| result->inverse_idx_[origin_idx] = bucket->inverse_idx_[i] + offset; | |||
| if (origin_idx < 0) { | |||
| continue; | |||
| } | |||
| size_t index = static_cast<size_t>(origin_idx); | |||
| if (index < result->input_size_) { | |||
| result->inverse_idx_[index] = bucket->inverse_idx_[i] + offset; | |||
| } | |||
| } | |||
| } | |||
| @@ -334,8 +339,8 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| MS_EXCEPTION_IF_NULL(result); | |||
| MS_EXCEPTION_IF_NULL(result->output_); | |||
| size_t thread_num = buckets.size(); | |||
| std::vector<IndexType> bucket_offsets(thread_num); | |||
| IndexType current_size = 0; | |||
| std::vector<size_t> bucket_offsets(thread_num); | |||
| size_t current_size = 0; | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| auto bucket = buckets[i]; | |||
| MS_EXCEPTION_IF_NULL(bucket); | |||
| @@ -368,7 +373,7 @@ class UniqueCpuKernelMod : public NativeCpuKernelMod { | |||
| MS_EXCEPTION_IF_NULL(params); | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> segments; | |||
| std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> buckets; | |||
| std::vector<std::shared_ptr<std::vector<IndexType>>> segment_bucket_sizes; | |||
| std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes; | |||
| SplitAndCalculateBucketSize(params, &segments, &segment_bucket_sizes); | |||
| GatherSegmentsToBuckets(params, &segments, &segment_bucket_sizes, &buckets); | |||
| UniqueEachBucket(buckets); | |||
| @@ -1483,7 +1483,7 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph &graph, const AnfNod | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| KernelLaunchInfo kernel_launch_info; | |||
| auto stream = kernel_mod->GetStream(); | |||
| auto stream = kernel_mod->stream(); | |||
| if (stream == nullptr) { | |||
| if (AnfAlgo::IsCommunicationOp(kernel)) { | |||
| stream = communication_stream_; | |||
| @@ -27,6 +27,8 @@ namespace mindspore { | |||
| namespace device { | |||
| class MemHandler { | |||
| public: | |||
| MemHandler() = default; | |||
| virtual ~MemHandler() = default; | |||
| virtual size_t GetAvailableMemSize() = 0; | |||
| virtual void *MallocDevice(size_t mem_size) = 0; | |||
| virtual void FreeDevice(void *ptr) = 0; | |||
| @@ -18,12 +18,12 @@ | |||
| namespace mindspore { | |||
| MsException &MsException::Instance() { | |||
| static MsException instance; | |||
| static MsException instance{}; | |||
| return instance; | |||
| } | |||
| StaticAnalysisException &StaticAnalysisException::Instance() { | |||
| static StaticAnalysisException instance; | |||
| static StaticAnalysisException instance{}; | |||
| return instance; | |||
| } | |||
| } // namespace mindspore | |||