!27427 Bug fixes for Ascend MindRT

Merge pull request !27427 from hwjiaorui/mindrt-bug-fix
4 years ago · 1ecbadd7a8
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc
@@ -298,8 +298,8 @@ void TbeKernelCompileManager::ParseTargetJobStatus(const nlohmann::json &json, T
      MS_LOG(EXCEPTION) << "Parse query result error.";
    }
    auto json_name = GetJsonValue<std::string>(query_result, kFusionOpName);
    auto target_job_id = query_result.at(kJobId);
    auto status = query_result.at(kStatus);
    auto target_job_id = GetJsonValue<int>(query_result, kJobId);
    auto status = GetJsonValue<std::string>(query_result, kStatus);
    auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(query_result, kProcessInfo);
    auto message = FilterExceptionMessage(all_logs);
    // save job status and exception message
@@ -354,8 +354,8 @@ void TbeKernelCompileManager::JsonAssemble(const std::string &job_type, const nl
    (*dst_json)[kJobContent] = job_info;
  } else if (job_type == kQuery) {
    nlohmann::json content;
    content[kSourceId] = src_json[kSourceId];
    content[kJobId] = src_json[kJobId];
    content[kSourceId] = GetJsonValue<int>(src_json, kSourceId);
    content[kJobId] = GetJsonValue<int>(src_json, kJobId);
    (*dst_json)[kJobContent] = content;
  } else {
    (*dst_json)[kJobContent] = src_json;
@@ -438,7 +438,8 @@ void TbeKernelCompileManager::SaveIOSizeInfo(const nlohmann::json &json, const s
  std::vector<size_t> input_size_list;
  std::vector<size_t> output_size_list;
  if (!output_nodes.empty()) {
    (void)TbeKernelBuild::GetIOSize(json[kOpList], output_nodes, &input_size_list, &output_size_list);
    (void)TbeKernelBuild::GetIOSize(GetJsonValue<nlohmann::json>(json, kOpList), output_nodes, &input_size_list,
                                    &output_size_list);
  } else {
    (void)TbeKernelBuild::GetIOSize(json, &input_size_list, &output_size_list);
  }
--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@@ -1642,6 +1642,7 @@ void FinalizeHccl() {
  (void)FinalizeBackend();
 #else
  session::ExecutorManager::Instance().Clear();
  device::DeviceContextManager::GetInstance().ClearDeviceContexts();
  device::KernelRuntimeManager::Instance().ClearRuntimeResource();
 #endif
 }
--- a/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc
@@ -255,18 +255,17 @@ bool CheckHitTargetDtype(const std::map<TypeId, TypeId> &type_map, const TypeId
 }

 bool TagRaiseReduce(const std::shared_ptr<kernel::KernelBuildInfo> &kernel_build_info, const CNodePtr &cnode,
                    const std::map<TypeId, TypeId> &type_map) {
                    const std::map<TypeId, TypeId> &type_map, bool *int64_flag) {
  // Filter out kernel info that does not support raising or reducing the data type.
  MS_EXCEPTION_IF_NULL(cnode);
  MS_EXCEPTION_IF_NULL(kernel_build_info);
  bool flag = false;
  for (size_t input_index = 0; input_index < kernel_build_info->GetInputNum(); ++input_index) {
    auto in_dtype = AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index);
    auto device_dtype = kernel_build_info->GetInputDeviceType(input_index);
    if (device_dtype == kNumberTypeFloat || device_dtype == kNumberTypeFloat32) {
      device_dtype = kNumberTypeFloat32;
    }
    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) {
    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) {
      return false;
    }
  }
@@ -278,14 +277,10 @@ bool TagRaiseReduce(const std::shared_ptr<kernel::KernelBuildInfo> &kernel_build
      device_dtype = kNumberTypeFloat32;
    }

    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, &flag)) {
    if (!CheckHitTargetDtype(type_map, in_dtype, device_dtype, int64_flag)) {
      return false;
    }
  }
  if (flag) {
    auto node_name = AnfAlgo::GetCNodeName(cnode);
    MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32.";
  }
  return true;
 }

@@ -298,10 +293,11 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
  const std::map<TypeId, TypeId> reduce_map = {{kNumberTypeInt64, kNumberTypeInt32},
                                               {kNumberTypeFloat, kNumberTypeFloat16},
                                               {kNumberTypeFloat32, kNumberTypeFloat16}};
  bool int64_reduce = false;
  // raise precision
  for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
    MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]);
    if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map)) {
    if (TagRaiseReduce(kernel_info_list[info_index], cnode, raise_map, &int64_reduce)) {
      filtered_kernel_info_list.push_back(kernel_info_list[info_index]);
    }
  }
@@ -317,7 +313,7 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
  if (context_ptr->get_param<bool>(MS_CTX_ENABLE_REDUCE_PRECISION)) {
    for (size_t info_index = 0; info_index < kernel_info_list.size(); ++info_index) {
      MS_EXCEPTION_IF_NULL(kernel_info_list[info_index]);
      if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map)) {
      if (TagRaiseReduce(kernel_info_list[info_index], cnode, reduce_map, &int64_reduce)) {
        filtered_kernel_info_list.push_back(kernel_info_list[info_index]);
      }
    }
@@ -325,6 +321,10 @@ std::vector<std::shared_ptr<kernel::KernelBuildInfo>> FilterRaisedOrReducePrecis
  if (!filtered_kernel_info_list.empty()) {
    *precision_reduce = true;
  }
  if (int64_reduce) {
    auto node_name = AnfAlgo::GetCNodeName(cnode);
    MS_LOG(WARNING) << "Operator:[" << node_name << "] don't support int64, reduce precision from int64 to int32.";
  }
  return filtered_kernel_info_list;
 }

--- a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc
@@ -292,5 +292,15 @@ std::string FetchActorName(KernelTransformType kernel_type, const std::string &a
  }
  return actor_name;
 }

 // Tells whether the source and destination device tensors agree on device type, format and type id.
 // The destination is validated with MS_EXCEPTION_IF_NULL; a null source simply yields false.
 bool CheckMemcpyInDevice(const DeviceTensor *dst_device_addr, const DeviceTensor *src_device_addr) {
  MS_EXCEPTION_IF_NULL(dst_device_addr);
  if (src_device_addr == nullptr) {
    return false;
  }
  // Compare one attribute at a time and stop at the first mismatch.
  const bool same_device_type = (src_device_addr->DeviceType() == dst_device_addr->DeviceType());
  if (!same_device_type) {
    return false;
  }
  const bool same_format = (src_device_addr->format() == dst_device_addr->format());
  if (!same_format) {
    return false;
  }
  return src_device_addr->type_id() == dst_device_addr->type_id();
 }
 }  // namespace runtime
 }  // namespace mindspore
--- a/mindspore/ccsrc/runtime/framework/actor/actor_common.h
+++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.h
@@ -210,6 +210,8 @@ KernelTransformType FetchKernelTransformType(const AnfNodePtr &node, const Kerne
                                             GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
 std::string FetchActorName(KernelTransformType kernel_type, const std::string &actor_set_name,
                           const AnfNodePtr &node = nullptr, const KernelGraphPtr &graph = nullptr);

 // Returns true when src_device_tensor is non-null and both tensors report the same
 // DeviceType(), format() and type_id(); returns false otherwise.
 bool CheckMemcpyInDevice(const DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor);
 }  // namespace runtime
 }  // namespace mindspore

--- a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc
@@ -239,10 +239,14 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
    auto tensor_device_address = std::dynamic_pointer_cast<DeviceTensor>(host_tensor->device_address());
    // Sync data from host_tensor_device_address to device_tensor.
    if (tensor_device_address != nullptr) {
      if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) {
        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed.");
      if (CheckMemcpyInDevice(device_tensor, tensor_device_address.get())) {
        if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) {
          SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed.");
        }
        continue;
      } else {
        host_tensor->data_sync(false);
      }
      continue;
    }

    // Sync data from host_tensor to device_tensor.
--- a/mindspore/ccsrc/runtime/framework/actor/output_actor.cc
+++ b/mindspore/ccsrc/runtime/framework/actor/output_actor.cc
@@ -109,7 +109,7 @@ TensorPtr OutputActor::CreateOutputTensor(const AnfNodePtr &output_node, size_t
  const auto &device_tensor = AnfAlgo::GetMutableOutputAddr(output_node, output_index, false);
  MS_EXCEPTION_IF_NULL(device_tensor);
  // In the input as output scenario, use the device tensor of node.
  if (output_node->isa<ValueNode>() || output_node->isa<Parameter>()) {
  if (IsPersistentDeviceTensor(output_node)) {
    tensor->set_device_address(device_tensor);
    return tensor;
  }
@@ -151,7 +151,7 @@ void OutputActor::UpdateOutputDeviceAddress() {
    auto output_index = output_nodes_[i].second;
    auto &tensor = outputs_[i];
    // In the input as output scenario, the output device tensor may come from the input tensor and can't be replaced.
    if ((output_node == nullptr) || output_node->isa<ValueNode>() || output_node->isa<Parameter>()) {
    if ((output_node == nullptr) || IsPersistentDeviceTensor(output_node)) {
      continue;
    }

--- a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc
+++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.cc
@@ -28,6 +28,7 @@
 #include "runtime/hardware/ascend/ascend_graph_optimization.h"
 #include "backend/kernel_compiler/ascend_kernel_mod.h"
 #include "runtime/device/ascend/ascend_bucket.h"
 #include "common/util/error_manager/error_manager.h"

 #ifndef ENABLE_SECURITY
 #include "debug/data_dump/dump_json_parser.h"
@@ -65,6 +66,7 @@ namespace ascend {
 using KernelGraph = mindspore::session::KernelGraph;
 const char kMsVm[] = "vm";
 constexpr size_t kAtomicCleanInputSize = 2;
 constexpr auto kUnknowErrorString = "Unknown error occurred";
 namespace {
 CNodePtr GetNextLabelSet(const std::vector<CNodePtr> &kernel_nodes, uint32_t index) {
  size_t node_sizes = kernel_nodes.size();
@@ -582,10 +584,30 @@ bool AscendDeviceContext::LaunchGraph(const KernelGraphPtr &graph) const {
  runtime_instance_->SetContext();
  device::KernelAdjust::GetInstance().LoadDeviceLoopCtrlParameters(graph);
  auto ret = ExecuteGraph(graph);
  if (!ret) {
    MS_LOG(ERROR) << "run task error!";
    ReportErrorMessage();
    return ret;
  }
  ReportWarningMessage();
  MS_LOG(INFO) << "Status record: end launch graph. graph id: " << graph->graph_id();
  return ret;
 }

 void AscendDeviceContext::ReportErrorMessage() const {
  const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
  if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
    MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
  }
 }

 void AscendDeviceContext::ReportWarningMessage() const {
  const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
  if (!warning_message.empty()) {
    MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
  }
 }

 bool AscendDeviceContext::SyncStream(size_t stream_id) const {
  MS_EXCEPTION_IF_NULL(runtime_instance_);
  return runtime_instance_->SyncStream();
@@ -597,7 +619,9 @@ bool AscendDeviceContext::IsExecutingSink(const KernelGraphPtr &graph) const {
  return ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && IsGraphMode();
 }

 // Returns the result of IsGraphMode(); the graph argument is not consulted.
 bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const { return IsGraphMode(); }
 // Loop-count sink is reported only when device::KernelAdjust::NeedLoopSink() holds and
 // IsGraphMode() is true. The graph argument is not consulted.
 bool AscendDeviceContext::IsLoopCountSink(const KernelGraphPtr &graph) const {
  if (!device::KernelAdjust::NeedLoopSink()) {
    // Graph mode is not queried when loop sink is not needed.
    return false;
  }
  return IsGraphMode();
 }

 // kernel by kernel mode interface
 void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) const {
--- a/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h
+++ b/mindspore/ccsrc/runtime/hardware/ascend/ascend_device_context.h
@@ -138,6 +138,9 @@ class AscendDeviceContext : public DeviceContext {
  static bool IsGraphMode();
  bool SyncRuning() const;

  // Logs the ErrorManager error message at ERROR level; skipped when the message is empty
  // or contains the "Unknown error occurred" marker.
  void ReportErrorMessage() const;
  // Logs the ErrorManager warning message at WARNING level when it is non-empty.
  void ReportWarningMessage() const;

  // Kernel Runtime  --- only for task sink
  AscendKernelRuntime *runtime_instance_{nullptr};
  std::shared_ptr<MemoryManager> mem_manager_{nullptr};