| @@ -97,8 +97,8 @@ void DynamicReshapeKernel::Execute() { | |||
| size_t input_size_byte = LongToSize(arr_prod) * abstract::TypeIdSize(type_x); | |||
| auto output_addr = AnfAlgo::GetOutputAddr(cnode, 0); | |||
| MS_EXCEPTION_IF_NULL(output_addr); | |||
| if (!output_addr->SyncDeviceToDevice(output_shapes, input_size_byte, address_x->type_id(), address_x->GetPtr(), | |||
| address_x->format())) { | |||
| if (!output_addr->SyncDeviceToDeviceWithSameFormatType(output_shapes, input_size_byte, address_x->type_id(), | |||
| address_x->GetPtr(), address_x->format())) { | |||
| MS_LOG(EXCEPTION) << "Host Reshape sync device to device failed."; | |||
| } | |||
| MS_LOG(INFO) << "Execute host ReshapeKernel End"; | |||
| @@ -79,6 +79,12 @@ void SyncMemory(void *dst, const void *src, uint64_t size, aclrtMemcpyKind kind) | |||
| if (size == 0) { | |||
| return; | |||
| } | |||
| if (dst == nullptr) { | |||
| MS_LOG(EXCEPTION) << "dst ptr is null, please check the address is set correctly."; | |||
| } | |||
| if (src == nullptr) { | |||
| MS_LOG(EXCEPTION) << "src ptr is null, please check the address is set correctly."; | |||
| } | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | |||
| @@ -433,8 +439,8 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size | |||
| return sync_ok; | |||
| } | |||
| bool AscendDeviceAddress::SyncDeviceToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const { | |||
| bool AscendDeviceAddress::SyncDeviceToDeviceWithSameFormatType(const ShapeVector &shape, size_t size, TypeId type, | |||
| const void *src_ptr, const std::string &format) const { | |||
| if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { | |||
| return true; | |||
| } | |||
| @@ -457,6 +463,57 @@ bool AscendDeviceAddress::SyncDeviceToDevice(const ShapeVector &shape, size_t si | |||
| return true; | |||
| } | |||
| // Copies the data of src_device_addr into this device address by staging it through an intermediate host tensor. | |||
| // The host tensor is built from the source's host shape and type id, filled with SyncDeviceToHost on the source, | |||
| // and then written to this address with SyncHostToDevice. | |||
| // Returns true immediately when this address's type id lies strictly between kMonadTypeBegin and kMonadTypeEnd. | |||
| // Returns false when this address is smaller than the source, when the source has no host shape, | |||
| // or when either copy stage fails. | |||
| // src_device_addr must be non-null and must be an AscendDeviceAddress (both enforced by MS_EXCEPTION_IF_NULL). | |||
| bool AscendDeviceAddress::SyncDeviceToDeviceWithDiffFormatType(const DeviceSync *src_device_addr) const { | |||
| MS_EXCEPTION_IF_NULL(src_device_addr); | |||
| if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { | |||
| return true; | |||
| } | |||
| // dynamic_cast yields nullptr when the source is not an AscendDeviceAddress. | |||
| auto src_device_address = dynamic_cast<const AscendDeviceAddress *>(src_device_addr); | |||
| MS_EXCEPTION_IF_NULL(src_device_address); | |||
| // The destination must be able to hold everything the source holds. | |||
| if (size_ < src_device_address->GetSize()) { | |||
| MS_LOG(ERROR) << "Src size is greater than dst size, src size is: " << src_device_address->GetSize() | |||
| << ", dst size is: " << size_; | |||
| return false; | |||
| } | |||
| BindDevice(); | |||
| // The intermediate tensor is shaped from the source's host shape, so that shape has to be set. | |||
| auto host_shape = src_device_address->host_shape(); | |||
| if (host_shape.empty()) { | |||
| MS_LOG(ERROR) << "host shape is empty, please check whether the host shape of source device address " | |||
| << src_device_address << " is set."; | |||
| return false; | |||
| } | |||
| auto host_tensor = std::make_shared<tensor::Tensor>(src_device_address->type_id(), host_shape); | |||
| auto host_tensor_size = LongToSize(host_tensor->data().nbytes()); | |||
| auto host_tensor_type = host_tensor->data_type(); | |||
| // Stage 1: source device memory -> intermediate host tensor. | |||
| if (!src_device_address->SyncDeviceToHost(host_shape, host_tensor_size, host_tensor_type, host_tensor->data_c())) { | |||
| MS_LOG(ERROR) << "Sync device to device failed at the stage of sync device to intermediate Tensor."; | |||
| return false; | |||
| } | |||
| // Stage 2: intermediate host tensor -> this device address, passing the tensor's host format. | |||
| if (!SyncHostToDevice(host_shape, host_tensor_size, host_tensor_type, host_tensor->data_c(), | |||
| host_tensor->device_info().host_format_)) { | |||
| MS_LOG(ERROR) << "Sync device to device failed at the stage of sync intermediate tensor to device."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| // Copies the data of src_device_addr into this device address. | |||
| // When both addresses share the same format and type id, the copy is delegated to | |||
| // SyncDeviceToDeviceWithSameFormatType using the source's size, type id, pointer and format. | |||
| // Otherwise a warning is logged and the copy is delegated to SyncDeviceToDeviceWithDiffFormatType. | |||
| // src_device_addr must be non-null and must be an AscendDeviceAddress (both enforced by MS_EXCEPTION_IF_NULL). | |||
| // Returns the result of the delegated copy. | |||
| bool AscendDeviceAddress::SyncDeviceToDevice(const DeviceSync *src_device_addr) const { | |||
| MS_EXCEPTION_IF_NULL(src_device_addr); | |||
| auto src_device_address = dynamic_cast<const AscendDeviceAddress *>(src_device_addr); | |||
| // dynamic_cast yields nullptr when the source is not an AscendDeviceAddress; check before dereferencing. | |||
| MS_EXCEPTION_IF_NULL(src_device_address); | |||
| if (format_ == src_device_address->format() && type_id_ == src_device_address->type_id()) { | |||
| return SyncDeviceToDeviceWithSameFormatType(ShapeVector(), src_device_address->GetSize(), | |||
| src_device_address->type_id(), src_device_address->GetPtr(), | |||
| src_device_address->format()); | |||
| } | |||
| MS_LOG(WARNING) << "Can not copy from device to device directly, format or type is different, src(format:" | |||
| << src_device_address->format() << ", type_id:" << TypeIdLabel(src_device_address->type_id()) | |||
| << "), dst(format:" << format_ << ", type_id:" << TypeIdLabel(type_id_) | |||
| << "), use the intermediate Tensor copy instead."; | |||
| return SyncDeviceToDeviceWithDiffFormatType(src_device_addr); | |||
| } | |||
| bool AscendDeviceAddress::AsyncDeviceToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const { | |||
| MS_LOG(INFO) << "AsyncDeviceToDevice, dst(format:" << format_ << ", type_id:" << TypeIdLabel(type_id_) | |||
| @@ -52,8 +52,10 @@ class AscendDeviceAddress : public DeviceAddress { | |||
| const std::string &format = "DefaultFormat") const override; | |||
| bool AsyncDeviceToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const override; | |||
| bool SyncDeviceToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const override; | |||
| bool SyncDeviceToDeviceWithSameFormatType(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const override; | |||
| bool SyncDeviceToDeviceWithDiffFormatType(const DeviceSync *src_device_addr) const override; | |||
| bool SyncDeviceToDevice(const DeviceSync *src_device_addr) const override; | |||
| void ClearDeviceMemory() override; | |||
| DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; } | |||
| #ifndef ENABLE_SECURITY | |||
| @@ -615,6 +615,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph &graph) { | |||
| MS_EXCEPTION_IF_NULL(address.addr); | |||
| device_address = CreateDeviceAddress(address.addr, address.size, AnfAlgo::GetOutputFormat(item, index), | |||
| output_type_id, {item, index}); | |||
| device_address->set_host_shape(trans::GetRuntimePaddingShape(item, index)); | |||
| AnfAlgo::SetOutputAddr(device_address, index, item.get()); | |||
| continue; | |||
| } | |||
| @@ -644,6 +645,7 @@ void KernelRuntime::GetDeviceAddress(const AnfNodePtr &item, | |||
| TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index); | |||
| *device_address = | |||
| CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index}); | |||
| (*device_address)->set_host_shape(trans::GetRuntimePaddingShape(item, index)); | |||
| MS_LOG(INFO) << "Assign Static Memory for Input node, size:" << tensor_size | |||
| << " node:" << item->fullname_with_scope() << " index: " << index; | |||
| if (mem_manager_->MallocMem(kStaticMem, tensor_size, *device_address, graph_id) == nullptr) { | |||
| @@ -699,6 +701,9 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph &graph) { | |||
| MS_LOG(DEBUG) << "REF address is not same, ref node output need address update"; | |||
| MS_LOG(DEBUG) << "REF origin op is " << origin_pair.first->DebugString() << ", output index is " | |||
| << origin_pair.second << ", cur op is " << kernel->DebugString() << ", out index is " << i; | |||
| if (!cur_node_output_addr->host_shape().empty()) { | |||
| origin_node_output_addr->set_host_shape(cur_node_output_addr->host_shape()); | |||
| } | |||
| AnfAlgo::SetOutputAddr(origin_node_output_addr, i, kernel.get()); | |||
| } | |||
| } | |||
| @@ -763,6 +768,7 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(MemType type, const AnfNode | |||
| } else { | |||
| address->set_ptr(output_ptr); | |||
| } | |||
| address->set_host_shape(trans::GetRuntimePaddingShape(node, j)); | |||
| AnfAlgo::SetOutputAddr(address, j, node.get()); | |||
| output_ptr += align_size_list[j]; | |||
| } | |||
| @@ -944,6 +950,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const | |||
| auto output_format = AnfAlgo::GetOutputFormat(value_node, output_idx); | |||
| DeviceAddressPtr address = | |||
| CreateDeviceAddress(nullptr, node_size, output_format, output_type_id, {value_node, output_idx}); | |||
| address->set_host_shape(trans::GetRuntimePaddingShape(value_node, output_idx)); | |||
| address->set_from_persistent_mem(true); | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) && | |||
| @@ -153,9 +153,7 @@ bool Copy(const DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_ | |||
| // Other device tensor copy to CPU device tensor. | |||
| return src_device_tensor->SyncDeviceToHost(copy_size, dst_device_tensor->GetMutablePtr()); | |||
| } else if (dst_device_tensor->DeviceType() == src_device_tensor->DeviceType()) { | |||
| return dst_device_tensor->SyncDeviceToDevice(ShapeVector(), src_device_tensor->GetSize(), | |||
| src_device_tensor->type_id(), src_device_tensor->GetPtr(), | |||
| src_device_tensor->format()); | |||
| return dst_device_tensor->SyncDeviceToDevice(src_device_tensor); | |||
| } else { | |||
| MS_LOG(ERROR) << "Invalid device type, src device type: " << src_device_tensor->DeviceType() | |||
| << ", dst device type: " << dst_device_tensor->DeviceType(); | |||
| @@ -292,15 +290,5 @@ std::string FetchActorName(KernelTransformType kernel_type, const std::string &a | |||
| } | |||
| return actor_name; | |||
| } | |||
| // Reports whether two device tensors of the same device type differ in format or in type id. | |||
| // Returns false when the device types differ. Both pointers must be non-null. | |||
| bool NeedSyncByTensor(const DeviceTensor *dst_device_addr, const DeviceTensor *src_device_addr) { | |||
| MS_EXCEPTION_IF_NULL(dst_device_addr); | |||
| MS_EXCEPTION_IF_NULL(src_device_addr); | |||
| const bool same_device_type = (src_device_addr->DeviceType() == dst_device_addr->DeviceType()); | |||
| if (!same_device_type) { | |||
| return false; | |||
| } | |||
| // A format mismatch alone is enough; the type id is compared only when the formats agree. | |||
| if (src_device_addr->format() != dst_device_addr->format()) { | |||
| return true; | |||
| } | |||
| return src_device_addr->type_id() != dst_device_addr->type_id(); | |||
| } | |||
| } // namespace runtime | |||
| } // namespace mindspore | |||
| @@ -210,8 +210,6 @@ KernelTransformType FetchKernelTransformType(const AnfNodePtr &node, const Kerne | |||
| GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline); | |||
| std::string FetchActorName(KernelTransformType kernel_type, const std::string &actor_set_name, | |||
| const AnfNodePtr &node = nullptr, const KernelGraphPtr &graph = nullptr); | |||
| bool NeedSyncByTensor(const DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor); | |||
| } // namespace runtime | |||
| } // namespace mindspore | |||
| @@ -166,7 +166,7 @@ void ExitActor::CopyDeviceAddress(OpContext<DeviceTensor> *const context) { | |||
| SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, *context, *device_contexts_[i], | |||
| GetAID().Name(), new_device_tensor->GetSize()); | |||
| } | |||
| if (!new_device_tensor->SyncDeviceToDevice( | |||
| if (!new_device_tensor->SyncDeviceToDeviceWithSameFormatType( | |||
| trans::GetRuntimePaddingShape(node_with_index.first, node_with_index.second), | |||
| input_device_tensor->GetSize(), input_device_tensor->type_id(), input_device_tensor->GetPtr(), | |||
| input_device_tensor->format())) { | |||
| @@ -242,15 +242,10 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons | |||
| if (tensor_device_address.get() == device_tensor) { | |||
| continue; | |||
| } | |||
| if (NeedSyncByTensor(device_tensor, tensor_device_address.get())) { | |||
| host_tensor->data_sync(false); | |||
| } else { | |||
| if ((!Copy(device_tensor, tensor_device_address.get()))) { | |||
| SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed."); | |||
| } | |||
| continue; | |||
| if ((!Copy(device_tensor, tensor_device_address.get()))) { | |||
| SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed."); | |||
| } | |||
| continue; | |||
| } | |||
| // Sync data from host_tensor to device_tensor. | |||
| @@ -56,6 +56,7 @@ void OutputActor::RunOpControl(AID *const, OpContext<DeviceTensor> *const contex | |||
| if (outputs_[device_tensor_store_key.first] == nullptr) { | |||
| SET_OPCONTEXT_FAIL_RET_WITH_ERROR(*context, "Create output tensor failed."); | |||
| } | |||
| output_nodes_[device_tensor_store_key.first] = {device_tensor_store_key.second, 0}; | |||
| } | |||
| current_outputs_num_ = 0; | |||
| @@ -175,6 +176,7 @@ void OutputActor::UpdateOutputDeviceAddress() { | |||
| auto node_with_index = device_tensor->GetNodeIndex(); | |||
| tensor_device_address->SetNodeIndex(node_with_index.first, node_with_index.second); | |||
| tensor_device_address->set_from_persistent_mem(device_tensor->from_persistent_mem()); | |||
| tensor_device_address->set_host_shape(device_tensor->host_shape()); | |||
| // The outputs may have the same output node, so need skip when the node has been done. | |||
| if (device_tensor->GetPtr() == nullptr) { | |||
| continue; | |||
| @@ -190,9 +192,9 @@ void OutputActor::UpdateOutputDeviceAddress() { | |||
| << output_node->fullname_with_scope() << ", alloc size: " << tensor_device_address->GetSize() | |||
| << "B."; | |||
| } | |||
| if (!tensor_device_address->SyncDeviceToDevice(trans::GetRuntimePaddingShape(output_node, output_index), | |||
| device_tensor->GetSize(), device_tensor->type_id(), | |||
| device_tensor->GetPtr(), device_tensor->format())) { | |||
| if (!tensor_device_address->SyncDeviceToDeviceWithSameFormatType( | |||
| trans::GetRuntimePaddingShape(output_node, output_index), device_tensor->GetSize(), | |||
| device_tensor->type_id(), device_tensor->GetPtr(), device_tensor->format())) { | |||
| MS_LOG(EXCEPTION) << "Sync device to device failed, device type: " << tensor_device_address->DeviceType(); | |||
| } | |||
| } else { | |||
| @@ -258,8 +258,10 @@ void AscendDeviceContext::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_g | |||
| if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) { | |||
| std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_); | |||
| std::string target_dir = root_dir + "/graphs"; | |||
| std::string cst_file_dir = GenerateDumpPath(graph->root_graph_id(), rank_id_, true); | |||
| std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir"; | |||
| DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack); | |||
| DumpConstantInfo(graph, cst_file_dir); | |||
| DumpIR("trace_code_graph", graph, true, kWholeStack, ir_file_path); | |||
| DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv", root_dir, | |||
| graph->execution_order()); | |||
| @@ -453,8 +455,8 @@ void AscendDeviceContext::AssignOutputNopNodeDeviceAddress(const KernelGraphPtr | |||
| auto output_type = AnfAlgo::GetOutputDeviceDataType(output, 0); | |||
| auto device_address = CreateDeviceAddress(const_cast<void *>(ptr), size, output_format, output_type); | |||
| device_address->set_is_ptr_persisted(true); | |||
| device_address->set_host_shape(trans::GetRuntimePaddingShape(output, 0)); | |||
| AnfAlgo::SetOutputAddr(device_address, 0, output.get()); | |||
| AnfAlgo::SetNodeAttr(kAttrSkipNopOpAddr, MakeValue(false), output); | |||
| MS_LOG(INFO) << "Assign device address to output nop node " << output->fullname_with_scope(); | |||
| } | |||
| @@ -39,10 +39,12 @@ class DeviceSync { | |||
| virtual bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const = 0; | |||
| virtual bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr, | |||
| const std::string &format = "DefaultFormat") const = 0; | |||
| virtual bool SyncDeviceToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const { | |||
| virtual bool SyncDeviceToDevice(const DeviceSync *src_device_addr) const { return true; } | |||
| virtual bool SyncDeviceToDeviceWithSameFormatType(const ShapeVector &shape, size_t size, TypeId type, | |||
| const void *src_ptr, const std::string &format) const { | |||
| return true; | |||
| } | |||
| virtual bool SyncDeviceToDeviceWithDiffFormatType(const DeviceSync *src_device_addr) const { return true; } | |||
| virtual bool AsyncDeviceToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *src_ptr, | |||
| const std::string &format) const { | |||
| return true; | |||