Merge pull request !30660 from jiaorui/host_shaper1.7
| @@ -245,10 +245,10 @@ void ClearGraphDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *d | |||
| continue; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(device_context); | |||
| auto new_device_address = device_context->CreateDeviceAddress( | |||
| nullptr, device_address->GetSize(), device_address->format(), device_address->type_id()); | |||
| auto new_device_address = | |||
| device_context->CreateDeviceAddress(nullptr, device_address->GetSize(), device_address->format(), | |||
| device_address->type_id(), device_address->host_shape()); | |||
| MS_EXCEPTION_IF_NULL(new_device_address); | |||
| new_device_address->set_host_shape(device_address->host_shape()); | |||
| new_device_address->set_original_ref_count(device_address->original_ref_count()); | |||
| new_device_address->ResetRefCount(); | |||
| if (is_gradient_out) { | |||
| @@ -286,7 +286,7 @@ std::shared_ptr<device::DeviceAddress> CreateAscendDeviceAddress(const KernelLau | |||
| device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id}); | |||
| auto format = kOpFormat_DEFAULT; | |||
| MS_EXCEPTION_IF_NULL(addr_ptr); | |||
| return device_context->CreateDeviceAddress(addr_ptr->addr, addr_ptr->size, format, type); | |||
| return device_context->CreateDeviceAddress(addr_ptr->addr, addr_ptr->size, format, type, ShapeVector()); | |||
| } | |||
| void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path, | |||
| @@ -86,7 +86,7 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint | |||
| } | |||
| auto format = kOpFormat_DEFAULT; | |||
| auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type); | |||
| auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector()); | |||
| string input_tensor_name = input_kernel_name + ':' + "0"; | |||
| ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX); | |||
| auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), format, int_shapes, type, 0, true, | |||
| @@ -122,7 +122,7 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uin | |||
| } | |||
| auto format = kOpFormat_DEFAULT; | |||
| auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type); | |||
| auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector()); | |||
| string tensor_name = kernel_name + ':' + std::to_string(j); | |||
| ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j); | |||
| auto ret = | |||
| @@ -934,6 +934,18 @@ void AscendDeviceContext::InsertEventBeforeRunTask(const KernelGraphPtr &graph) | |||
| graph_event_[graph->graph_id()] = compute_event; | |||
| } | |||
| // Create an Ascend device address and record the host shape supplied by the caller. | |||
| // An empty shape is a legitimate input: several call sites pass ShapeVector() explicitly (for example for | |||
| // workspace addresses), so it is only traced at DEBUG level instead of being reported as a warning. | |||
| DeviceAddressPtr AscendDeviceContext::CreateDeviceAddress(void *const device_ptr, size_t device_size, | |||
| const string &format, TypeId type_id, | |||
| const ShapeVector &shape) const { | |||
| auto device_address = std::make_shared<AscendDeviceAddress>( | |||
| device_ptr, device_size, format, type_id, device_context_key_.device_name_, device_context_key_.device_id_); | |||
| if (shape.empty()) { | |||
| MS_LOG(DEBUG) << "shape size is empty."; | |||
| } | |||
| device_address->set_host_shape(shape); | |||
| return device_address; | |||
| } | |||
| MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext); | |||
| } // namespace ascend | |||
| } // namespace device | |||
| @@ -91,11 +91,8 @@ class AscendDeviceContext : public DeviceContext { | |||
| const std::vector<size_t> &size_list) const override; | |||
| // Create concrete device address according different device type. | |||
| DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) const override { | |||
| return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id, | |||
| device_context_key_.device_name_, device_context_key_.device_id_); | |||
| } | |||
| DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, TypeId type_id, | |||
| const ShapeVector &shape = ShapeVector()) const override; | |||
| // Get device address type according different device type, such GPU, Ascend. | |||
| DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kAscend; } | |||
| @@ -119,7 +119,7 @@ void CPUDeviceContext::FreeMemory(void *const ptr) const { | |||
| } | |||
| // Create a CPU device address and record the host shape supplied by the caller, so that the shape handed to | |||
| // CreateDeviceAddress is not silently dropped on this backend. | |||
| DeviceAddressPtr CPUDeviceContext::CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id, const ShapeVector &shape) const { | |||
| auto device_address = std::make_shared<CPUDeviceAddress>(device_ptr, device_size, format, type_id, | |||
| device_context_key_.device_name_, device_context_key_.device_id_); | |||
| device_address->set_host_shape(shape); | |||
| return device_address; | |||
| } | |||
| @@ -43,8 +43,8 @@ class CPUDeviceContext : public DeviceContext { | |||
| void *AllocateMemory(size_t size) const override; | |||
| void FreeMemory(void *const ptr) const override; | |||
| DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) const override; | |||
| DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, TypeId type_id, | |||
| const ShapeVector &shape = ShapeVector()) const override; | |||
| DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kCPU; } | |||
| void OptimizeGraph(const KernelGraphPtr &graph) const override; | |||
| @@ -233,7 +233,7 @@ void GPUDeviceContext::FreeMemory(void *const ptr) const { | |||
| } | |||
| // Create a GPU device address and record the host shape supplied by the caller, so that the shape handed to | |||
| // CreateDeviceAddress is not silently dropped on this backend. | |||
| DeviceAddressPtr GPUDeviceContext::CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id, const ShapeVector &shape) const { | |||
| auto device_address = std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id, | |||
| device_context_key_.device_name_, device_context_key_.device_id_); | |||
| device_address->set_host_shape(shape); | |||
| return device_address; | |||
| } | |||
| @@ -49,8 +49,8 @@ class GPUDeviceContext : public DeviceContext { | |||
| void *AllocateMemory(size_t size) const override; | |||
| void FreeMemory(void *const ptr) const override; | |||
| DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) const override; | |||
| DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, TypeId type_id, | |||
| const ShapeVector &shape = ShapeVector()) const override; | |||
| DeviceAddressType GetDeviceAddressType() const override { return DeviceAddressType::kGPU; } | |||
| // Optimize the kernel graph for graph mode. | |||
| @@ -172,8 +172,9 @@ void ExitActor::CopyDeviceAddress(OpContext<DeviceTensor> *const context) { | |||
| } | |||
| MS_EXCEPTION_IF_NULL(device_contexts_[i]); | |||
| // Create the new device tensor to take over the input_device_tensors which are the outputs of kernel graphs. | |||
| auto new_device_tensor = device_contexts_[i]->CreateDeviceAddress( | |||
| nullptr, input_device_tensor->GetSize(), input_device_tensor->format(), input_device_tensor->type_id()); | |||
| auto new_device_tensor = | |||
| device_contexts_[i]->CreateDeviceAddress(nullptr, input_device_tensor->GetSize(), input_device_tensor->format(), | |||
| input_device_tensor->type_id(), input_device_tensor->host_shape()); | |||
| MS_EXCEPTION_IF_NULL(new_device_tensor); | |||
| (void)created_device_tensors_.emplace_back(new_device_tensor); | |||
| (void)new_device_tensors.emplace_back(new_device_tensor.get()); | |||
| @@ -467,8 +467,9 @@ void DataPrepareActor::PrepareDataForStepMode(const std::vector<std::vector<Tens | |||
| output_type_id = common::AnfAlgo::GetOutputInferDataType(input_node, 0); | |||
| } | |||
| size_t tensor_size = AnfAlgo::GetOutputTensorMemSize(input_node, 0); | |||
| auto device_address = device_context->CreateDeviceAddress( | |||
| nullptr, tensor_size, AnfAlgo::GetOutputFormat(input_node, 0), output_type_id); | |||
| auto device_address = | |||
| device_context->CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(input_node, 0), | |||
| output_type_id, trans::GetRuntimePaddingShape(input_node, 0)); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| AnfAlgo::SetOutputAddr(device_address, 0, input_node.get()); | |||
| device_address->SetNodeIndex(input_node, 0); | |||
| @@ -668,8 +669,9 @@ void DataPrepareActor::PrepareDataForWeightNode(const AnfNodePtr &backend_node, | |||
| // The step mode can't reuse the device tensor, because other actors may use the device tensor in step mode. | |||
| if ((strategy_ == GraphExecutionStrategy::kStep) || | |||
| (device_tensor->DeviceType() != device_context->GetDeviceAddressType())) { | |||
| host_tensor_address = device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(), | |||
| device_tensor->format(), device_tensor->type_id()); | |||
| host_tensor_address = | |||
| device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(), device_tensor->format(), | |||
| device_tensor->type_id(), device_tensor->host_shape()); | |||
| host_tensor_address->set_from_persistent_mem(tensor->is_parameter()); | |||
| } else { | |||
| host_tensor_address = device_tensor; | |||
| @@ -127,7 +127,8 @@ void KernelActor::FetchWorkspaceDeviceTensor() { | |||
| launch_info_.workspaces_.erase(launch_info_.workspaces_.end() - size, launch_info_.workspaces_.end()); | |||
| } else if (launch_info_.workspaces_.size() < workspace_sizes.size()) { | |||
| for (size_t i = launch_info_.workspaces_.size(); i < workspace_sizes.size(); ++i) { | |||
| auto device_address = device_contexts_[0]->CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown); | |||
| auto device_address = | |||
| device_contexts_[0]->CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown, ShapeVector()); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << common::AnfAlgo::GetNodeDebugString(kernel_) | |||
| << " addr:" << device_address; | |||
| AnfAlgo::SetWorkspaceAddr(device_address, i, kernel_.get()); // set to kernel_info | |||
| @@ -309,8 +310,9 @@ void KernelActor::CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data, | |||
| SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, *context, "The input index is of range."); | |||
| } | |||
| if (copy_input_device_tensors_[input_data->index_] == nullptr) { | |||
| copy_input_device_tensors_[input_data->index_] = device_contexts_[0]->CreateDeviceAddress( | |||
| nullptr, device_tensor->GetSize(), device_tensor->format(), device_tensor->type_id()); | |||
| copy_input_device_tensors_[input_data->index_] = | |||
| device_contexts_[0]->CreateDeviceAddress(nullptr, device_tensor->GetSize(), device_tensor->format(), | |||
| device_tensor->type_id(), device_tensor->host_shape()); | |||
| } | |||
| auto &new_device_tensor = copy_input_device_tensors_[input_data->index_]; | |||
| MS_EXCEPTION_IF_NULL(new_device_tensor); | |||
| @@ -149,8 +149,9 @@ TensorPtr OutputActor::CreateOutputTensor(const AnfNodePtr &output_node, size_t | |||
| if (output_node_to_tensor_device_address_.count({output_node, output_index}) > 0) { | |||
| tensor->set_device_address(output_node_to_tensor_device_address_[{output_node, output_index}]); | |||
| } else { | |||
| auto tensor_device_address = device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(), | |||
| device_tensor->format(), device_tensor->type_id()); | |||
| auto tensor_device_address = | |||
| device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(), device_tensor->format(), | |||
| device_tensor->type_id(), device_tensor->host_shape()); | |||
| MS_EXCEPTION_IF_NULL(tensor_device_address); | |||
| tensor->set_device_address(tensor_device_address); | |||
| output_node_to_tensor_device_address_[{output_node, output_index}] = tensor_device_address; | |||
| @@ -283,7 +283,7 @@ void CreateDeviceTensorForValueNode(const KernelWithIndex &front_node_with_index | |||
| // Create device tensor. | |||
| std::string output_format = AnfAlgo::GetOutputFormat(backend_node, 0); | |||
| device::DeviceAddressPtr address = | |||
| device_context->CreateDeviceAddress(nullptr, tensor_size, output_format, output_type_id); | |||
| device_context->CreateDeviceAddress(nullptr, tensor_size, output_format, output_type_id, ShapeVector()); | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| MS_LOG(DEBUG) << "Create address for node:" << common::AnfAlgo::GetNodeDebugString(front_node) << " addr:" << address | |||
| << " size:" << tensor_size; | |||
| @@ -311,7 +311,8 @@ void CreateDeviceTensorForFrontNode(const KernelWithIndex &front_node_with_index | |||
| size_t size = AnfAlgo::GetOutputTensorMemSize(node, 0); | |||
| // Create device tensor. | |||
| device::DeviceAddressPtr address = device_context->CreateDeviceAddress(nullptr, size, kOpFormat_DEFAULT, type_id); | |||
| device::DeviceAddressPtr address = | |||
| device_context->CreateDeviceAddress(nullptr, size, kOpFormat_DEFAULT, type_id, ShapeVector()); | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| MS_LOG(INFO) << "Create address for node that has no corresponding backend node:" | |||
| << common::AnfAlgo::GetNodeDebugString(node) << " addr:" << address << " size:" << size | |||
| @@ -100,10 +100,10 @@ void CreateParameterDeviceAddress(const DeviceContext *device_context, const Ker | |||
| } | |||
| size_t tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, tensor_size, | |||
| AnfAlgo::GetOutputFormat(item, index), output_type_id); | |||
| auto device_address = | |||
| device_context->CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, | |||
| trans::GetRuntimePaddingShape(item, index)); | |||
| device_address->set_from_persistent_mem(item->isa<Parameter>()); | |||
| device_address->set_host_shape(trans::GetRuntimePaddingShape(item, index)); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << common::AnfAlgo::GetNodeDebugString(item) | |||
| << " addr:" << device_address; | |||
| AnfAlgo::SetOutputAddr(device_address, index, item.get()); | |||
| @@ -144,8 +144,8 @@ void CreateDeviceAddressForTensorValue(const DeviceContext *device_context, cons | |||
| } | |||
| std::string output_format = AnfAlgo::GetOutputFormat(value_node, output_idx); | |||
| device::DeviceAddressPtr address = | |||
| device_context->CreateDeviceAddress(nullptr, tensor_size, output_format, output_type_id); | |||
| device::DeviceAddressPtr address = device_context->CreateDeviceAddress( | |||
| nullptr, tensor_size, output_format, output_type_id, trans::GetRuntimePaddingShape(value_node, output_idx)); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << common::AnfAlgo::GetNodeDebugString(value_node) << " addr:" << address; | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| address->set_from_persistent_mem(true); | |||
| @@ -169,7 +169,8 @@ void CreateValueNodeDeviceAddress(const DeviceContext *device_context, const Ker | |||
| } else if (node_value->isa<StringImm>()) { | |||
| auto value = GetValue<std::string>(node_value); | |||
| size_t tensor_size = value.size(); | |||
| auto address = device_context->CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8); | |||
| auto address = | |||
| device_context->CreateDeviceAddress(nullptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8, ShapeVector()); | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| address->set_from_persistent_mem(true); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << common::AnfAlgo::GetNodeDebugString(value_node) | |||
| @@ -199,8 +200,8 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const | |||
| auto output_format = AnfAlgo::GetOutputFormat(kernel, i); | |||
| auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); | |||
| auto address_size = AnfAlgo::GetOutputTensorMemSize(kernel, i); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type); | |||
| device_address->set_host_shape(trans::GetRuntimePaddingShape(kernel, i)); | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type, | |||
| trans::GetRuntimePaddingShape(kernel, i)); | |||
| if (is_gradient_out) { | |||
| device_address->set_from_persistent_mem(true); | |||
| } | |||
| @@ -227,7 +228,8 @@ void CreateKernelWorkspaceDeviceAddress(const DeviceContext *device_context, con | |||
| if (AnfAlgo::WorkspaceAddrExist(kernel, i)) { | |||
| break; | |||
| } | |||
| auto device_address = device_context->CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown); | |||
| auto device_address = | |||
| device_context->CreateDeviceAddress(nullptr, workspace_sizes[i], "", kTypeUnknown, ShapeVector()); | |||
| MS_LOG(DEBUG) << "Create addr for node:" << common::AnfAlgo::GetNodeDebugString(kernel) | |||
| << " addr:" << device_address; | |||
| AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get()); | |||
| @@ -2046,8 +2046,9 @@ void GraphScheduler::PersistDeviceTensor(const GraphCompilerInfo &graph_compiler | |||
| if (DeviceTensorStore::GetInstance().Fetch(front_node.get(), device_context->GetDeviceAddressType()) == nullptr) { | |||
| MS_LOG(WARNING) << "Fetch no device tensor store by:" << front_node->fullname_with_scope() | |||
| << ", type:" << device_context->GetDeviceAddressType(); | |||
| auto other_type_device_tensor = device_context->CreateDeviceAddress( | |||
| nullptr, device_tensor->GetSize(), device_tensor->format(), device_tensor->type_id()); | |||
| auto other_type_device_tensor = | |||
| device_context->CreateDeviceAddress(nullptr, device_tensor->GetSize(), device_tensor->format(), | |||
| device_tensor->type_id(), device_tensor->host_shape()); | |||
| other_type_device_tensor->SetNodeIndex(input_node, 0); | |||
| other_type_device_tensor->set_from_persistent_mem(input_node->isa<Parameter>()); | |||
| AddDeviceTensorStore(front_node.get(), other_type_device_tensor); | |||
| @@ -2089,8 +2090,9 @@ void GraphScheduler::PersistDeviceTensorForRootGraphControlNode(const GraphCompi | |||
| auto sub_device_tensor = AnfAlgo::GetMutableOutputAddr(backend_node, 0, false); | |||
| MS_EXCEPTION_IF_NULL(sub_device_tensor); | |||
| auto new_device_tensor = device_context->CreateDeviceAddress( | |||
| nullptr, sub_device_tensor->GetSize(), sub_device_tensor->format(), sub_device_tensor->type_id()); | |||
| auto new_device_tensor = | |||
| device_context->CreateDeviceAddress(nullptr, sub_device_tensor->GetSize(), sub_device_tensor->format(), | |||
| sub_device_tensor->type_id(), sub_device_tensor->host_shape()); | |||
| MS_EXCEPTION_IF_NULL(new_device_tensor); | |||
| new_device_tensor->SetNodeIndex(backend_node, 0); | |||
| new_device_tensor->set_is_ptr_persisted(sub_device_tensor->is_ptr_persisted()); | |||
| @@ -88,7 +88,7 @@ class DeviceContext { | |||
| // Create concrete device address according different device type. | |||
| virtual DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) const = 0; | |||
| TypeId type_id, const ShapeVector &shape) const = 0; | |||
| // Get device address type according different device type, such GPU, Ascend. | |||
| virtual DeviceAddressType GetDeviceAddressType() const = 0; | |||