@@ -1018,7 +1018,6 @@ void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_grap
 void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
   MS_LOG(INFO) << "Start!";
-  opt::HideNopNode(kernel_graph.get());
   // Insert CLearZero op
   // prepare for next step from json get atomic info
   BuildKernel(kernel_graph);
@@ -1079,7 +1078,6 @@ void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input
                                       KernelGraph *kernel_graph) const {
   MS_LOG(INFO) << "Start memory alloc!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  opt::RemoveNopNode(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
@@ -418,8 +418,6 @@ void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &grap
   SelectKernel(kernel_graph);
   RunOpHardwareOptimize(kernel_graph);
   StartKernelRT();
-  // Hide NopOp from execution graph
-  opt::HideNopNode(kernel_graph.get());
   BuildKernel(kernel_graph);
   run_op_graphs_[graph_info] = kernel_graph;
 }
@@ -434,8 +432,6 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
   // run op
   auto kernel_graph = run_op_graphs_[graph_info];
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  // Remove NopOp from execution graph
-  opt::RemoveNopNode(kernel_graph.get());
   RunOpAllocateMemory(*input_tensors, kernel_graph.get());
   // Execute the computation
   LoadInputData(kernel_graph, *input_tensors);
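Note: the four hunks above all drop the nop-node passes (opt::HideNopNode / opt::RemoveNopNode) from the single-op build and run paths on Ascend and GPU, so nop nodes now stay in the op graph in PyNative mode. The launch path decides at runtime whether to look through them; a condensed sketch of that decision, taken from the kernel_runtime hunk further below in this diff:

```cpp
// Condensed from the GenLaunchArgs change later in this diff (sketch, not a new API):
// in PyNative mode nop nodes are kept in the graph, so address lookups must not skip them.
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto visit_nop_node = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode);
auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input, visit_nop_node);
```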
@@ -1173,7 +1173,12 @@ void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_grap
     auto &tensor = item.first;
     auto &node = item.second.first;
     auto &output_index = item.second.second;
-    auto address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    DeviceAddressPtr address = nullptr;
+    if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
+    } else {
+      address = AnfAlgo::GetMutableOutputAddr(node, output_index);
+    }
     MS_EXCEPTION_IF_NULL(tensor);
     tensor->set_device_address(address);
     tensor->SetNeedWait(false);
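The third argument added here appears to be the same visit-nop-node switch that GenLaunchArgs gains later in this diff; under that reading, PyNative mode takes the address stored on the node itself rather than the address of the nop-skipping real producer. A two-line sketch of the assumed semantics:

```cpp
// Assumed semantics of the third parameter, inferred from this hunk plus the
// kernel_runtime hunk below; not confirmed against the AnfAlgo declaration.
auto graph_mode_addr = AnfAlgo::GetMutableOutputAddr(node, output_index);         // default: skip nop nodes
auto pynative_addr   = AnfAlgo::GetMutableOutputAddr(node, output_index, false);  // keep nop nodes visible
```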
@@ -988,7 +988,7 @@ AnfNodePtr PynativeExecutor::GetInput(const py::object &obj, bool op_mask) {
     // out = op(cell1(x, y))
     // out = op(cell1(x, y)[0])
     node = GetObjNode(obj, obj_id);
-  } else if (py::isinstance<py::tuple>(obj)) {
+  } else if (py::isinstance<py::tuple>(obj) || py::isinstance<py::list>(obj)) {
     // out = op((x, y))
     // out = cell((x, y))
     auto tuple = obj.cast<py::tuple>();
@@ -1100,6 +1100,23 @@ void PynativeExecutor::CleanPreMemoryInValueNode(const std::string &cell_id) {
     top_cell_id_ = cell_id;
     return;
   }
+  if (dynamic_cell_) {
+    std::set<std::string> forward_op_tensor_id;
+    for (const auto &elem : cell_op_index_with_tensor_id_[top_cell_id_]) {
+      const auto &tensor_id_list = elem.second;
+      for (const auto &tensor_id : tensor_id_list) {
+        forward_op_tensor_id.emplace(tensor_id);
+      }
+    }
+    for (auto &tensor : all_value_node_tensors_) {
+      if (tensor->device_address() != nullptr &&
+          forward_op_tensor_id.find(tensor->id()) != forward_op_tensor_id.end()) {
+        tensor->device_address()->ClearDeviceMemory();
+        tensor->set_device_address(nullptr);
+      }
+    }
+    all_value_node_tensors_.clear();
+  }
   const auto &tensor_id_with_tensor = cell_tensor_id_with_tensor_[top_cell_id_];
   for (const auto &elem : tensor_id_with_tensor) {
     const auto &tensors_in_value_node = elem.second;
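For dynamic cells, the new branch frees the device memory still held by value-node tensors of the previous top cell: it gathers every tensor id recorded for that cell's forward ops, then clears the matching cached tensors. A standalone model of the pattern with plain C++ stand-ins (FakeTensor and device_mem are illustrative only, not MindSpore types):

```cpp
#include <memory>
#include <set>
#include <string>
#include <vector>

struct FakeTensor {
  std::string id;
  std::shared_ptr<int> device_mem;  // stand-in for a DeviceAddress
};

int main() {
  // Ids produced by the forward ops of the previous top cell.
  std::set<std::string> forward_op_tensor_id{"t1", "t3"};
  // Tensors cached in value nodes.
  std::vector<FakeTensor> cached{{"t1", std::make_shared<int>(1)},
                                 {"t2", std::make_shared<int>(2)},
                                 {"t3", std::make_shared<int>(3)}};
  for (auto &tensor : cached) {
    if (tensor.device_mem != nullptr && forward_op_tensor_id.count(tensor.id) > 0) {
      tensor.device_mem.reset();  // mirrors ClearDeviceMemory() + set_device_address(nullptr)
    }
  }
  return 0;
}
```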
@@ -2111,6 +2128,37 @@ std::string PynativeExecutor::GetGradCellId(bool has_sens, const py::object &cel
   return cell_id;
 }

+void PynativeExecutor::SaveAllValueNodeTensors(const FuncGraphPtr &graph) {
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors;
+  auto trace_function = [&all_value_node_tensors](const AnfNodePtr &anf_node) {
+    auto value = GetValueNode(anf_node);
+    if (value) {
+      if (value->isa<tensor::Tensor>()) {
+        auto tensor = value->cast<tensor::TensorPtr>();
+        MS_EXCEPTION_IF_NULL(tensor);
+        if (tensor->device_address()) {
+          all_value_node_tensors.emplace(tensor);
+        }
+      } else if (value->isa<ValueTuple>()) {
+        auto tuple = value->cast<ValueTuplePtr>();
+        MS_EXCEPTION_IF_NULL(tuple);
+        for (size_t i = 0; i < tuple->size(); i++) {
+          if ((*tuple)[i]->isa<tensor::Tensor>()) {
+            auto tensor = (*tuple)[i]->cast<tensor::TensorPtr>();
+            MS_EXCEPTION_IF_NULL(tensor);
+            if (tensor->device_address()) {
+              all_value_node_tensors.emplace(tensor);
+            }
+          }
+        }
+      }
+    }
+    return FOLLOW;
+  };
+  (void)TopoSort(graph->get_return(), SuccDeeperSimple, trace_function);
+  all_value_node_tensors_ = all_value_node_tensors;
+}
+
 void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::object &cell, const py::object &weights,
                                     const py::args &args) {
   auto size = args.size();
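SaveAllValueNodeTensors walks the bprop graph once (TopoSort with SuccDeeperSimple) and remembers every tensor a value node already holds on device, including tensors nested inside a ValueTuple; this is exactly the set the dynamic-cell branch of CleanPreMemoryInValueNode releases. A minimal standalone model of the collect-during-traversal step (Node and Collect are toy stand-ins, not the MindSpore node types):

```cpp
#include <memory>
#include <unordered_set>
#include <vector>

struct Node {
  std::shared_ptr<int> tensor;  // non-null if this "value node" holds a device tensor
  std::vector<Node *> inputs;   // edges to the nodes it consumes
};

// Depth-first walk from the graph output, collecting every held tensor exactly once.
void Collect(Node *node, std::unordered_set<Node *> *seen,
             std::unordered_set<std::shared_ptr<int>> *out) {
  if (node == nullptr || !seen->insert(node).second) {
    return;
  }
  if (node->tensor != nullptr) {
    out->insert(node->tensor);
  }
  for (auto *input : node->inputs) {
    Collect(input, seen, out);
  }
}

int main() {
  Node a{std::make_shared<int>(1), {}};
  Node b{nullptr, {&a}};
  std::unordered_set<Node *> seen;
  std::unordered_set<std::shared_ptr<int>> collected;
  Collect(&b, &seen, &collected);
  return collected.size() == 1 ? 0 : 1;
}
```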
@@ -2152,6 +2200,9 @@ void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::obje
   resource->results()[pipeline::kBackend] = compile::CreateBackend();
   MS_LOG(INFO) << "Start opt";

+  if (dynamic_cell_) {
+    SaveAllValueNodeTensors(resource->func_graph());
+  }
   PynativeOptimizeAction(resource);
   SaveTensorsInValueNode(resource);
   TaskEmitAction(resource);
@@ -200,6 +200,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   // Update the abstract and device address info of value node and tensors in bprop graph
   void UpdateAbstractAndDeviceAddress(const OpExecInfoPtr &op_exec_info, const py::object &out_real);
   void SaveTensorsInValueNode(const ResourcePtr &resource);
+  void SaveAllValueNodeTensors(const FuncGraphPtr &graph);
   void CleanPreMemoryInValueNode(const std::string &cell_id);
   // Construct grad graph
@@ -306,6 +307,7 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   std::unordered_map<std::string, TensorIdWithTensor> cell_tensor_id_with_tensor_;
   std::unordered_map<std::string, abstract::AbstractBasePtr> node_abs_map_;
   std::unordered_map<std::string, AbstractListMap> prim_abs_list_;
+  std::unordered_set<tensor::TensorPtr> all_value_node_tensors_;
 };
 using PynativeExecutorPtr = std::shared_ptr<PynativeExecutor>;
@@ -612,7 +612,7 @@ bool AscendDeviceAddress::ConvertFormatAndSyncHostToDevice(const ShapeVector &sh
   return sync_ok;
 }

-AscendDeviceAddress::~AscendDeviceAddress() {
+void AscendDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -627,6 +627,8 @@ AscendDeviceAddress::~AscendDeviceAddress() {
   }
 }

+AscendDeviceAddress::~AscendDeviceAddress() { ClearDeviceMemory(); }
+
 bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &filepath, const std::string &host_fmt,
                                         const ShapeVector &host_shape, TypeId host_type) const {
   bool ret = false;
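The two hunks above (and the matching GPU hunks below) apply the same refactor: the destructor body moves into an explicit, idempotent ClearDeviceMemory() so that callers such as CleanPreMemoryInValueNode can release device memory early while the DeviceAddress object stays alive, and the destructor just delegates. The general shape of the pattern, as a standalone sketch (Buffer is illustrative, not MindSpore code):

```cpp
#include <cstdlib>

class Buffer {
 public:
  explicit Buffer(std::size_t n) : ptr_(std::malloc(n)) {}

  // Safe to call more than once: the early-return guard makes it idempotent.
  void ClearDeviceMemory() {
    if (ptr_ == nullptr) {
      return;
    }
    std::free(ptr_);
    ptr_ = nullptr;
  }

  ~Buffer() { ClearDeviceMemory(); }  // destructor only delegates

 private:
  void *ptr_;
};

int main() {
  Buffer buf(64);
  buf.ClearDeviceMemory();  // release eagerly; the destructor is then a no-op
  return 0;
}
```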
@@ -41,6 +41,7 @@ class AscendDeviceAddress : public DeviceAddress {
   ~AscendDeviceAddress() override;
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kAscend; }
   bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
                      const ShapeVector &host_shape, TypeId host_type) const override;
@@ -35,6 +35,7 @@ class CPUDeviceAddress : public DeviceAddress {
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override {}
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kCPU; }
 };
 }  // namespace cpu
@@ -69,7 +69,7 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
   return GPUDeviceManager::GetInstance().SyncStream(stream);
 }

-GPUDeviceAddress::~GPUDeviceAddress() {
+void GPUDeviceAddress::ClearDeviceMemory() {
   if (ptr_ == nullptr) {
     return;
   }
@@ -78,6 +78,8 @@ GPUDeviceAddress::~GPUDeviceAddress() {
     ptr_ = nullptr;
   }
 }
+
+GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
 #ifdef ENABLE_DEBUGGER
 bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
                                      const ShapeVector &host_shape, TypeId host_type, size_t slot,
@@ -38,6 +38,7 @@ class GPUDeviceAddress : public DeviceAddress {
   bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const override;
   bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const override;
+  void ClearDeviceMemory() override;
   void set_status(DeviceAddressStatus status) { status_ = status; }
   DeviceAddressStatus status() const { return status_; }
   DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }
@@ -819,6 +819,9 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   if (AnfAlgo::GetCNodeName(cnode) == kAtomicAddrCleanOpName) {
     return GenAddrCleanLaunchArgs(cnode, kernel_inputs);
   }
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto visit_nop_node = (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode);
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
     auto op_name = AnfAlgo::GetCNodeName(cnode);
     constexpr auto none_placeholder_index = 3;
@@ -833,7 +836,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
       }
     }
     auto real_input = AnfAlgo::GetRealInputIndex(kernel, i);
-    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input);
+    auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, real_input, visit_nop_node);
     MS_EXCEPTION_IF_NULL(device_address);
     kernel::AddressPtr input = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(input);
@@ -844,7 +847,7 @@ void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod
   }

   for (size_t i = 0; i < kernel_mod.GetOutputSizeList().size(); ++i) {
-    auto device_address = AnfAlgo::GetOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetOutputAddr(kernel, i, visit_nop_node);
     kernel::AddressPtr output = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(output);
     output->addr = device_address->ptr_;
@@ -33,6 +33,7 @@ class DeviceSync {
   virtual bool SyncDeviceToHost(const ShapeVector &shape, size_t size, TypeId type, void *host_ptr) const = 0;
   virtual bool SyncHostToDevice(const ShapeVector &shape, size_t size, TypeId type, const void *host_ptr) const = 0;
   virtual void *GetMutablePtr() const = 0;
+  virtual void ClearDeviceMemory() = 0;
 };
 using DeviceSyncPtr = std::shared_ptr<DeviceSync>;
 }  // namespace mindspore
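The new pure virtual on DeviceSync makes every backend address type provide the hook: Ascend and GPU free their allocation and null the pointer, CPU overrides it as a no-op. A self-contained sketch of that contract with stand-in classes (DeviceSyncLike, FakeGpuAddress and FakeCpuAddress are illustrative only, not the MindSpore classes):

```cpp
#include <cstdlib>

// Minimal stand-ins mirroring the contract added above.
class DeviceSyncLike {
 public:
  virtual ~DeviceSyncLike() = default;
  virtual void ClearDeviceMemory() = 0;  // the new pure virtual
};

class FakeGpuAddress : public DeviceSyncLike {
 public:
  explicit FakeGpuAddress(std::size_t n) : ptr_(std::malloc(n)) {}
  ~FakeGpuAddress() override { ClearDeviceMemory(); }
  void ClearDeviceMemory() override {
    if (ptr_ == nullptr) {
      return;
    }
    std::free(ptr_);  // device backends release their allocation here
    ptr_ = nullptr;
  }

 private:
  void *ptr_;
};

class FakeCpuAddress : public DeviceSyncLike {
 public:
  void ClearDeviceMemory() override {}  // CPU: nothing device-side to release
};

int main() {
  FakeGpuAddress gpu(64);
  FakeCpuAddress cpu;
  gpu.ClearDeviceMemory();  // explicit early release, destructor is then a no-op
  cpu.ClearDeviceMemory();
  return 0;
}
```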