/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "session/gpu_session.h"
#include "device/gpu/kernel_info_setter.h"
#include "device/gpu/gpu_kernel_build.h"
#include "device/gpu/gpu_kernel_runtime.h"
#include "device/gpu/gpu_stream_assign.h"
#include "pre_activate/common/optimizer.h"
#include "pre_activate/common/pass_manager.h"
#include "pre_activate/common/helper.h"
#include "pre_activate/pass/communication_op_fusion.h"
#include "pre_activate/pass/getitem_tuple.h"
#include "pre_activate/gpu/adam_weight_decay_fusion.h"
#include "pre_activate/gpu/adam_fusion.h"
#include "device/kernel_runtime_manager.h"
#include "predict/predict.h"
#include "common/utils.h"
#include "common/trans.h"
#include "utils/context/ms_context.h"
#include "utils/base_ref_extends.h"

namespace mindspore {
namespace session {
namespace gpu {
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

// Assign a GPU kernel implementation (kernel build info) to every node in the
// graph's execution order.
void GPUSession::SelectKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  for (const auto &kernel_node : kernel_graph->execution_order()) {
    MS_EXCEPTION_IF_NULL(kernel_node);
    device::gpu::SetKernelInfo(kernel_node);
  }
}

// Initialize the per-device GPU kernel runtime; throws on failure.
void GPUSession::StartKernelRT() const {
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  if (!runtime_instance->Init()) {
    MS_LOG(EXCEPTION) << "GPU start kernel runtime failed";
  }
}

// Run device-independent graph optimizations (Adam fusion passes) and refresh
// the execution order afterwards.
void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
  pm->AddPass(std::make_shared<opt::AdamFusion>());
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(kernel_graph);
  // Fusion may have rewritten nodes; rebuild the execution order.
  kernel_graph->SetExecOrderByDefault();
}

// Run hardware-level optimizations (AllReduce fusion, tuple-getitem
// elimination) after kernel selection, then refresh the execution order.
void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  pm->AddPass(std::make_shared<opt::GetitemTuple>());
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
}

// Assign CUDA streams to the graph's kernels.
void GPUSession::AssignStream(const std::shared_ptr<KernelGraph> &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  device::gpu::AssignGpuStream(kernel_graph);
}

// Compile every CNode of the graph into a GPU kernel.
void GPUSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  device::gpu::GpuBuild(kernel_graph);
}

// Allocate static and dynamic device memory for a compiled graph.
void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  runtime_instance->AssignMemory(kernel_graph);
}

// Allocate device memory for a single-op (pynative) graph run.
void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
                                     KernelGraph *kernel_graph) const {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
}

// Release the device memory acquired for a single-op graph run.
void GPUSession::RunOpClearMemory(KernelGraph *kernel_graph) const {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  runtime_instance->RunOpClearMemory(kernel_graph);
}

// Copy user-supplied host tensors into the graph's parameter device addresses.
// A host->device sync is performed only when needed: in pynative-infer mode
// whenever the tensor has no (or a different) device address; otherwise when
// the tensor is dirty, has no device address, or its device address lives on a
// different device type than the parameter's. When both addresses are on the
// same device type the parameter is simply repointed at the tensor's address.
// NOTE(review): assumes inputs_const aligns 1:1 with kernel_graph->inputs()
// and inputs.size() <= input_nodes.size() — no bounds check here; confirm at
// call sites.
void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                               const std::vector<tensor::TensorPtr> &inputs_const) const {
  std::vector<tensor::TensorPtr> inputs(inputs_const);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto input_nodes = kernel_graph->inputs();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto tensor = inputs[i];
    MS_EXCEPTION_IF_NULL(tensor);
    auto input_node = input_nodes[i];
    MS_EXCEPTION_IF_NULL(input_node);
    if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
      auto pk_node = input_node->cast<ParameterPtr>();
      auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
      auto tensor_address = tensor->device_address();
      bool need_sync = false;
      if (ms_context->enable_pynative_infer()) {
        if (tensor_address == nullptr || tensor_address != device_address) {
          need_sync = true;
        }
      } else if (tensor->is_dirty() || tensor_address == nullptr) {
        need_sync = true;
      } else if (tensor_address != device_address) {
        if (tensor_address->DeviceType() == device_address->DeviceType()) {
          // Same device type: reuse the tensor's existing device address
          // instead of copying the data again.
          AnfAlgo::SetOutputAddr(tensor_address, 0, pk_node.get());
        } else {
          need_sync = true;
        }
      }
      if (need_sync) {
        tensor->set_device_address(device_address);
        MS_EXCEPTION_IF_NULL(device_address);
        if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
                                              LongToSize(tensor->data().nbytes()), tensor->data_type(),
                                              tensor->data_c())) {
          MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
        }
      }
    }
    // Host copy is now reflected on device.
    tensor->set_dirty(false);
  }
}

// Launch the compiled graph on the GPU runtime; throws if execution fails.
void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
  MS_EXCEPTION_IF_NULL(runtime_instance);
  if (!runtime_instance->Run(kernel_graph.get())) {
    MS_LOG(EXCEPTION) << "GPU execute graph failed!";
  }
}

// Build a runnable kernel graph from an ANF node list: construct, optimize,
// select/build kernels, assign streams and memory, and register the graph
// with a FuncGraph manager. Returns the id assigned to the new graph.
GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  // Construct graph, if successfully, graph_sum_ + 1
  auto graph_id = graph_sum_;
  auto graph = ConstructKernelGraph(lst, outputs);
  MS_EXCEPTION_IF_NULL(graph);
  // Optimize
  Optimize(graph);
  // Select kernel build info
  SelectKernel(graph);
  // Convert kernel Graph to model
  predictmodel::StepConvertGraph(graph);
  // Start gpu kernel runtime
  StartKernelRT();
  // HardwareOptimize
  HardwareOptimize(graph);
  // Assign CUDA streams
  AssignStream(graph);
  // Hide NoOp from execution graph
  opt::HideNopNode(graph.get());
  // Build kernel if node is cnode
  BuildKernel(graph);
  // Set graph execution order before memory alloc, ensure that memory alloc is according to the reorder graph
  auto execution_order = graph->execution_order();
  Reorder(&execution_order);
  graph->set_execution_order(execution_order);
  // Get summary nodes.
  GetSummaryNodes(graph.get());
  // Remove NoOp from execution graph
  opt::RemoveNopNode(graph.get());
  // Alloc memory, including static memory and dynamic memory
  AllocateMemory(graph.get());
  MS_EXCEPTION_IF_NULL(context_);
  FuncGraphManagerPtr manager = MakeManager({graph});
  context_->AddManager(manager);
  if (manager) {
    manager->AddFuncGraph(graph);
    graph->set_manager(manager);
  }
  return graph_id;
}

// Execute a previously compiled graph: load inputs, run on GPU (with the
// Python GIL released), collect outputs, and emit summaries if enabled.
void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
  auto &kernel_graph = graphs_[graph_id];
  // Load input data from user input
  LoadInputData(kernel_graph, inputs);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // Convert inputs to model
  predictmodel::StepConvertWeight(inputs);
  {
    // Release the GIL so Python threads can run while the GPU executes.
    py::gil_scoped_release gil_release;
    // Run graph on GPU
    Execute(kernel_graph);
  }
  // Get result from GPU
  UpdateOutputs(kernel_graph, outputs, inputs);
  // Summary
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  if (context_ptr->enable_gpu_summary()) {
    Summary(kernel_graph.get());
  }
}

// Build (and cache by graph_info) a single-op kernel graph for pynative mode.
void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                         const std::vector<tensor::TensorPtr> &input_tensors, const std::vector<int> &tensors_mask) {
  // Check if the graph cache exists.
  if (run_op_graphs_.find(graph_info) != run_op_graphs_.end()) {
    return;
  }
  // Prepare the graph
  auto kernel_graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  SelectKernel(kernel_graph);
  StartKernelRT();
  // Hide NoOp from execution graph
  opt::HideNopNode(kernel_graph.get());
  BuildKernel(kernel_graph);
  run_op_graphs_[graph_info] = kernel_graph;
}

// Run a cached single-op graph and return its outputs as a Python tuple.
// Device memory allocated for the run is released before returning.
py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                            const std::vector<tensor::TensorPtr> &input_tensors) {
  auto kernel_graph = run_op_graphs_[graph_info];
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // Remove NoOp from execution graph
  opt::RemoveNopNode(kernel_graph.get());
  RunOpAllocateMemory(input_tensors, kernel_graph.get());
  // Execute the computation
  LoadInputData(kernel_graph, input_tensors);
  Execute(kernel_graph);
  // Fetch outputs
  VectorRef outputs;
  UpdateOutputs(kernel_graph, &outputs, input_tensors);
  // Trans output to tuple
  auto output_tensors = TransformBaseRefListToTuple(outputs);
  if (!utils::isa<PyObjectRef>(output_tensors) ||
      !py::isinstance<py::tuple>(utils::cast<PyObjectRef>(output_tensors).object_)) {
    MS_EXCEPTION(NotSupportError) << "The output tensors should be a tuple !";
  }
  py::object tuple_obj = utils::cast<PyObjectRef>(output_tensors).object_;
  py::tuple tuple_tensors = py::cast<py::tuple>(tuple_obj);
  RunOpClearMemory(kernel_graph.get());
  return tuple_tensors;
}
}  // namespace gpu
}  // namespace session
}  // namespace mindspore