Browse Source

!25132 [PyNative][MindRT][GPU] Op Lazy Build

Merge pull request !25132 from caifubi/master-pynative-mindrt-gpu-async-build
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
1a7a04e4c9
15 changed files with 506 additions and 171 deletions
  1. +1
    -0
      mindspore/ccsrc/CMakeLists.txt
  2. +0
    -5
      mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc
  3. +3
    -3
      mindspore/ccsrc/backend/session/ascend_session.cc
  4. +22
    -14
      mindspore/ccsrc/backend/session/session_basic.cc
  5. +7
    -5
      mindspore/ccsrc/backend/session/session_basic.h
  6. +33
    -11
      mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
  7. +44
    -20
      mindspore/ccsrc/runtime/framework/graph_compiler.cc
  8. +10
    -5
      mindspore/ccsrc/runtime/framework/graph_compiler.h
  9. +5
    -0
      mindspore/ccsrc/runtime/op_builder/CMakeLists.txt
  10. +45
    -0
      mindspore/ccsrc/runtime/op_builder/op_lazy_builder.cc
  11. +120
    -0
      mindspore/ccsrc/runtime/op_builder/op_lazy_builder.h
  12. +189
    -85
      mindspore/ccsrc/vm/backend.cc
  13. +27
    -9
      mindspore/ccsrc/vm/backend.h
  14. +0
    -13
      mindspore/ccsrc/vm/vm.cc
  15. +0
    -1
      mindspore/ccsrc/vm/vm.h

+ 1
- 0
mindspore/ccsrc/CMakeLists.txt View File

@@ -220,6 +220,7 @@ set(SUB_COMP
runtime/device
runtime/framework
runtime/hardware
runtime/op_builder
runtime/hccl_adapter
frontend/optimizer
frontend/parallel


+ 0
- 5
mindspore/ccsrc/backend/kernel_compiler/gpu/data/dataset_iterator_kernel.cc View File

@@ -70,11 +70,6 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
total_bytes_ += bytes;
}

handle_ = GpuBufferMgr::GetInstance().Open(0, queue_name_, output_size_list_);
if (handle_ == HandleMgr::INVALID_HANDLE) {
MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed";
}

#ifndef ENABLE_SECURITY
auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
MS_EXCEPTION_IF_NULL(profiler_inst);


+ 3
- 3
mindspore/ccsrc/backend/session/ascend_session.cc View File

@@ -991,13 +991,13 @@ void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfN
InputTensorInfo input_tensor_info;
GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info);
// Get OpRunInfo and GraphInfo
OpRunInfo op_run_info;
GetSingleOpRunInfo(kernel, &op_run_info);
const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
OpRunInfo op_run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info);
if (op_run_info.is_dynamic_shape) {
MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape.";
break;
}
const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
const auto &single_op_graph_iter = run_op_graphs_.find(graph_info);
if (single_op_graph_iter != run_op_graphs_.end()) {
// if graph of same single op exists, the output tensor of current op should be generated


+ 22
- 14
mindspore/ccsrc/backend/session/session_basic.cc View File

@@ -1218,20 +1218,29 @@ GraphInfo SessionBasic::GetSingleOpGraphInfo(const CNodePtr &kernel,
return graph_info;
}

void SessionBasic::GetSingleOpRunInfo(const CNodePtr cnode, OpRunInfo *run_info) {
OpRunInfo SessionBasic::GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info,
const InputTensorInfo &tensor_info) {
MS_EXCEPTION_IF_NULL(cnode);
MS_EXCEPTION_IF_NULL(run_info);
auto primitive = AnfAlgo::GetCNodePrimitive(cnode);
run_info->primitive = primitive;
run_info->op_name = primitive->name();
const auto &abstract = cnode->abstract();
if (abstract == nullptr) {
MS_LOG(EXCEPTION) << "Abstract is nullptr, node = " << cnode->DebugString();
}
run_info->abstract = abstract;
const auto &shape = abstract->BuildShape();
MS_EXCEPTION_IF_NULL(shape);
run_info->is_dynamic_shape = shape->IsDynamic();

OpRunInfo op_run_info = {.op_name = primitive->name(),
.primitive = primitive.get(),
.abstract = abstract,
.is_dynamic_shape = shape->IsDynamic(),
.is_auto_mixed_precision = false,
.lazy_build = false,
.next_op_name = std::string(),
.next_input_index = 0,
.graph_info = graph_info,
.tensor_mask = tensor_info.input_tensors_mask,
.input_tensors = tensor_info.input_tensors};
return op_run_info;
}

void SessionBasic::GetParameterIndex(const KernelGraph *graph, const std::vector<tensor::TensorPtr> &inputs,
@@ -2143,7 +2152,7 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructSingleOpGraph(const OpRunInf
graph_sum_++;
std::vector<AnfNodePtr> inputs;
// set input[0]
PrimitivePtr op_prim = op_run_info.primitive;
auto op_prim = op_run_info.primitive;
MS_EXCEPTION_IF_NULL(op_prim);
// Decoupling of frontend PrimitivePy and backend Primitive
inputs.push_back(std::make_shared<ValueNode>(std::make_shared<Primitive>(*op_prim)));
@@ -2238,11 +2247,11 @@ void SessionBasic::BuildGraph(GraphId graph_id) {
executor_->BuildGraph(shared_from_this(), graph_id);
}

void SessionBasic::RunOp(OpRunInfo *op_run_info, const GraphInfo &graph_info,
std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
const std::vector<int64_t> &tensors_mask) {
void SessionBasic::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) {
MS_EXCEPTION_IF_NULL(executor_);
executor_->RunOp(shared_from_this(), op_run_info, graph_info, input_tensors, outputs, tensors_mask);
MS_EXCEPTION_IF_NULL(op_run_info);
executor_->RunOp(shared_from_this(), op_run_info, op_run_info->graph_info, &op_run_info->input_tensors, outputs,
op_run_info->tensor_mask);
}

void SessionBasic::RunOpsInGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
@@ -2305,13 +2314,12 @@ void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector<
InputTensorInfo input_tensor_info;
GetOpInputTensors(kernel, op_output_map, parameter_index, inputs, &input_tensor_info);

VectorRef op_outputs;
// Get OpRunInfo and GraphInfo
OpRunInfo run_info;
GetSingleOpRunInfo(kernel, &run_info);
GraphInfo graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
OpRunInfo run_info = GetSingleOpRunInfo(kernel, graph_info, input_tensor_info);

// Build and run current single op
VectorRef op_outputs;
RunOpImplOrigin(graph_info, &run_info, &input_tensor_info.input_tensors, &op_outputs,
input_tensor_info.input_tensors_mask);
graph_output_info.graph_output_tensors.clear();


+ 7
- 5
mindspore/ccsrc/backend/session/session_basic.h View File

@@ -58,17 +58,20 @@ using AnyListPtr = std::shared_ptr<AnyList>;

struct OpRunInfo {
std::string op_name;
PrimitivePtr primitive;
Primitive *primitive;
AbstractBasePtr abstract;
bool is_dynamic_shape = false;
bool is_auto_mixed_precision = false;
bool lazy_build = false;
std::string next_op_name = "";
std::string next_op_name;
#if defined(__APPLE__)
int next_input_index = 0;
#else
size_t next_input_index = 0;
#endif
std::string graph_info;
std::vector<int64_t> tensor_mask;
std::vector<tensor::TensorPtr> input_tensors;
};

struct InputTensorInfo {
@@ -108,8 +111,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
void BuildGraph(GraphId graphId);
void RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs);
void RunGraphAsync(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs);
void RunOp(OpRunInfo *, const GraphInfo &, std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
const std::vector<int64_t> &tensors_mask);
void RunOp(OpRunInfo *, VectorRef *outputs);
void RunOpsInGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs);

#ifndef ENABLE_SECURITY
@@ -276,7 +278,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
CNodePtr ConstructOutput(const AnfNodePtrList &outputs, const std::shared_ptr<KernelGraph> &graph);
// Generate graph info for a single op graph
GraphInfo GetSingleOpGraphInfo(const CNodePtr &kernel, const std::vector<tensor::TensorPtr> &input_tensors);
void GetSingleOpRunInfo(const CNodePtr cnode, OpRunInfo *run_info);
OpRunInfo GetSingleOpRunInfo(const CNodePtr &cnode, const GraphInfo &graph_info, const InputTensorInfo &tensor_info);
tensor::TensorPtr GetValueNodeOutputTensor(const AnfNodePtr &node, size_t output_index);
tensor::TensorPtr GetParameterOutputTensor(const AnfNodePtr &node,
const std::map<AnfNodePtr, size_t> &parameter_index,


+ 33
- 11
mindspore/ccsrc/pipeline/pynative/pynative_execute.cc View File

@@ -751,6 +751,9 @@ void UpdateTensorInfo(const tensor::TensorPtr &new_tensor, const std::vector<ten
pre_tensor->set_device_address(new_tensor->device_address());
continue;
}
if (mind_rt_backend != nullptr) {
mind_rt_backend->SyncLazyTasks();
}
// Replace data in device address when run in CPU device.
if (pre_tensor->device_address() != nullptr) {
auto old_device_address = std::dynamic_pointer_cast<device::DeviceAddress>(pre_tensor->device_address());
@@ -811,6 +814,10 @@ py::object GetDstType(const TypeId &type_id) {
MS_EXCEPTION_IF_NULL(value);
return py::cast(value);
}

bool IsPyObjTypeInvalid(const py::object &obj) {
return !py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj);
}
} // namespace

py::object RealRunOp(const py::args &args) {
@@ -1276,7 +1283,7 @@ void ForwardExecutor::DoSignatureCast(const PrimitivePyPtr &prim,
continue;
}

if (!py::isinstance<tensor::Tensor>(obj) && !py::isinstance<py::int_>(obj) && !py::isinstance<py::float_>(obj)) {
if (IsPyObjTypeInvalid(obj)) {
MS_EXCEPTION(TypeError) << "For '" << prim->name() << "', the " << i << "th input " << signature[i].name
<< " is a not support implicit conversion. "
<< "Its type is " << py::cast<std::string>(obj.attr("__class__").attr("__name__"))
@@ -2013,28 +2020,35 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ
ConvertAttrToUnifyMindIR(op_exec_info);
// get graph info for checking it whether existing in the cache
GetSingleOpGraphInfo(op_exec_info, input_tensors, tensors_mask, &graph_info);
VectorRef outputs;
#if defined(__APPLE__)
session::OpRunInfo op_run_info = {op_exec_info->op_name,
op_exec_info->py_primitive,
op_exec_info->py_primitive.get(),
op_exec_info->abstract,
op_exec_info->is_dynamic_shape,
op_exec_info->is_mixed_precision_cast,
op_exec_info->lazy_build,
op_exec_info->next_op_name,
static_cast<int>(op_exec_info->next_input_index)};
static_cast<int>(op_exec_info->next_input_index),
graph_info,
tensors_mask,
input_tensors};
#else
session::OpRunInfo op_run_info = {op_exec_info->op_name,
op_exec_info->py_primitive,
op_exec_info->py_primitive.get(),
op_exec_info->abstract,
op_exec_info->is_dynamic_shape,
op_exec_info->is_mixed_precision_cast,
op_exec_info->lazy_build,
op_exec_info->next_op_name,
op_exec_info->next_input_index};
op_exec_info->next_input_index,
graph_info,
tensors_mask,
input_tensors};
#endif
VectorRef outputs;
if (!ms_context->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
kSession->RunOp(&op_run_info, graph_info, &input_tensors, &outputs, tensors_mask);
kSession->RunOp(&op_run_info, &outputs);
} else {
if (mind_rt_backend == nullptr) {
const auto &device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
@@ -2043,9 +2057,7 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ
}

mindspore::ScopedLongRunning long_running;
const compile::ActorInfo &actor_info =
mind_rt_backend->CompileGraph(op_run_info, graph_info, &tensors_mask, &input_tensors);
mind_rt_backend->RunGraph(actor_info, &op_run_info, &tensors_mask, &input_tensors, &outputs);
mind_rt_backend->RunOp(&op_run_info, &outputs);
}

if (op_exec_info->is_dynamic_shape) {
@@ -3222,6 +3234,9 @@ void PynativeExecutor::ClearGrad(const py::object &cell, const py::args &args) {
void PynativeExecutor::ClearRes() {
MS_LOG(DEBUG) << "Clear all res";
session::PynativeTaskManager::GetInstance().Reset();
if (mind_rt_backend != nullptr) {
mind_rt_backend->ClearOpBuilderResource();
}
SetLazyBuild(false);
cell_depth_ = 0;

@@ -3279,6 +3294,8 @@ void PynativeExecutor::Sync() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);

ExecuteAllTask();

if (!ms_context->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
if (kSession == nullptr) {
MS_EXCEPTION(NotExistsError) << "No session has been created!";
@@ -3312,7 +3329,12 @@ void PynativeExecutor::ExitCell() {

bool PynativeExecutor::IsTopCell() const { return cell_depth_ == 0; }

void PynativeExecutor::ExecuteAllTask() { session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks(); }
void PynativeExecutor::ExecuteAllTask() {
session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks();
if (mind_rt_backend != nullptr) {
mind_rt_backend->SyncLazyTasks();
}
}

REGISTER_PYBIND_DEFINE(PynativeExecutor_, ([](const py::module *m) {
(void)py::class_<PynativeExecutor, std::shared_ptr<PynativeExecutor>>(*m, "PynativeExecutor_")


+ 44
- 20
mindspore/ccsrc/runtime/framework/graph_compiler.cc View File

@@ -18,7 +18,9 @@
#include <numeric>
#include <map>
#include <utility>
#include <algorithm>
#include "runtime/framework/graph_scheduler.h"
#include "runtime/op_builder/op_lazy_builder.h"
#include "runtime/device/device_address.h"
#include "common/trans.h"
#include "utils/convert_utils.h"
@@ -179,16 +181,16 @@ void CreateKernelOutputDeviceAddress(const DeviceContext *device_context, const
if (AnfAlgo::IsControlOpExecInBackend(kernel)) {
continue;
}
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
auto output_sizes = kernel_mod->GetOutputSizeList();
for (size_t i = 0; i < output_sizes.size(); ++i) {

auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
for (size_t i = 0; i < output_size; ++i) {
if (AnfAlgo::OutputAddrExist(kernel, i)) {
continue;
}
auto output_format = AnfAlgo::GetOutputFormat(kernel, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
auto device_address = device_context->CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
auto address_size = AnfAlgo::GetOutputTensorMemSize(kernel, i);
auto device_address = device_context->CreateDeviceAddress(nullptr, address_size, output_format, output_type);
MS_LOG(DEBUG) << "Create addr for node:" << AnfAlgo::GetNodeDebugString(kernel) << " addr:" << device_address;
AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
}
@@ -465,13 +467,12 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
return graph->graph_id();
}

GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<int64_t> *tensors_mask,
std::vector<TensorPtr> *const input_tensors, bool *single_op_cache_hit,
GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit,
const DeviceContext *device_context) {
// Check if the graph cache exists.
auto iter = run_op_graphs_.find(graph_info);
if (iter != run_op_graphs_.end()) {
auto iter = run_op_graphs_.find(op_run_info.graph_info);
auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance();
if (iter != run_op_graphs_.end() && op_lazy_builder.QueueEmpty()) {
const auto &graph = iter->second;
MS_EXCEPTION_IF_NULL(graph);
*single_op_cache_hit = true;
@@ -480,22 +481,20 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const
*single_op_cache_hit = false;
// Generate kernel graph.
MS_EXCEPTION_IF_NULL(session_);
KernelGraphPtr graph = session_->ConstructSingleOpGraph(op_run_info, *input_tensors, *tensors_mask);
KernelGraphPtr graph =
session_->ConstructSingleOpGraph(op_run_info, op_run_info.input_tensors, op_run_info.tensor_mask);
MS_EXCEPTION_IF_NULL(graph);

MS_EXCEPTION_IF_NULL(device_context);
device_context->OptimizeSingleOpGraph(graph);

// Generate 'KernelMod' for kernel in graph.
device_context->CreateKernel(graph->execution_order());

device_context->PreprocessBeforeRunSingleOpGraph(graph);

// Create device address for all anf nodes of graph.
CreateDeviceAddress(graph, device_context);
CreateDeviceAddressWithoutWorkspace(graph, device_context);

graph->set_is_all_nop_node(opt::IsAllNopNode(graph.get()));
run_op_graphs_[graph_info] = graph;
run_op_graphs_[op_run_info.graph_info] = graph;

auto output_nodes = graph->outputs();
auto &outputs_with_index = run_op_graph_output_nodes_[graph->graph_id()];
@@ -509,6 +508,22 @@ GraphId GraphCompiler::CompileGraph(const session::OpRunInfo &op_run_info, const
return graph->graph_id();
}

void GraphCompiler::BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs,
const DeviceContext *device_context) const {
MS_EXCEPTION_IF_NULL(device_context);
std::vector<CNodePtr> node_to_build;
for (const auto &graph : graphs) {
const auto &nodes = graph->execution_order();
std::copy(nodes.begin(), nodes.end(), std::back_inserter(node_to_build));
}

device_context->CreateKernel(node_to_build);

for (const auto &graph : graphs) {
CreateKernelWorkspaceDeviceAddress(device_context, graph);
}
}

KernelGraphPtr GraphCompiler::Fetch(GraphId graph_id) const {
MS_EXCEPTION_IF_NULL(session_);
return session_->GetGraph(graph_id);
@@ -532,6 +547,14 @@ void GraphCompiler::CreateDeviceAddress(const KernelGraphPtr &graph, const Devic
UpdateDeviceAddressForRefNode(graph);
}

void GraphCompiler::CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph,
const DeviceContext *device_context) const {
CreateParameterDeviceAddress(device_context, graph);
CreateValueNodeDeviceAddress(device_context, graph);
CreateKernelOutputDeviceAddress(device_context, graph);
UpdateDeviceAddressForInplaceNode(graph);
}

void GraphCompiler::GetParamAndOutputIndex(
const KernelGraphPtr &graph, const std::vector<TensorPtr> &inputs, VectorRef *const outputs,
std::map<AnfNodePtr, size_t> *parameter_index,
@@ -560,12 +583,13 @@ TensorPtr GraphCompiler::GetSingleOpInputTensorByIndex(const CNodePtr &kernel,
input_index);
}

void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors,
OpRunInfo *const run_info, GraphInfo *const graph_info) {
void GraphCompiler::GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info,
OpRunInfo *run_info, GraphInfo *graph_info) {
MS_EXCEPTION_IF_NULL(session_);
MS_EXCEPTION_IF_NULL(graph_info);
session_->GetSingleOpRunInfo(kernel, run_info);
*graph_info = session_->GetSingleOpGraphInfo(kernel, input_tensors);

*graph_info = session_->GetSingleOpGraphInfo(kernel, tensor_info.input_tensors);
*run_info = session_->GetSingleOpRunInfo(kernel, *graph_info, tensor_info);
}

void GraphCompiler::CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const {


+ 10
- 5
mindspore/ccsrc/runtime/framework/graph_compiler.h View File

@@ -104,9 +104,11 @@ class GraphCompiler {
GraphId CompileGraph(const FuncGraphPtr &func_graph, const DeviceContext *device_context);

// Construct single op kernel graph and compile the kernel graph in PyNative mode.
GraphId CompileGraph(const session::OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<int64_t> *tensors_mask, std::vector<TensorPtr> *const input_tensors,
bool *single_op_cache_hit, const DeviceContext *device_context);
GraphId CompileGraph(const session::OpRunInfo &op_run_info, bool *single_op_cache_hit,
const DeviceContext *device_context);

// Create kernel and Create workspace for graphs in PyNative mode.
void BuildSingleOpGraphs(const std::vector<KernelGraphPtr> &graphs, const DeviceContext *device_context) const;

// Get graph by graph id, if not exist return nullptr, used in Graph mode.
KernelGraphPtr Fetch(GraphId graph_id) const;
@@ -135,8 +137,8 @@ class GraphCompiler {
InputTensorInfo *const input_tensor_info, size_t input_index);

// Get OpRunInfo and GraphInfo for single op compile and run.
void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const std::vector<TensorPtr> &input_tensors,
OpRunInfo *const run_info, GraphInfo *const graph_info);
void GetSingleOpRunInfoAndGraphInfo(const CNodePtr &kernel, const InputTensorInfo &tensor_info, OpRunInfo *run_info,
GraphInfo *graph_info);

// Calculate ref count of PyNative back propagation operators.
void CalculateRefCount(const KernelGraphPtr &graph, std::map<KernelWithIndex, size_t> *ref_count) const;
@@ -181,6 +183,9 @@ class GraphCompiler {
// Create device address for all anf nodes of graph.
void CreateDeviceAddress(const KernelGraphPtr &graph, const DeviceContext *device_context) const;

// Create device address for input and output of ops.
void CreateDeviceAddressWithoutWorkspace(const KernelGraphPtr &graph, const DeviceContext *device_context) const;

// Single op kernel graph cache for PyNative mode.
std::unordered_map<GraphInfo, KernelGraphPtr> run_op_graphs_;
// Single op kernel graph output nodes cache for PyNative mode.


+ 5
- 0
mindspore/ccsrc/runtime/op_builder/CMakeLists.txt View File

@@ -0,0 +1,5 @@
# Collect the op-builder sources relative to this directory
# (currently only op_lazy_builder.cc).
file(GLOB_RECURSE BUILDER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "op_lazy_builder.cc")

# Tag the sources with the device submodule id used by MindSpore logging.
set_property(SOURCE ${BUILDER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
# Build as an object library; it is linked into the main target elsewhere.
add_library(_mindspore_runtime_op_builder_obj OBJECT ${BUILDER_SRC_LIST})

+ 45
- 0
mindspore/ccsrc/runtime/op_builder/op_lazy_builder.cc View File

@@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "runtime/op_builder/op_lazy_builder.h"

namespace mindspore::runtime {
// Install the callback that flushes queued tasks and mark the builder as
// registered.
void OpLazyBuilder::Register(const std::function<void()> &callback) {
  execute_callback_ = callback;
  registered_ = true;
}

// Drop every queued task, then forget the registered callback.
void OpLazyBuilder::Reset() {
  ClearAllResources();
  execute_callback_ = nullptr;
  registered_ = false;
}

// Discard all pending build tasks and all pending run tasks.
void OpLazyBuilder::ClearAllResources() {
  op_build_tasks.clear();
  op_run_tasks = std::queue<std::shared_ptr<OpTask>>();
}

// Flush outstanding tasks through the registered callback. The ExecuteGuard
// sets `executing_` for the duration of the call, so a re-entrant invocation
// (e.g. triggered from inside the callback) returns immediately.
void OpLazyBuilder::ExecuteRemainingTasks() {
  if (executing_) {
    return;
  }
  ExecuteGuard guard;
  if (execute_callback_ != nullptr) {
    execute_callback_();
  }
}
}  // namespace mindspore::runtime

+ 120
- 0
mindspore/ccsrc/runtime/op_builder/op_lazy_builder.h View File

@@ -0,0 +1,120 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_

#include <functional>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <utility>
#include <vector>
#include "backend/session/kernel_graph.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/hardware/device_context.h"
#include "runtime/framework/graph_scheduler.h"

namespace mindspore::runtime {
// Immutable bundle of everything a deferred single-op task needs later:
// the graph compiler info, the kernel graph, its output nodes, the op run
// info and the device context.
class OpLazyBuilderContext {
 public:
  // NOTE(review): graph_compiler_info and device_context are raw, non-owning
  // pointers — the caller must keep them alive for the task's lifetime.
  // graph and output_nodes are taken by value and moved in; op_run_info is
  // copied into the context.
  OpLazyBuilderContext(GraphCompilerInfo *graph_compiler_info, KernelGraphPtr graph,
                       std::vector<session::KernelWithIndex> output_nodes, const session::OpRunInfo &op_run_info,
                       device::DeviceContext *device_context)
      : graph_compiler_info_(graph_compiler_info),
        graph_(std::move(graph)),
        output_nodes_(std::move(output_nodes)),
        op_run_info_(op_run_info),
        device_context_(device_context) {}
  ~OpLazyBuilderContext() = default;

  // Read-only accessors for the captured state.
  GraphCompilerInfo *graph_compiler_info() const { return graph_compiler_info_; }
  const KernelGraphPtr &graph() const { return graph_; }
  const std::vector<session::KernelWithIndex> &output_nodes() const { return output_nodes_; }
  const session::OpRunInfo &op_run_info() const { return op_run_info_; }
  device::DeviceContext *device_context() const { return device_context_; }

 private:
  GraphCompilerInfo *graph_compiler_info_;
  KernelGraphPtr graph_;
  std::vector<session::KernelWithIndex> output_nodes_;
  session::OpRunInfo op_run_info_;  // Stored by value (copy of the caller's run info).
  device::DeviceContext *device_context_;
};

// Base class for a deferred single-op task; it only owns the shared context
// describing which graph/op the task refers to. Concrete task kinds
// (build/run) derive from it.
class OpTask {
 public:
  explicit OpTask(std::shared_ptr<OpLazyBuilderContext> context) : context_(std::move(context)) {}
  virtual ~OpTask() = default;
  // Accessor for the task's context (shared with other tasks for the same op).
  const std::shared_ptr<OpLazyBuilderContext> &context() { return context_; }

 protected:
  std::shared_ptr<OpLazyBuilderContext> context_;
};

// Marker type: a deferred kernel-build step for a single-op graph.
// Carries no state beyond the shared OpLazyBuilderContext.
class OpBuildTask : public OpTask {
 public:
  explicit OpBuildTask(std::shared_ptr<OpLazyBuilderContext> context) : OpTask(std::move(context)) {}
  ~OpBuildTask() override = default;
};

// Marker type: a deferred launch/run step for a single-op graph.
// Carries no state beyond the shared OpLazyBuilderContext.
class OpRunTask : public OpTask {
 public:
  explicit OpRunTask(std::shared_ptr<OpLazyBuilderContext> context) : OpTask(std::move(context)) {}
  ~OpRunTask() override = default;
};

// Singleton holding the queues of deferred single-op build and run tasks
// used by PyNative lazy build: ops are queued here and flushed in batches
// through a registered callback instead of being built/launched one by one.
// NOTE(review): not synchronized — assumes single-threaded access; confirm
// against callers before using from multiple threads.
class OpLazyBuilder {
 public:
  static OpLazyBuilder &GetInstance() {
    static OpLazyBuilder instance;
    return instance;
  }

  // RAII flag: marks the builder as "executing" for the guard's lifetime so
  // ExecuteRemainingTasks() does not re-enter while the callback is running.
  class ExecuteGuard {
   public:
    ExecuteGuard() { OpLazyBuilder::GetInstance().executing_ = true; }
    ~ExecuteGuard() { OpLazyBuilder::GetInstance().executing_ = false; }
  };

  // Install the callback used to flush the queues (defined in the .cc file).
  void Register(const std::function<void()> &callback);
  const std::vector<std::shared_ptr<OpTask>> &GetOpBuildTasks() const { return op_build_tasks; }
  const std::queue<std::shared_ptr<OpTask>> &GetOpRunTasks() const { return op_run_tasks; }
  void ClearOpBuildTasks() { op_build_tasks.clear(); }
  // Clear queues and drop the registered callback (defined in the .cc file).
  void Reset();
  // Drop all queued build and run tasks (defined in the .cc file).
  void ClearAllResources();
  // Invoke the registered callback unless it is already executing.
  void ExecuteRemainingTasks();

  void PushOpBuildTask(const std::shared_ptr<OpTask> &op_build_task) { op_build_tasks.push_back(op_build_task); }
  void PushOpRunTask(const std::shared_ptr<OpTask> &op_run_task) { op_run_tasks.push(op_run_task); }
  void PopOpRunTask() { op_run_tasks.pop(); }
  bool QueueEmpty() const { return op_run_tasks.empty() && op_build_tasks.empty(); }
  // "Full" only once a queue strictly exceeds kMaxQueueSize elements.
  bool QueueFull() const { return op_build_tasks.size() > kMaxQueueSize || op_run_tasks.size() > kMaxQueueSize; }
  bool registered() const { return registered_; }

 private:
  OpLazyBuilder() = default;
  ~OpLazyBuilder() = default;
  DISABLE_COPY_AND_ASSIGN(OpLazyBuilder);
  std::vector<std::shared_ptr<OpTask>> op_build_tasks;
  std::queue<std::shared_ptr<OpTask>> op_run_tasks;
  std::function<void()> execute_callback_{nullptr};
  // Fix: the queue-size threshold is a true compile-time constant — declare
  // it `static constexpr` instead of a mutable `inline static` variable.
  static constexpr size_t kMaxQueueSize = 100;
  bool executing_{false};
  bool registered_{false};
};
} // namespace mindspore::runtime
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_OP_BUILDER_OP_LAZY_BUILDER_H_

+ 189
- 85
mindspore/ccsrc/vm/backend.cc View File

@@ -21,6 +21,7 @@

#include "vm/transform.h"
#include "backend/session/session_factory.h"
#include "runtime/op_builder/op_lazy_builder.h"
#include "backend/optimizer/common/helper.h"
#include "pipeline/pynative/pynative_execute.h"
#include "pipeline/jit/parse/data_converter.h"
@@ -207,6 +208,8 @@ TensorPtr CreateOutputTensor(const AnfNodePtr &output_node, size_t output_index)
MS_EXCEPTION_IF_NULL(device_tensor);
tensor->set_device_address(device_tensor);

// MindRT is disabled in the multi graphs scenario
// Delete tensor->data_sync() when MindRT is enabled in all scenes.
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
@@ -490,45 +493,6 @@ void MindRTBackend::CompileGraph(const GraphSegmentPtr &segment, bool contain_mu
}
}

const ActorInfo &MindRTBackend::CompileGraph(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<int64_t> *tensors_mask,
std::vector<tensor::TensorPtr> *input_tensors) {
MS_EXCEPTION_IF_NULL(graph_compiler_);
// Get the device context.
const auto &device_context =
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
MS_EXCEPTION_IF_NULL(device_context);
device_context->Initialize();

bool single_op_cache_hit = true;
auto graph_id = graph_compiler_->CompileGraph(op_run_info, graph_info, tensors_mask, input_tensors,
&single_op_cache_hit, device_context);
// The actor set name: graph_id + single operator name.
std::string actor_info = std::to_string(graph_id) + "_" + op_run_info.op_name;
if (single_op_cache_hit) {
auto iter = actor_to_graph_compiler_info_.find(actor_info);
if (iter == actor_to_graph_compiler_info_.end()) {
MS_LOG(EXCEPTION) << "Can not find graph compiler info for actor set: " << actor_info;
}
return iter->first;
}

graph_info_to_device_context_.clear();
graph_info_to_device_context_[graph_info] = device_context;

auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
auto graph_compiler_info = ConstructGraphCompilerInfo(actor_info, tensors_mask, input_tensors, !enable_cache);
const auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*graph_compiler_info);
runtime::GraphScheduler::GetInstance().Schedule(actor_set);
MS_EXCEPTION_IF_NULL(graph_compiler_info);
graph_compiler_info->input_tensors_.clear();

auto ret = actor_to_graph_compiler_info_.emplace(actor_info, std::move(graph_compiler_info));
return ret.first->first;
}

namespace {
void GetControlOpInput(const std::shared_ptr<GraphCompiler> &graph_compiler, const CNodePtr &front_cnode,
const CNodePtr &backend_cnode, const std::map<KernelWithIndex, tensor::TensorPtr> &op_output_map,
@@ -776,6 +740,7 @@ void PushTupleTensor(const VectorRef &args, const std::vector<AnfNodePtr> &param

void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,
const std::vector<std::vector<tensor::TensorPtr>> &inputs, VectorRef *outputs) {
runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks();
MS_EXCEPTION_IF_NULL(graph_compiler_);
for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
const auto &graph = graphs[graph_index];
@@ -810,16 +775,14 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
GraphInfo graph_info;
graph_compiler_->GetSingleOpInputTensors(kernel, op_output_map, parameter_index, inputs[graph_index],
&input_tensor_info);
graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info.input_tensors, &op_run_info,
&graph_info);
graph_compiler_->GetSingleOpRunInfoAndGraphInfo(kernel, input_tensor_info, &op_run_info, &graph_info);

const ActorInfo &actor_info = CompileGraph(op_run_info, graph_info, &input_tensor_info.input_tensors_mask,
&input_tensor_info.input_tensors);
RunGraph(actor_info, &op_run_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors,
&op_outputs);
RunOp(&op_run_info, &op_outputs);
} else {
RunControlOperator(graph_compiler_, graph, kernel, op_output_map, parameter_index, inputs[graph_index],
&input_tensor_info, &op_outputs);
// Execute remaining lazy tasks before PyNative hook exit.
runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks();
}

graph_compiler_->UpdateRefCount(input_tensor_info.input_kernel, &cnode_ref_count, &op_output_map);
@@ -999,6 +962,10 @@ void MindRTBackend::SetDebuggerInit() {
}
#endif

void MindRTBackend::SyncLazyTasks() const { runtime::OpLazyBuilder::GetInstance().ExecuteRemainingTasks(); }

void MindRTBackend::ClearOpBuilderResource() const { runtime::OpLazyBuilder::GetInstance().Reset(); }

std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(const FuncGraphPtr &root_graph) {
MS_EXCEPTION_IF_NULL(root_graph);
MS_EXCEPTION_IF_NULL(graph_compiler_);
@@ -1092,22 +1059,21 @@ void MindRTBackend::EraseSingleOpCache(const ActorInfo &actor_info, const Kernel
actor_to_graph_compiler_info_.erase(actor_info);
}

void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info,
const std::vector<int64_t> *tensors_mask,
const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs) {
MS_EXCEPTION_IF_NULL(input_tensors);
MS_EXCEPTION_IF_NULL(op_run_info);
MS_EXCEPTION_IF_NULL(tensors_mask);

void MindRTBackend::RunSingleOpGraph(const KernelGraphPtr &graph,
const std::vector<session::KernelWithIndex> &output_nodes,
const OpRunInfo &op_run_info, const GraphCompilerInfo *graph_compiler_info,
DeviceContext *device_context) {
// Erase value node tensor.
std::vector<tensor::TensorPtr> tensors_without_value_node;
if (input_tensors->size() != tensors_mask->size()) {
MS_LOG(EXCEPTION) << "Input tensors size " << input_tensors->size() << " should be equal to tensors mask size "
<< tensors_mask->size();
}
for (size_t index = 0; index < tensors_mask->size(); ++index) {
if (tensors_mask->at(index) != kValueNodeTensorMask) {
(void)tensors_without_value_node.emplace_back(input_tensors->at(index));
const auto &input_tensors = op_run_info.input_tensors;
const auto &tensors_mask = op_run_info.tensor_mask;
if (input_tensors.size() != tensors_mask.size()) {
MS_LOG(EXCEPTION) << "Input tensors size " << input_tensors.size() << " should be equal to tensors mask size "
<< tensors_mask.size();
}
for (size_t index = 0; index < tensors_mask.size(); ++index) {
if (tensors_mask.at(index) != kValueNodeTensorMask) {
(void)tensors_without_value_node.emplace_back(input_tensors.at(index));
}
}

@@ -1119,30 +1085,11 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info
}

// Run actor DAG.
const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(actor_info);
const auto &actor_set = runtime::GraphScheduler::GetInstance().Fetch(graph_compiler_info->name_);
MS_EXCEPTION_IF_NULL(actor_set);
runtime::GraphScheduler::GetInstance().Run(actor_set, {}, {tensors_without_value_node}, *input_tensors,
runtime::GraphScheduler::GetInstance().Run(actor_set, {}, {tensors_without_value_node}, input_tensors,
runtime::GraphExecutionStrategy::kStep);

// Fetch outputs.
const auto &graph_iter = actor_to_graph_compiler_info_.find(actor_info);
if (graph_iter == actor_to_graph_compiler_info_.end()) {
MS_LOG(EXCEPTION) << "Can't find the graph compiler info.";
}
MS_EXCEPTION_IF_NULL(graph_iter->second);
const auto &graph_compiler_info = *(graph_iter->second);
const auto &graph = graph_compiler_info.graphs_.front();
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(graph_compiler_);
const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id());
MS_EXCEPTION_IF_NULL(outputs);
UpdateOutput(output_nodes, outputs);

// Update output abstract of dynamic op to op_run_info
if (op_run_info->is_dynamic_shape) {
UpdateOutputAbstract(graph, op_run_info);
}

// Release the kernel resource.
const auto &kernels = graph->execution_order();
for (const auto &kernel : kernels) {
@@ -1154,14 +1101,171 @@ void MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info
}
}
}
}

// Batch-compile the kernel graphs gathered from the saved OpBuildTasks, then
// transform each graph's compiler info into an actor set and schedule it.
void MindRTBackend::CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks) {
  if (build_tasks.empty()) {
    return;
  }

  // Collect every task's kernel graph and compiler info so all kernels build together.
  std::vector<KernelGraphPtr> kernel_graphs;
  std::vector<GraphCompilerInfo *> compiler_infos;
  for (const auto &build_task : build_tasks) {
    MS_EXCEPTION_IF_NULL(build_task);
    const auto &task_ctx = build_task->context();
    MS_EXCEPTION_IF_NULL(task_ctx);
    (void)kernel_graphs.emplace_back(task_ctx->graph());
    (void)compiler_infos.emplace_back(task_ctx->graph_compiler_info());
  }

  // The first task's device context is used for the whole batch.
  MS_EXCEPTION_IF_NULL(build_tasks[0]);
  const auto &first_ctx = build_tasks[0]->context();
  MS_EXCEPTION_IF_NULL(first_ctx);
  graph_compiler_->BuildSingleOpGraphs(kernel_graphs, first_ctx->device_context());

  // Turn each compiled graph into a scheduled actor DAG.
  for (auto *compiler_info : compiler_infos) {
    MS_EXCEPTION_IF_NULL(compiler_info);
    auto actor_set = runtime::GraphScheduler::GetInstance().Transform(*compiler_info);
    compiler_info->input_tensors_.clear();
    runtime::GraphScheduler::GetInstance().Schedule(actor_set);
  }
}

void MindRTBackend::LazyExecuteTaskCallback() {
auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance();
if (op_lazy_builder.QueueEmpty()) {
return;
}

try {
MS_LOG(DEBUG) << "Start";
auto ms_context = MsContext::GetInstance();
auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, true);

CompileSingleOpGraphs(op_lazy_builder.GetOpBuildTasks());
op_lazy_builder.ClearOpBuildTasks();

// Run op one by one
auto &op_run_tasks = op_lazy_builder.GetOpRunTasks();
while (!op_run_tasks.empty()) {
auto &op_run_task = op_run_tasks.front();
const auto &context = op_run_task->context();
RunSingleOpGraph(context->graph(), context->output_nodes(), context->op_run_info(),
context->graph_compiler_info(), context->device_context());
UpdateOutputDeviceAddress(context->output_nodes(), context->device_context());
UpdateInputDeviceAddress(context->graph());

op_lazy_builder.PopOpRunTask();
}

ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, infer_flag);
MS_LOG(DEBUG) << "End";
} catch (const std::exception &ex) {
op_lazy_builder.Reset();
throw(std::runtime_error(ex.what()));
} catch (...) {
op_lazy_builder.Reset();
std::string exName(abi::__cxa_current_exception_type()->name());
MS_LOG(EXCEPTION) << "Error occurred when execute task in queue. Exception name: " << exName;
}
}

// Run a single op immediately, or defer it as OpBuildTask/OpRunTask in the
// OpLazyBuilder queue. Lazy build is bypassed when the cache entry must be erased,
// when the cache hit and the queue is already drained, or when the op opted out.
// Fix: removed stale diff-residue lines that preceded the signature and an
// orphaned `if (graph_compiler_info.need_erase_) { EraseSingleOpCache(actor_info, graph);`
// fragment referencing the out-of-scope `actor_info` — both broke compilation.
void MindRTBackend::RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info,
                                  OpRunInfo *op_run_info, VectorRef *outputs) {
  MS_EXCEPTION_IF_NULL(op_run_info);
  MS_EXCEPTION_IF_NULL(graph_compiler_info);
  // Fetch outputs.
  const auto &graph = graph_compiler_info->graphs_.front();
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(graph_compiler_);
  const auto &output_nodes = graph_compiler_->GetGraphOutputNodes(graph->graph_id());
  MS_EXCEPTION_IF_NULL(outputs);

  auto device_context = graph_compiler_info->device_contexts_.front();
  auto &op_lazy_builder = runtime::OpLazyBuilder::GetInstance();
  bool lazy_build_disabled = graph_compiler_info->need_erase_ ||
                             (single_op_cache_hit && op_lazy_builder.QueueEmpty()) || !op_run_info->lazy_build;
  if (lazy_build_disabled) {
    // Synchronous path: compile (on cache miss), run, and publish outputs now.
    if (!single_op_cache_hit) {
      CompileSingleOpGraph(graph, device_context, graph_compiler_info);
    }
    RunSingleOpGraph(graph, output_nodes, *op_run_info, graph_compiler_info, device_context);
    UpdateOutput(output_nodes, outputs);
    UpdateOutputDeviceAddress(output_nodes, device_context);
    UpdateInputDeviceAddress(graph);
    if (op_run_info->is_dynamic_shape) {
      UpdateOutputAbstract(graph, op_run_info);
    }
    if (graph_compiler_info->need_erase_) {
      EraseSingleOpCache(graph_compiler_info->name_, graph);
    }
  } else {
    // Lazy path: publish output placeholders, queue build/run tasks, and flush
    // the queue once it is full.
    UpdateOutput(output_nodes, outputs);
    auto run_op_context = std::make_shared<runtime::OpLazyBuilderContext>(
      graph_compiler_info, graph, output_nodes, *op_run_info, graph_compiler_info->device_contexts_.front());
    if (!single_op_cache_hit) {
      op_lazy_builder.PushOpBuildTask(std::make_shared<runtime::OpBuildTask>(run_op_context));
    }
    op_lazy_builder.PushOpRunTask(std::make_shared<runtime::OpRunTask>(run_op_context));
    if (!op_lazy_builder.registered()) {
      op_lazy_builder.Register([this]() { LazyExecuteTaskCallback(); });
    }
    if (op_lazy_builder.QueueFull()) {
      op_lazy_builder.ExecuteRemainingTasks();
    }
  }
}

// Entry point for running a single op in PyNative mode: compiles (or fetches the
// cached) single-op kernel graph, resolves its GraphCompilerInfo, and dispatches
// to RunOpInternal.
// Fix: the original took `.get()` on the locally built GraphCompilerInfo before
// `try_emplace`; when the key already existed the unique_ptr was NOT moved into
// the map and was destroyed at scope end, leaving a dangling pointer. The pointer
// is now taken from the map iterator, which is valid in both cases.
void MindRTBackend::RunOp(OpRunInfo *op_run_info, VectorRef *outputs) {
  MS_EXCEPTION_IF_NULL(op_run_info);
  MS_EXCEPTION_IF_NULL(graph_compiler_);
  // Get the device context.
  const auto &device_context =
    device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
  MS_EXCEPTION_IF_NULL(device_context);
  device_context->Initialize();

  bool single_op_cache_hit = true;
  auto graph_id = graph_compiler_->CompileGraph(*op_run_info, &single_op_cache_hit, device_context);
  std::string actor_info = std::to_string(graph_id) + "_" + op_run_info->op_name;
  GraphCompilerInfo *graph_compiler_info_ptr = nullptr;
  if (single_op_cache_hit) {
    auto iter = actor_to_graph_compiler_info_.find(actor_info);
    if (iter == actor_to_graph_compiler_info_.end()) {
      MS_LOG(EXCEPTION) << "Can not find graph compiler info for actor set: " << actor_info;
    }
    graph_compiler_info_ptr = iter->second.get();
  } else {
    graph_info_to_device_context_.clear();
    graph_info_to_device_context_[op_run_info->graph_info] = device_context;
    auto context_ptr = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(context_ptr);
    bool enable_cache = context_ptr->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
    auto graph_compiler_info =
      ConstructGraphCompilerInfo(actor_info, &op_run_info->tensor_mask, &op_run_info->input_tensors, !enable_cache);

    auto ret = actor_to_graph_compiler_info_.try_emplace(actor_info, std::move(graph_compiler_info));
    if (!ret.second) {
      MS_LOG(WARNING) << "ActorInfo:" << actor_info << " already exist in the map.";
    }
    // Point at the map-owned object so the pointer stays valid even when the
    // key already existed (insertion skipped, local unique_ptr destroyed).
    graph_compiler_info_ptr = ret.first->second.get();
  }

  RunOpInternal(single_op_cache_hit, graph_compiler_info_ptr, op_run_info, outputs);
}

// Compile one single-op kernel graph and bring up its actor set: build kernels,
// transform the compiler info into actors, then schedule them.
void MindRTBackend::CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context,
                                         GraphCompilerInfo *graph_compiler_info) const {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(device_context);
  graph_compiler_->BuildSingleOpGraphs({graph}, device_context);

  MS_EXCEPTION_IF_NULL(graph_compiler_info);
  auto &scheduler = runtime::GraphScheduler::GetInstance();
  auto actor_set = scheduler.Transform(*graph_compiler_info);
  graph_compiler_info->input_tensors_.clear();
  // Actor::Init() runs inside Schedule, and the workspace must already exist by
  // then, so Schedule has to execute after CreateKernelWorkspaceDeviceAddress.
  scheduler.Schedule(actor_set);
}
} // namespace compile
} // namespace mindspore

+ 27
- 9
mindspore/ccsrc/vm/backend.h View File

@@ -32,6 +32,7 @@
#include "backend/session/session_basic.h"
#include "runtime/hardware/device_context.h"
#include "runtime/framework/graph_scheduler.h"
#include "runtime/op_builder/op_lazy_builder.h"

namespace mindspore {
namespace compile {
@@ -108,21 +109,19 @@ class MindRTBackend : public Backend {
// all sub graphs to call CompileGraph.
const ActorInfo &CompileGraphs(const FuncGraphPtr &root_graph);

// Compile single op kernel graph in the pyNative mode.
const ActorInfo &CompileGraph(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<int64_t> *tensors_mask,
std::vector<tensor::TensorPtr> *input_tensors);

// Run Graph in the graph mode.
void RunGraph(const ActorInfo &actor_info, const VectorRef &args, VectorRef *outputs);

// Run Graph in the pyNative mode.
void RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
const std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs);
// Run single op in the PyNative mode.
void RunOp(OpRunInfo *op_run_info, VectorRef *outputs);
#ifdef ENABLE_DEBUGGER
void SetDebuggerInit();
#endif

// Execute all tasks in queue when lazy build is enabled in PyNative mode.
void SyncLazyTasks() const;
// Clear resource when python exit.
void ClearOpBuilderResource() const;

private:
// The parameter func_graph is a graph, it can be either a root graph or a sub graph,
// The result of graph compiler is stored in graph_id_to_device_context_ and control_nodes_.
@@ -132,6 +131,13 @@ class MindRTBackend : public Backend {
// Compile the kernel graph by the segment which is from the function graph partition.
void CompileGraph(const GraphSegmentPtr &segment, bool contain_multi_target);

// CreateKernel, Transform and Schedule have not been finished when LazyBuild is enabled in PyNative mode.
void CompileSingleOpGraph(const KernelGraphPtr &graph, const DeviceContext *device_context,
GraphCompilerInfo *graph_compiler_info) const;

// Get saved OpBuildTask in OpLazyBuilder and build all the kernels together in PyNative mode.
void CompileSingleOpGraphs(const std::vector<std::shared_ptr<runtime::OpTask>> &build_tasks);

// Restore the outputs tuple by the origin funcGraph output node and output tensors.
void ConstructOutputs(const AnfNodePtr &output_node, const std::vector<tensor::TensorPtr> &output_tensors,
size_t *output_position, VectorRef *outputs);
@@ -149,6 +155,18 @@ class MindRTBackend : public Backend {
// so the latest single op cache should be erased when cache list size exceeds threshold value.
void EraseSingleOpCache(const ActorInfo &actor_info, const KernelGraphPtr &graph);

// Run op immediately when the single_op_cache hit and the queue of OpLazyBuilder is empty in PyNative mode.
void RunSingleOpGraph(const KernelGraphPtr &graph, const std::vector<session::KernelWithIndex> &output_nodes,
const OpRunInfo &op_run_info, const GraphCompilerInfo *graph_compiler_info,
DeviceContext *device_context);

// Execute OpBuildTask and OpRunTask when the OpLazyBuilder queue is full in PyNative mode.
void LazyExecuteTaskCallback();

// Run op immediately or save OpBuildTask and OpRunTask in OpLazyBuilder.
void RunOpInternal(bool single_op_cache_hit, GraphCompilerInfo *graph_compiler_info, OpRunInfo *op_run_info,
VectorRef *outputs);

// Split complete kernel graph to single op graph in PyNative back
// propagation, then compile and run single op graph.
void RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs,


+ 0
- 13
mindspore/ccsrc/vm/vm.cc View File

@@ -482,18 +482,5 @@ void FinalVM::InstPushPrim(const VectorRef &args) {

MS_LOG(DEBUG) << "End";
}

// Recursively walk `arg`: tuples are traversed element by element, and any
// tensor encountered has its data synchronized via data_sync().
void FinalVM::SyncData(const py::object &arg) {
  if (py::isinstance<py::tuple>(arg)) {
    auto elements = py::cast<py::tuple>(arg);
    for (size_t idx = 0; idx < elements.size(); ++idx) {
      SyncData(elements[idx]);
    }
  }
  if (py::isinstance<tensor::Tensor>(arg)) {
    auto tensor_ptr = py::cast<tensor::TensorPtr>(arg);
    tensor_ptr->data_sync();
  }
}
} // namespace compile
} // namespace mindspore

+ 0
- 1
mindspore/ccsrc/vm/vm.h View File

@@ -130,7 +130,6 @@ class FinalVM {
void Pushsp();
void Popsp();
void DoJmp(const BaseRef &jmp);
void SyncData(const py::object &args);

private:
InstSet insts_;


Loading…
Cancel
Save