Browse Source

reconstruct session code

pull/14965/head
kswang 4 years ago
parent
commit
2a48b2ecb8
9 changed files with 275 additions and 269 deletions
  1. +187
    -79
      mindspore/ccsrc/backend/session/ascend_session.cc
  2. +7
    -1
      mindspore/ccsrc/backend/session/ascend_session.h
  3. +11
    -24
      mindspore/ccsrc/backend/session/cpu_session.cc
  4. +5
    -1
      mindspore/ccsrc/backend/session/cpu_session.h
  5. +2
    -1
      mindspore/ccsrc/backend/session/executor.cc
  6. +33
    -34
      mindspore/ccsrc/backend/session/gpu_session.cc
  7. +5
    -3
      mindspore/ccsrc/backend/session/gpu_session.h
  8. +17
    -122
      mindspore/ccsrc/backend/session/session_basic.cc
  9. +8
    -4
      mindspore/ccsrc/backend/session/session_basic.h

+ 187
- 79
mindspore/ccsrc/backend/session/ascend_session.cc View File

@@ -59,6 +59,7 @@
#include "debug/data_dump/e2e_dump.h"
#include "debug/anf_ir_dump.h"
#include "debug/dump_proto.h"
#include "abstract/utils.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#else
@@ -209,69 +210,6 @@ void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr
(*op_output_info)[kernel_with_index] = output_tensor_info;
}
}
} // namespace

void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); }

void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id()));
}
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
unify_mindir_pm->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
unify_mindir_pm->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
unify_mindir_pm->AddPass(std::make_shared<opt::MaxPool2MaxPoolWithArgmax>());
unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolWithArgmaxUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolGradWithArgmaxUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropInputUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropFilterUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::SliceGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::AvgPoolGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::FtrlUnifyOutput>());
unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutAndDropoutGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR0>());
unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
unify_mindir_pm->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
} else {
unify_mindir_pm->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
}
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());

optimizer->AddPassManager(unify_mindir_pm);
(void)optimizer->Optimize(graph);
graph->SetExecOrderByDefault();
if (save_graphs) {
std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
}
}

GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
MS_LOG(INFO) << "Start";
// construct graph, if successfully, graph_sum_ + 1
auto graph = ConstructKernelGraph(lst, outputs);
auto graph_id = graph->graph_id();
InitAllBucket(graph);
MS_LOG(INFO) << "Compile graph " << graph_id << " success";
return graph_id;
}

bool IsBackward(const CNodePtr &cnode) {
auto prim = GetValueNode<PrimitivePtr>(cnode->input(0));
@@ -357,6 +295,183 @@ void ReorderSendRecv(std::vector<CNodePtr> *execution_order) {
}
}

// Resets the graph's control-input tensors for a new iteration, appends them
// to *inputs, and returns how many control tensors were loaded (0 when the
// graph carries none). Fixed layout of input_ctrl_tensors():
//   [0] current loop count, [1] next loop count, [2] epoch counter.
size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Load kInputCtrlTensors";
auto inputs_params = graph->input_ctrl_tensors();
if (inputs_params == nullptr) {
return 0;
}
// The three slots above are mandatory; fewer means the graph is malformed.
if (inputs_params->size() < 3) {
MS_LOG(EXCEPTION) << "Illegal inputs_params size";
}
// update current loop tensor to 0 per iterator
auto cur_loop_tensor = (*inputs_params)[0];
MS_EXCEPTION_IF_NULL(cur_loop_tensor);
auto *cur_val = static_cast<int32_t *>(cur_loop_tensor->data_c());
MS_EXCEPTION_IF_NULL(cur_val);
*cur_val = 0;
cur_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
// set loop_count to zero
MS_EXCEPTION_IF_NULL(inputs);
inputs->push_back(cur_loop_tensor);

// update next loop tensor to 0 per iterator
auto next_loop_tensor = (*inputs_params)[1];
MS_EXCEPTION_IF_NULL(next_loop_tensor);
auto *next_val = static_cast<int32_t *>(next_loop_tensor->data_c());
MS_EXCEPTION_IF_NULL(next_val);
*next_val = 0;
next_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
// set loop_count to zero
MS_EXCEPTION_IF_NULL(inputs);
inputs->push_back(next_loop_tensor);

// The epoch tensor carries the graph's current epoch to the device; the
// host-side counter is then advanced for the next run.
auto epoch_tensor = (*inputs_params)[2];
MS_EXCEPTION_IF_NULL(epoch_tensor);
auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
MS_EXCEPTION_IF_NULL(epoch_val);
*epoch_val = graph->current_epoch();
epoch_tensor->set_sync_status(kNeedSyncHostToDevice);
inputs->push_back(epoch_tensor);
MS_LOG(INFO) << "Load epoch_val:" << *epoch_val;
graph->set_current_epoch(graph->current_epoch() + 1);
return inputs_params->size();
}

// Returns true when the host tensor's data must be copied to the device
// memory backing `parameter` (output 0) before the graph runs.
bool TensorNeedSync(const AnfNodePtr &parameter, const tensor::TensorPtr &tensor) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
// Pynative-infer mode: sync whenever the tensor has no device address or its
// address is not the parameter's address.
if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
}
if (tensor->NeedSyncHostToDevice()) {
return true;
}
auto tensor_address = tensor->device_address();
if (tensor_address != device_address) {
// Addresses differ: refresh host-side data first (data_sync(false) —
// presumably a device-to-host sync without blocking flag; confirm its
// semantics in the Tensor API), then report a sync is needed.
tensor->data_sync(false);
return true;
}
return false;
}
} // namespace

void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); }

// Runs the "unify MindIR" pass pipeline on `graph`, rewriting front-end ops
// into forms the Ascend backend expects. When the save-graphs context flag is
// set, the IR is dumped before and after the pipeline for debugging.
void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id()));
}
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
unify_mindir_pm->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
unify_mindir_pm->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
unify_mindir_pm->AddPass(std::make_shared<opt::MaxPool2MaxPoolWithArgmax>());
unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolWithArgmaxUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolGradWithArgmaxUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropInputUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropFilterUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::SliceGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::AvgPoolGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::FtrlUnifyOutput>());
unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
// Graph mode and pynative mode need different dropout / sparse-softmax
// unification passes, so the selection branches on the execution mode.
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutAndDropoutGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR0>());
unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
unify_mindir_pm->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
} else {
unify_mindir_pm->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
}
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
unify_mindir_pm->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());
unify_mindir_pm->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());

optimizer->AddPassManager(unify_mindir_pm);
(void)optimizer->Optimize(graph);
// The passes may add/remove nodes, so refresh the default execution order.
graph->SetExecOrderByDefault();
if (save_graphs) {
std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
DumpIR(file_name, graph);
}
}

// Copies the host input tensors into the device memory of the graph's input
// parameters before a run. Also loads the loop/epoch control tensors when the
// graph has them, resizes dynamic-shape parameters to the actual tensor
// shapes, and skips parameters cached by the parameter server.
void AscendSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) const {
std::vector<tensor::TensorPtr> inputs(inputs_const);
// Defaults to 3 so the "- 3" in the count check below cancels out when the
// graph carries no control tensors.
size_t input_ctrl_size = 3;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto &input_nodes = kernel_graph->input_nodes();
auto extra_param_size = kernel_graph->GetExtraParamAndTensor().size();
// Sanity check: tensors provided (plus any extra control tensors beyond the
// standard 3) must match the graph's input nodes minus extra params.
if ((inputs.size() + input_ctrl_size) - 3 != input_nodes.size() - extra_param_size) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size << ", extra_param_size:" << extra_param_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = inputs[i];
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
auto size = LongToSize(tensor->data().nbytes());
// Parameters consumed by dynamic-shape kernels: push the actual tensor
// shape into the node's inferred shape and recompute the byte size.
if (input_node->isa<Parameter>() && input_node->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
auto tensor_shape = tensor->shape();
std::vector<size_t> shape_tmp;
(void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
input_node.get());
size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
}
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
// Parameters held in the PS cache are managed elsewhere; do not sync.
const std::string &param_name = input_node->fullname_with_scope();
if (ps::ps_cache_instance.IsHashTable(param_name)) {
continue;
}
#endif
auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
MS_EXCEPTION_IF_NULL(device_address);
if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
tensor->data_type(), tensor->data_c())) {
MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
}
// In pynative mode, or for weight parameters, the tensor keeps a handle to
// the device address so later reads can find the device copy.
if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
tensor->set_device_address(device_address);
}
}
tensor->set_sync_status(kNoNeedSync);
}
}

// Builds a kernel graph from the node list and outputs, initializes its
// bucket state (InitAllBucket — presumably for gradient all-reduce grouping;
// confirm), and returns the new graph's id.
GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
MS_LOG(INFO) << "Start";
// construct graph, if successfully, graph_sum_ + 1
auto graph = ConstructKernelGraph(lst, outputs);
auto graph_id = graph->graph_id();
InitAllBucket(graph);
MS_LOG(INFO) << "Compile graph " << graph_id << " success";
return graph_id;
}

GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
MS_LOG(INFO) << "Start";
std::vector<KernelGraphPtr> all_graphs;
@@ -559,16 +674,8 @@ void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) {

bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedInsertSwitch(); }

void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) {
MS_LOG(INFO) << "Start";
auto kernel_graph = GetGraph(graph_id);
MS_EXCEPTION_IF_NULL(kernel_graph);
// if none of child graph and no anf output exists
if (!kernel_graph->executable()) {
MS_LOG(INFO) << "No child graph has anf output";
return;
}
void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
// load data to extra params
std::set<KernelGraphPtr> memo;
SyncDataToExtraParams(NOT_NULL(kernel_graph), NOT_NULL(&memo));
@@ -580,14 +687,14 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);
std::string channel_name;
if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) {
if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
ps::ps_cache_instance.IncreaseGraphStep(channel_name);
}
#endif
{
// run task on device
Execute(kernel_graph, true);
}
}
void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
// summary
Summary(kernel_graph.get());
// load tensor from device for debugger
@@ -598,9 +705,10 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
if (debugger_) {
debugger_->PostExecute();
}
MS_LOG(INFO) << "Finish!";
}

// Device-specific execute hook: runs the graph on the Ascend device. The
// meaning of the `true` flag is defined by Execute (not visible here).
void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { Execute(kernel_graph, true); }

void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const {
MS_LOG(INFO) << "Start";
// data layout optimization


+ 7
- 1
mindspore/ccsrc/backend/session/ascend_session.h View File

@@ -50,7 +50,13 @@ class AscendSession : public SessionBasic {
GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) override;
bool IsSupportSummary() override;
void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) const override;
void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) override;
void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) override;
void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
void BuildGraphImpl(GraphId) override;
void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<tensor::TensorPtr> &input_tensors,


+ 11
- 24
mindspore/ccsrc/backend/session/cpu_session.cc View File

@@ -102,20 +102,19 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
Optimize(graph);
MS_LOG(INFO) << "Build kernel";
BuildKernel(graph.get());

// Remove reorder after PS feature finish adapting push/pull in auto_monad.
auto execution_order = graph->execution_order();
Reorder(&execution_order);
graph->set_execution_order(execution_order);

// runtime init
if (!runtime_.Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}

MS_LOG(INFO) << "Assign kernel address";
runtime_.AssignKernelAddress(graph.get());

// set summary node
SetSummaryNodes(graph.get());
runtime_.IncreaseSummaryRefCount(graph->summary_nodes());
DumpGraph(graph);
return graph_id;
}
@@ -154,38 +153,26 @@ void CPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
}
}

void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs) {
auto kernel_graph = GetGraph(graph_id);
MS_EXCEPTION_IF_NULL(kernel_graph);
// Pre-run hook for the CPU backend: binds the host input/output buffers to
// the kernel graph, and initializes parameter-server state when PS support is
// compiled in.
void CPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
MS_LOG(INFO) << "Bind input output address";
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs);

#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
InitPSParamAndOptim(kernel_graph, inputs);
#endif
}

MS_LOG(INFO) << "Run graph start";

bool enable_summary = summary_callback_ != nullptr;
NamedSummaryOutputs summary_outputs;
if (enable_summary) {
SetSummaryNodes(kernel_graph.get());
summary_outputs = kernel_graph->summary_nodes();
runtime_.IncreaseSummaryRefCount(summary_outputs);
}
// Post-run hook for the CPU backend: collects summary data for the graph.
void CPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {
Summary(kernel_graph.get());
}

void CPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
bool ret = runtime_.Run(kernel_graph.get(), false);
if (!ret) {
MS_LOG(EXCEPTION) << "Run graph failed";
}

if (enable_summary) {
Summary(kernel_graph.get());
runtime_.DecreaseSummaryRefCount(summary_outputs);
}

MS_LOG(INFO) << "Run graph end";
}

void CPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,


+ 5
- 1
mindspore/ccsrc/backend/session/cpu_session.h View File

@@ -36,7 +36,11 @@ class CPUSession : public SessionBasic {
void CreateOutputTensors(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &input_tensors, VectorRef *,
std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) override;
GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) override;
void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) override;
void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, KernelGraph *graph) override;
void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,


+ 2
- 1
mindspore/ccsrc/backend/session/executor.cc View File

@@ -236,6 +236,7 @@ void Executor::WorkerLoop() {
done_tasks_.emplace_back(task);
}
if (task->type_ != kRunGraph || task->sync_run_) {
std::lock_guard<std::mutex> lock(task_mutex_);
sync_run_task_finished_ = true;
sync_cond_var_.notify_all();
}
@@ -310,9 +311,9 @@ void Executor::ClearDoneTasks() {
}

void Executor::RunTask(const std::shared_ptr<Task> &task, bool sync, bool long_run) {
sync_run_task_finished_ = false;
{
std::lock_guard<std::mutex> lock(task_mutex_);
sync_run_task_finished_ = false;
ready_tasks_.push(task);
}
task_cond_var_.notify_all();


+ 33
- 34
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -299,14 +299,6 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
}
}

// Runs the kernel graph once on this device's GPU kernel runtime; raises an
// exception if the runtime reports failure.
void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
if (!runtime_instance->Run(kernel_graph.get(), false)) {
MS_LOG(EXCEPTION) << "GPU execute graph failed!";
}
}

GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
// Construct graph, if successfully, graph_sum_ + 1
auto graph = ConstructKernelGraph(lst, outputs);
@@ -419,10 +411,8 @@ GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
return graph->graph_id();
}

void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
MS_LOG(INFO) << "RunGraph graph_id: " << graph_id;
void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
if (debugger_) {
debugger_->PreExecute(kernel_graph, graph_sum_);
}
@@ -430,26 +420,48 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);
#endif
MS_EXCEPTION_IF_NULL(kernel_graph);
// It's InitDataset graph if kernel_num == 1, skip the loop.
}

// Post-run hook for the GPU backend: collects summary data (when the GPU
// summary context flag is on), handles debugger dump bookkeeping, and
// notifies the debugger that execution finished.
void GPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
// Summary
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY)) {
Summary(kernel_graph.get());
}
bool dump_enabled = DumpDataEnabledIteration();
// debug used for dump
if (debugger_ && dump_enabled) {
Dump(kernel_graph);
} else {
// Even without a dump this iteration, keep the dump iteration counter moving.
DumpJsonParser::GetInstance().UpdateDumpIter();
}
if (debugger_) {
debugger_->PostExecute();
}
}

void GPUSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {
int kernel_num = kernel_graph->execution_order().size();
int64_t loopsize = (kernel_num > 1) ? ConfigManager::GetInstance().gpu_loopsink_size() : 1;
for (int64_t i = 0; i < loopsize; i++) {
#if ENABLE_CPU && ENABLE_GPU
std::string channel_name;
if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) {
if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
ps::ps_cache_instance.IncreaseGraphStep(channel_name);
}
#endif
Execute(kernel_graph);
}
// Summary
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_GPU_SUMMARY)) {
Summary(kernel_graph.get());
}

void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
if (!runtime_instance->Run(kernel_graph.get(), false)) {
MS_LOG(EXCEPTION) << "GPU execute graph failed!";
}
PostIterationDbg(kernel_graph);
}

void GPUSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
@@ -519,19 +531,6 @@ bool GPUSession::DumpDataEnabledIteration() const {
return runtime_instance->DumpDataEnabledIteration();
}

void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
// debug used for dump
if (debugger_ && dump_enabled) {
Dump(kernel_graph);
} else {
DumpJsonParser::GetInstance().UpdateDumpIter();
}
if (debugger_) {
debugger_->PostExecute();
}
}

void GPUSession::SyncStream() {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);


+ 5
- 3
mindspore/ccsrc/backend/session/gpu_session.h View File

@@ -39,7 +39,11 @@ class GPUSession : public SessionBasic {
void UnifyMindIR(const KernelGraphPtr &graph) override { return; }
GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) override;
void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) override;
void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) override;
void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) override;
void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<tensor::TensorPtr> &input_tensors,
const std::vector<int64_t> &tensors_mask) override;
@@ -79,8 +83,6 @@ class GPUSession : public SessionBasic {

bool DumpDataEnabledIteration() const;

void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;

GraphId CompileGraphImpl(KernelGraphPtr kernel_graph);
};
using GPUSessionPtr = std::shared_ptr<GPUSession>;


+ 17
- 122
mindspore/ccsrc/backend/session/session_basic.cc View File

@@ -252,52 +252,6 @@ ValueNodePtr CreateNewValueNode(const AnfNodePtr &anf, KernelGraph *graph) {
return new_value_node;
}

size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Load kInputCtrlTensors";
auto inputs_params = graph->input_ctrl_tensors();
if (inputs_params == nullptr) {
return 0;
}
if (inputs_params->size() < 3) {
MS_LOG(EXCEPTION) << "Illegal inputs_params size";
}
// update current loop tensor to 0 per iterator
auto cur_loop_tensor = (*inputs_params)[0];
MS_EXCEPTION_IF_NULL(cur_loop_tensor);
auto *cur_val = static_cast<int32_t *>(cur_loop_tensor->data_c());
MS_EXCEPTION_IF_NULL(cur_val);
*cur_val = 0;
cur_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
// set loop_count to zero
MS_EXCEPTION_IF_NULL(inputs);
inputs->push_back(cur_loop_tensor);

// update next loop tensor to 0 per iterator
auto next_loop_tensor = (*inputs_params)[1];
MS_EXCEPTION_IF_NULL(next_loop_tensor);
auto *next_val = static_cast<int32_t *>(next_loop_tensor->data_c());
MS_EXCEPTION_IF_NULL(next_val);
*next_val = 0;
next_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
// set loop_count to zero
MS_EXCEPTION_IF_NULL(inputs);
inputs->push_back(next_loop_tensor);

auto epoch_tensor = (*inputs_params)[2];
MS_EXCEPTION_IF_NULL(epoch_tensor);
auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
MS_EXCEPTION_IF_NULL(epoch_val);
*epoch_val = graph->current_epoch();
epoch_tensor->set_sync_status(kNeedSyncHostToDevice);
inputs->push_back(epoch_tensor);
MS_LOG(INFO) << "Load epoch_val:" << *epoch_val;

graph->set_current_epoch(graph->current_epoch() + 1);

return inputs_params->size();
}

ValueNodePtr ConstructRunOpValueNode(const std::shared_ptr<KernelGraph> &graph, const tensor::TensorPtr &input_tensor) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(input_tensor);
@@ -1544,80 +1498,6 @@ void SessionBasic::AddParameterToGraphInputs(const std::vector<AnfNodePtr> &para
}
}

namespace {
bool TensorNeedSync(const AnfNodePtr &parameter, const tensor::TensorPtr &tensor) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
}
if (tensor->NeedSyncHostToDevice()) {
return true;
}
auto tensor_address = tensor->device_address();
if (tensor_address != device_address) {
tensor->data_sync(false);
return true;
}
return false;
}
} // namespace

// run graph steps
void SessionBasic::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) const {
std::vector<tensor::TensorPtr> inputs(inputs_const);
size_t input_ctrl_size = 3;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto &input_nodes = kernel_graph->input_nodes();
auto extra_param_size = kernel_graph->GetExtraParamAndTensor().size();
if ((inputs.size() + input_ctrl_size) - 3 != input_nodes.size() - extra_param_size) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size << ", extra_param_size:" << extra_param_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = inputs[i];
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
auto size = LongToSize(tensor->data().nbytes());
if (input_node->isa<Parameter>() && input_node->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
auto tensor_shape = tensor->shape();
std::vector<size_t> shape_tmp;
(void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
input_node.get());
size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
}
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
const std::string &param_name = input_node->fullname_with_scope();
if (ps::ps_cache_instance.IsHashTable(param_name)) {
continue;
}
#endif
auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
MS_EXCEPTION_IF_NULL(device_address);
if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
tensor->data_type(), tensor->data_c())) {
MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
}

if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
tensor->set_device_address(device_address);
}
}
tensor->set_sync_status(kNoNeedSync);
}
}

void SessionBasic::UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_graph, VectorRef *const outputs,
const std::vector<tensor::TensorPtr> &input_tensors) const {
MS_EXCEPTION_IF_NULL(kernel_graph);
@@ -2130,6 +2010,22 @@ void SessionBasic::RunGraphAsync(const GraphId &graph_id, const std::vector<tens
executor_->RunGraphAsync(shared_from_this(), graph_id, inputs, outputs);
}

// Template method shared by all backends: look up the compiled graph, skip it
// if it is not executable, then delegate to the device-specific
// PreExecuteGraph / ExecuteGraph / PostExecuteGraph hooks in order.
void SessionBasic::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *const outputs) {
MS_LOG(INFO) << "Run graph start, graph id: " << graph_id;
auto kernel_graph = GetGraph(graph_id);
MS_EXCEPTION_IF_NULL(kernel_graph);
// if none of child graph and no anf output exists
if (!kernel_graph->executable()) {
MS_LOG(INFO) << "No child graph has anf output";
return;
}
PreExecuteGraph(kernel_graph, inputs, outputs);
ExecuteGraph(kernel_graph);
PostExecuteGraph(kernel_graph, inputs, outputs);
MS_LOG(INFO) << "Run graph end, graph id: " << graph_id;
}

void SessionBasic::RunOpsInGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs) {
MS_LOG(INFO) << "Start!";
@@ -2214,8 +2110,7 @@ void SessionBasic::UpdateGraphDynamicShapeAttr(const NotNull<KernelGraphPtr> &ro
root_graph->UpdateGraphDynamicAttr();
}

bool SessionBasic::IsGetNextGraph(const GraphId &graph_id, std::string *channel_name) {
auto kernel_graph = graphs_[graph_id];
bool SessionBasic::IsGetNextGraph(const std::shared_ptr<KernelGraph> &kernel_graph, std::string *channel_name) {
MS_EXCEPTION_IF_NULL(kernel_graph);
for (const auto &kernel_node : kernel_graph->execution_order()) {
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);


+ 8
- 4
mindspore/ccsrc/backend/session/session_basic.h View File

@@ -114,7 +114,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
void AssignParamKey(const KernelGraphPtr &kernel_graph);
void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);
bool IsGetNextGraph(const GraphId &graph_id, std::string *channel_name);
bool IsGetNextGraph(const std::shared_ptr<KernelGraph> &kernel_graph, std::string *channel_name);
virtual bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs,
std::string *error_msg) const {
return true;
@@ -173,8 +173,12 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
virtual GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { return 0; }
virtual GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) { return kInvalidGraphId; }
virtual void BuildGraphImpl(GraphId) {}
virtual void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
}
virtual void PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {}
virtual void PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const outputs) {}
virtual void ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) {}
void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs);
virtual void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
const std::vector<tensor::TensorPtr> &input_tensors,
const std::vector<int64_t> &tensors_mask) {}
@@ -195,7 +199,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
}

virtual void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) const;
const std::vector<tensor::TensorPtr> &inputs_const) const {}
void UpdateOutputs(const std::shared_ptr<KernelGraph> &kernel_graph, VectorRef *const outputs,
const std::vector<tensor::TensorPtr> &input_tensors) const;
void UpdateOutputAbstract(const std::shared_ptr<KernelGraph> &kernel_graph, OpRunInfo *op_run_info) const;


Loading…
Cancel
Save