/** * Copyright 2020-2021 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "backend/optimizer/somas/somas.h" #include #include #include #include #include #include #include #include "backend/optimizer/somas/somas_node.h" #include "backend/optimizer/somas/somas_solver_pre.h" #include "backend/optimizer/somas/somas_stream.h" #include "backend/optimizer/somas/somas_tensor.h" #ifdef ENABLE_D #include "runtime/device/ascend/ascend_stream_assign.h" #endif #include "backend/optimizer/common/helper.h" #include "utils/ms_context.h" #include "debug/common.h" #ifdef ENABLE_DUMP_IR #include "debug/rdr/running_data_recorder.h" #endif #include "common/thread_pool.h" #include "profiler/device/common/memory_profiling.h" using mindspore::profiler::MemoryProfiling; using mindspore::profiler::NodeMemory; using mindspore::profiler::TensorMemory; namespace mindspore { namespace somas { constexpr auto kGapSize = 512; constexpr auto kRetryIntervalSeconds = 500; constexpr size_t kRefNodeTensorNum = 2; constexpr auto kGraphId = "graph_id"; constexpr auto kHashId = "hash_id"; constexpr auto kMemOffset = "mem_offset"; constexpr auto kNodeSize = "node_size"; constexpr auto kTensorSize = "tensor_size"; constexpr auto kContiguousSize = "contiguous_size"; constexpr auto kRefNodeSize = "ref_node_size"; constexpr auto kStreamSize = "stream_size"; constexpr auto kStreamGroupSize = "stream_group_size"; constexpr auto kTensors = "tensors"; constexpr auto kTensorId = "tensor_id"; constexpr auto kSize = "size"; constexpr auto kOriSize = "ori_size"; constexpr auto kLifelongValue = "lifelong_value"; constexpr auto kLifeStart = "life_start"; constexpr auto kLifeEnd = "life_end"; constexpr auto kOffset = "offset"; constexpr auto kCachedResultThreshold = 2000; std::map tensor_type_name_map = {{kCommon, "Common"}, {kOutputOnly, "OutputOnly"}, {kWorkspace, "Workspace"}, {kGetNextOutput, "GetNextOutput"}, {kSummaryInput, "SummaryInput"}, {kRefNodeInput, "RefNodeInput"}, {kRefNodeOutput, "RefNodeOutput"}, {kUnknown, "Unknown"}}; std::map life_long_name_map = {{kLifeLongNone, "LifeLongNone"}, {kLifeLongGraphAll, "LifeLongGraphAll"}, {kLifeLongGraphStart, "LifeLongGraphStart"}, {kLifeLongGraphEnd, "LifeLongGraphEnd"}}; bool Somas::Allocate(const session::KernelGraph *graph) { auto ret = InitSomasTensors(graph); if (!ret) { MS_LOG(EXCEPTION) << "Somas Initialize Failed."; } if (tensors_list_.empty()) { MS_LOG(INFO) << "No Tensor for Somas"; return true; } ret = LoadSomasCache(graph); if (ret) { GenGraphStatisticInfo(); return ret; } // Computing Conflict pairs MS_LOG(INFO) << "Start Computing Conflict Pairs"; ComputeConflictPairs(); MS_LOG(INFO) << "End Computing Conflict Pairs"; ret = Assign(graph); if (!ret) { MS_LOG(EXCEPTION) << "Somas Assign Failed."; } SaveSomasResult(graph); GenGraphStatisticInfo(); return ret; } bool Somas::LoadSomasCache(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); if (tensors_list_.size() < kCachedResultThreshold) { MS_LOG(DEBUG) << "Tensors size (" << tensors_list_.size() << ") less than " << kCachedResultThreshold << ", no need to load cached"; return false; } bool ret = CalcSomasModelHash(graph); if (ret) { std::string filename = GetSaveGraphsPathName( "/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json", save_graphs_path_); ret = LoadSomasResult(graph, filename); if (ret) { MS_LOG(INFO) << "Load Somas Cache file " << filename << " Successfully."; } } else { MS_LOG(ERROR) << "Calculate somas's model hash id failed."; } return ret; } bool Somas::CalcSomasModelHash(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); auto model_str = SomasInfo(true); hash_id_ = std::to_string(std::hash()(model_str)); MS_LOG(INFO) << "Graph " << graph->graph_id() << "'s SOMAS Model hash id is " << hash_id_; std::string filename = GetSaveGraphsPathName( "/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info", save_graphs_path_); return Common::SaveStringToFile(filename, model_str); } bool Somas::SaveSomasResult(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); if (tensors_list_.size() < kCachedResultThreshold) { MS_LOG(DEBUG) << "Tensors size (" << tensors_list_.size() << ") less than " << kCachedResultThreshold << ", no need to save result"; return false; } nlohmann::json somas_json; somas_json[kGraphId] = graph->graph_id(); somas_json[kHashId] = hash_id_; somas_json[kMemOffset] = mem_offset_; somas_json[kNodeSize] = nodes_list_.size(); somas_json[kTensorSize] = tensors_list_.size(); somas_json[kContiguousSize] = contiguous_tensors_list_.size(); somas_json[kRefNodeSize] = ref_node_constraints_.size(); somas_json[kStreamSize] = streams_list_.size(); somas_json[kStreamGroupSize] = streams_groups_.size(); std::vector tensors_json; for (auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); nlohmann::json tensor_json; tensor_json[kTensorId] = tensor->GetId(); tensor_json[kSize] = tensor->GetAlignedSize(); tensor_json[kOriSize] = tensor->GetOriginalSize(); tensor_json[kLifelongValue] = tensor->lifelong_value_; tensor_json[kLifeStart] = tensor->lifetime_.start_; tensor_json[kLifeEnd] = tensor->lifetime_.end_; tensor_json[kOffset] = tensor->GetOffset(); tensors_json.emplace_back(tensor_json); } somas_json[kTensors] = tensors_json; std::string filename = GetSaveGraphsPathName( "/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json", save_graphs_path_); (void)Common::SaveStringToFile(filename, somas_json.dump()); return true; } bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string filename) { if (filename.length() <= strlen(".json")) { MS_LOG(WARNING) << "please check somas cache file path."; return false; } std::ifstream somas_json_fs(filename); if (!somas_json_fs.is_open()) { MS_LOG(INFO) << "Open json file: " << filename << " error, Somas Cache Missed."; return false; } nlohmann::json somas_json; try { somas_json_fs >> somas_json; somas_json_fs.close(); } catch (std::exception &e) { MS_LOG(WARNING) << "Parse json file error: " << filename << ", sleep 500ms and retry again."; somas_json_fs.close(); std::this_thread::sleep_for(std::chrono::milliseconds(kRetryIntervalSeconds)); std::ifstream retry_tmp(filename); if (!retry_tmp.is_open()) { MS_LOG(INFO) << "Open json file: " << filename << " error, please check kernel_meta."; return false; } retry_tmp >> somas_json; retry_tmp.close(); } auto ret = VerifySomasResult(graph, somas_json); if (!ret) { MS_LOG(WARNING) << "Verify Somas Result Failed."; return false; } auto mem_offset = somas_json[kMemOffset]; mem_offset_ = mem_offset; ret = UpdateTensorsOffset(somas_json[kTensors]); return ret; } bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const { MS_EXCEPTION_IF_NULL(graph); auto graph_id = somas_json[kGraphId]; auto hash_id = somas_json[kHashId]; auto node_size = somas_json[kNodeSize]; auto tensor_size = somas_json[kTensorSize]; auto contiguous_size = somas_json[kContiguousSize]; auto ref_node_size = somas_json[kRefNodeSize]; auto stream_size = somas_json[kStreamSize]; auto stream_group_size = somas_json[kStreamGroupSize]; if (graph_id != graph->graph_id()) { MS_LOG(WARNING) << "Mismatch graph id " << graph_id << " vs " << graph->graph_id(); return false; } if (hash_id != hash_id_) { MS_LOG(WARNING) << "Mismatch hash id " << hash_id << " vs " << hash_id_; return false; } if (node_size != nodes_list_.size()) { MS_LOG(WARNING) << "Mismatch node size " << node_size << " vs " << nodes_list_.size(); return false; } if (tensor_size != tensors_list_.size()) { MS_LOG(WARNING) << "Mismatch tensor size " << tensor_size << " vs " << tensors_list_.size(); return false; } if (contiguous_size != contiguous_tensors_list_.size()) { MS_LOG(WARNING) << "Mismatch contiguous size " << contiguous_size << " vs " << contiguous_tensors_list_.size(); return false; } if (ref_node_size != ref_node_constraints_.size()) { MS_LOG(WARNING) << "Mismatch ref node size " << ref_node_size << " vs " << ref_node_constraints_.size(); return false; } if (stream_size != streams_list_.size()) { MS_LOG(WARNING) << "Mismatch stream size " << stream_size << " vs " << streams_list_.size(); return false; } if (stream_group_size != streams_groups_.size()) { MS_LOG(WARNING) << "Mismatch stream group size " << stream_group_size << " vs " << streams_groups_.size(); return false; } return true; } bool Somas::UpdateTensorsOffset(const std::vector &tensors_json) { bool ret = true; for (auto &tensor_json : tensors_json) { auto tensor_id = tensor_json[kTensorId]; auto size = tensor_json[kSize]; auto ori_size = tensor_json[kOriSize]; auto lifelong_value = tensor_json[kLifelongValue]; auto life_start = tensor_json[kLifeStart]; auto life_end = tensor_json[kLifeEnd]; auto offset = tensor_json[kOffset]; auto iter = tensors_map_.find(tensor_id); if (iter != tensors_map_.end()) { MS_EXCEPTION_IF_NULL(iter->second); if (size != iter->second->aligned_size_) { MS_LOG(WARNING) << "Mismatch size of tensor " << tensor_id << " " << size << " vs " << iter->second->aligned_size_; ret = false; break; } if (ori_size != iter->second->GetOriginalSize()) { MS_LOG(WARNING) << "Mismatch original size of tensor " << tensor_id << " " << ori_size << " vs " << iter->second->GetOriginalSize(); ret = false; break; } if (lifelong_value != iter->second->lifelong_value_) { MS_LOG(WARNING) << "Mismatch lifelong value of tensor " << tensor_id << " " << lifelong_value << " vs " << iter->second->lifelong_value_; ret = false; break; } if (life_start != iter->second->lifetime_.start_) { MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_start << " vs " << iter->second->lifetime_.start_; ret = false; break; } if (life_end != iter->second->lifetime_.end_) { MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_end << " vs " << iter->second->lifetime_.end_; ret = false; break; } // verify pass, update memory offset iter->second->offset_ = offset; } else { MS_LOG(WARNING) << "Can't find tensor " << tensor_id; ret = false; break; } } return ret; } bool Somas::InitSomasTensors(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); InitBasicInfo(graph); IndependentNodeOutputProcess(graph); SummaryInputProcess(graph); RefNodeProcess(graph); NonTaskSplitProcess(graph); UnReuseNodeProcess(graph); GenContiguousList(graph); GetNextOutputProcess(graph); if (tensors_list_.empty()) { MS_LOG(INFO) << "No Tensor from graph " << graph->graph_id(); return true; } MS_LOG(INFO) << "Created " << streams_list_.size() << " streams (" << streams_groups_.size() << " groups), " << nodes_list_.size() << " nodes, " << tensors_list_.size() << " tensors, and " << contiguous_tensors_list_.size() << " contiguous lists"; #ifdef ENABLE_DUMP_IR SubModuleId module = SubModuleId::SM_OPTIMIZER; std::string name = "somas_pre_processed_info." + std::to_string(graph->graph_id()); (void)mindspore::RDR::RecordString(module, name, SomasInfo()); name = "somas_offline_log." + std::to_string(graph->graph_id()); (void)mindspore::RDR::RecordString(module, name, Offline()); #endif if (save_graphs_) { std::string file_path = GetSaveGraphsPathName( "/somas_pre_processed_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); DumpSomasInfoIR(file_path); std::string offline_file_path = GetSaveGraphsPathName("/somas_offline_log_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); DumpOfflineIR(offline_file_path); } return true; } void Somas::InitSomasStreamAndNode(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); std::vector kernel_cnodes; streams_list_ = {}; nodes_list_ = {}; size_t node_index = 0; if (graph->subgraph_multi_call()) { kernel_cnodes = graph->mem_reuse_exec_order(); } else { kernel_cnodes = graph->execution_order(); } for (size_t i = 0; i < kernel_cnodes.size(); i++) { auto kernel = kernel_cnodes[i]; MS_EXCEPTION_IF_NULL(kernel); SomasStreamPtr stream; auto stream_id = AnfAlgo::GetStreamId(kernel); auto it = find_if(streams_list_.begin(), streams_list_.end(), [stream_id](const SomasStreamPtr &s) { return s->GetId() == stream_id; }); if (it == streams_list_.end()) { stream = std::make_shared(stream_id); streams_list_.push_back(stream); } else { stream = *it; } // Node NodeType type = kCommonNode; if (AnfAlgo::IsCommunicationOp(kernel)) { type = kCommunicationNode; } auto node = std::make_shared(node_index, type, stream); MS_EXCEPTION_IF_NULL(node); node->scope_full_name_ = kernel->fullname_with_scope(); nodes_list_.push_back(node); stream->nodes_.push_back(node); auto key = kernel.get(); auto &nodes = nodes_map_[key]; nodes.push_back(node); node_index++; } } void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); tensors_list_ = {}; size_t tensor_index = 0; auto kernel_cnodes = graph->execution_order(); for (const auto &kernel : kernel_cnodes) { auto nodes = nodes_map_[kernel.get()]; auto node = nodes[0]; MS_EXCEPTION_IF_NULL(node); auto stream = node->GetStream(); MS_EXCEPTION_IF_NULL(stream); // Output Tensor auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); auto index = 0; for (const auto &size : output_sizes) { auto output_tensor_index = tensor_index; tensor_index++; // Set all output tensor lifelong to true. auto tensor = std::make_shared(output_tensor_index, node, stream, size, kLifeLongNone); MS_EXCEPTION_IF_NULL(tensor); tensor->lifetime_.start_ = node->GetId(); tensor->lifetime_.end_ = (nodes.size() > 1) ? nodes.back()->GetId() : node->GetId(); tensor->type_ = kOutputOnly; if (AnfAlgo::OutputAddrExist(kernel, index)) { tensor->aligned_size_ = 0; } tensors_list_.push_back(tensor); tensors_map_[output_tensor_index] = tensor; stream->tensors_.push_back(tensor); std::for_each(nodes.begin(), nodes.end(), [tensor](auto &node) { MS_EXCEPTION_IF_NULL(node); node->tensors_.insert(tensor); node->output_tensors_.push_back(tensor); }); index++; } // WorkSpace Tensor auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); index = 0; for (const auto &size : workspace_sizes) { auto workspace_tensor_index = tensor_index; tensor_index++; SomasTensorPtr tensor = std::make_shared(workspace_tensor_index, node, stream, size, kLifeLongNone); MS_EXCEPTION_IF_NULL(tensor); tensor->type_ = kWorkspace; tensor->lifetime_.start_ = node->GetId(); tensor->lifetime_.end_ = (nodes.size() > 1) ? nodes.back()->GetId() : node->GetId(); if (AnfAlgo::WorkspaceAddrExist(kernel, index)) { tensor->aligned_size_ = 0; } tensors_list_.push_back(tensor); tensors_map_[workspace_tensor_index] = tensor; stream->tensors_.push_back(tensor); std::for_each(nodes.begin(), nodes.end(), [tensor](auto &node) { MS_EXCEPTION_IF_NULL(node); node->tensors_.insert(tensor); node->workspace_tensors_.push_back(tensor); }); index++; } } } void Somas::InitSomasInputTensors(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); bool is_all_nop_node = opt::IsAllNopNode(graph); static const auto enable_fusion_clear = (common::GetEnv("ENV_FUSION_CLEAR") == "1"); auto kernel_cnodes = graph->execution_order(); for (const auto &kernel : kernel_cnodes) { if (AnfAlgo::GetCNodeName(kernel) != kAtomicAddrCleanOpName) { InitCommonNodeInputs(is_all_nop_node, kernel); } else { InitAtomicCleanInputs(enable_fusion_clear, kernel); } } } void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { auto nodes = nodes_map_[kernel.get()]; auto node = nodes[0]; MS_EXCEPTION_IF_NULL(node); auto stream = node->GetStream(); MS_EXCEPTION_IF_NULL(stream); // Input Tensor auto input_tensor_num = AnfAlgo::GetInputTensorNum(kernel); size_t real_input_index = 0; for (size_t i = 0; i < input_tensor_num; i++) { auto input_node = kernel->input(i + 1); MS_EXCEPTION_IF_NULL(input_node); session::KernelWithIndex prenode_index; if (is_all_nop_node) { prenode_index = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false); } else { prenode_index = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true); } if (AnfAlgo::CheckPrimitiveType(prenode_index.first, prim::kPrimMakeTuple)) { MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << i << " is MakeTuple"; } MS_EXCEPTION_IF_NULL(prenode_index.first); if (!AnfAlgo::IsRealCNodeKernel(prenode_index.first)) { auto op_name = AnfAlgo::GetCNodeName(kernel); TypeId input_origin_type = AnfAlgo::GetPrevNodeOutputInferDataType(kernel, i); if ((op_name == kDynamicRNNOpName || op_name == kDynamicGRUV2OpName) && input_origin_type == kMetaTypeNone) { continue; } auto parameter = GetSomasParameter(prenode_index.first, prenode_index.second); node->input_parameters_map_[real_input_index] = parameter; real_input_index++; MS_LOG(DEBUG) << "Input [" << prenode_index.first->fullname_with_scope() << "] is not a real cnode kernel."; continue; } auto iter = nodes_map_.find(prenode_index.first.get()); if (iter == nodes_map_.end()) { MS_LOG(EXCEPTION) << "Kernel[" << kernel->fullname_with_scope() << "]'s input " << i << " [" << prenode_index.first->fullname_with_scope() << "] is not init."; } auto pre_somas_node = iter->second.at(0); if (prenode_index.second > pre_somas_node->output_tensors_.size()) { MS_LOG(EXCEPTION) << "Output index " << prenode_index.second << " exceed input node [" << prenode_index.first->fullname_with_scope() << "]'s outputs size " << pre_somas_node->output_tensors_.size(); } auto input_somas_tensor = pre_somas_node->output_tensors_[prenode_index.second]; MS_EXCEPTION_IF_NULL(input_somas_tensor); std::for_each(nodes.begin(), nodes.end(), [input_somas_tensor](auto &node) { node->input_tensors_.push_back(input_somas_tensor); }); real_input_index++; if (input_somas_tensor->type_ == kOutputOnly) { input_somas_tensor->type_ = kCommon; } input_somas_tensor->destinationStreams_.insert(stream); for (auto &repeat_node : nodes) { input_somas_tensor->destinations_.insert(repeat_node); if (input_somas_tensor->lifetime_.end_ < repeat_node->GetId()) { input_somas_tensor->lifetime_.end_ = repeat_node->GetId(); } } if (node != pre_somas_node) { node->ancestor_nodes_.insert(pre_somas_node); } auto input_tensor_stream = input_somas_tensor->GetSourceStream(); if (input_tensor_stream != stream) { stream->ancestor_streams_.insert(input_tensor_stream); input_somas_tensor->between_streams_ = true; } } } void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel) { auto node = nodes_map_[kernel.get()].at(0); MS_EXCEPTION_IF_NULL(node); auto stream = node->GetStream(); MS_EXCEPTION_IF_NULL(stream); auto input_tensor_num = AnfAlgo::GetInputTensorNum(kernel); for (size_t i = 0; i < input_tensor_num; i++) { MS_EXCEPTION_IF_NULL(kernel->inputs()[i + 1]); auto pre_node = kernel->input(i + 1)->cast(); auto iter = nodes_map_.find(pre_node.get()); if (iter == nodes_map_.end()) { MS_LOG(EXCEPTION) << "Kernel[" << kernel->fullname_with_scope() << "]'s input [" << pre_node->fullname_with_scope() << "] is not init."; } auto pre_somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(pre_somas_node); // set clean output tensors if (AnfAlgo::HasNodeAttr(kAttrAtomicOutputIndexs, pre_node)) { auto clean_output_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicOutputIndexs); for (auto index : clean_output_indexs) { if (index > pre_somas_node->output_tensors_.size()) { MS_LOG(EXCEPTION) << "Output index " << index << " exceed input node [" << pre_node->fullname_with_scope() << "]'s outputs size " << pre_somas_node->output_tensors_.size(); } auto input_somas_tensor = pre_somas_node->output_tensors_[index]; MS_EXCEPTION_IF_NULL(input_somas_tensor); node->input_tensors_.push_back(input_somas_tensor); if (enable_fusion_clear) { input_somas_tensor->lifelong_value_ = kLifeLongGraphAll; MS_LOG(INFO) << "Set " << node->scope_full_name_ << "'s Input node " << pre_somas_node->scope_full_name_ << " 's output" << index << " to lifelong"; } } } // set clean workspace tensors if (AnfAlgo::HasNodeAttr(kAttrAtomicWorkspaceIndexs, pre_node)) { auto clean_workspace_indexs = AnfAlgo::GetNodeAttr>(pre_node, kAttrAtomicWorkspaceIndexs); for (const auto &index : clean_workspace_indexs) { if (index > pre_somas_node->output_tensors_.size()) { MS_LOG(EXCEPTION) << "Workspace index " << index << " exceed input node [" << pre_node->fullname_with_scope() << "]'s Workspace size " << pre_somas_node->workspace_tensors_.size(); } auto input_somas_tensor = pre_somas_node->workspace_tensors_[index]; MS_EXCEPTION_IF_NULL(input_somas_tensor); node->input_tensors_.push_back(input_somas_tensor); if (enable_fusion_clear) { input_somas_tensor->lifelong_value_ = kLifeLongGraphAll; MS_LOG(INFO) << "Set " << node->scope_full_name_ << "'s Input node " << pre_somas_node->scope_full_name_ << " 's workspace" << index << " to lifelong"; } } } } } SomasParameterPtr Somas::CreateSomasParameter(AnfNodePtr node, size_t index) { MS_EXCEPTION_IF_NULL(node); auto id = parameters_list_.size(); auto device_addr = AnfAlgo::GetOutputAddr(node, index); if (device_addr == nullptr) { MS_LOG(EXCEPTION) << "Node " << node->fullname_with_scope() << " has no device address before Somas."; } auto param = std::make_shared(id, node->fullname_with_scope(), index, device_addr->GetPtr(), device_addr->GetSize()); parameters_list_.push_back(param); return param; } SomasParameterPtr Somas::GetSomasParameter(AnfNodePtr node, size_t index) { auto key = node.get(); auto iter = parameters_map_.find(key); if (iter != parameters_map_.end()) { auto it = std::find_if(iter->second.begin(), iter->second.end(), [index](SomasParameterPtr param) -> bool { return index == param->output_index_; }); if (it != iter->second.end()) { return *it; } else { auto new_param = CreateSomasParameter(node, index); iter->second.push_back(new_param); return new_param; } } else { auto param = CreateSomasParameter(node, index); parameters_map_[key].push_back(param); return param; } } void Somas::InitBasicInfo(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); #ifdef ENABLE_D streams_groups_ = device::ascend::AscendStreamAssign::GetInstance().get_stream_group(); #endif InitSomasStreamAndNode(graph); InitSomasOutputAndWorkspaceTensors(graph); InitSomasInputTensors(graph); auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); #ifdef ENABLE_DUMP_IR SubModuleId module = SubModuleId::SM_OPTIMIZER; std::string name = "somas_initial_info." + std::to_string(graph->graph_id()); (void)mindspore::RDR::RecordString(module, name, SomasInfo()); #endif save_graphs_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); save_graphs_path_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); if (save_graphs_path_.empty()) { save_graphs_path_ = "."; } if (save_graphs_) { std::string file_path = GetSaveGraphsPathName("/somas_initial_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); DumpSomasInfoIR(file_path); } } void Somas::GetNextOutputProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); auto kernel_cnodes = graph->execution_order(); size_t total_size = 0; for (const auto &kernel : kernel_cnodes) { if (AnfAlgo::GetCNodeName(kernel) != kGetNextOpName) { continue; } auto iter = nodes_map_.find(kernel.get()); if (iter != nodes_map_.end()) { auto &node = iter->second.at(0); MS_EXCEPTION_IF_NULL(node); auto getnext_output_tensors = node->output_tensors_; for (auto &tensor : getnext_output_tensors) { MS_EXCEPTION_IF_NULL(tensor); total_size += tensor->GetAlignedSize(); tensor->lifelong_value_ = kLifeLongGraphAll; tensor->type_ = kGetNextOutput; } } } MS_LOG(INFO) << "Special Tensor total size: GetNext Output " << total_size; } void Somas::IndependentNodeOutputProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); auto kernel_cnodes = graph->execution_order(); size_t total_size = 0; for (const auto &kernel : kernel_cnodes) { bool independent = AnfAlgo::IsIndependentNode(kernel); if (!independent) { continue; } auto iter = nodes_map_.find(kernel.get()); if (iter != nodes_map_.end()) { auto &node = iter->second.at(0); MS_EXCEPTION_IF_NULL(node); auto semi_reuse_output_tensors = node->output_tensors_; for (auto &tensor : semi_reuse_output_tensors) { MS_EXCEPTION_IF_NULL(tensor); total_size += tensor->GetAlignedSize(); tensor->lifelong_value_ = kLifeLongGraphAll; } } } MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size; } void Somas::SummaryInputProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); bool summary_exist = graph->summary_node_exist(); if (!summary_exist) { return; } auto summary_nodes = graph->summary_nodes(); if (summary_nodes.empty()) { return; } size_t total_summary_size = 0; for (auto &node_item : summary_nodes) { auto node = node_item.second.first; size_t index = IntToSize(node_item.second.second); auto iter = nodes_map_.find(node.get()); if (iter != nodes_map_.end()) { auto input_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(input_node); if (index < input_node->output_tensors_.size()) { auto tensor = input_node->output_tensors_[index]; MS_EXCEPTION_IF_NULL(tensor); tensor->lifelong_value_ = kLifeLongGraphAll; tensor->type_ = kSummaryInput; total_summary_size += tensor->GetAlignedSize(); MS_LOG(INFO) << "Set summary node input tensor's lifelong, node: " << node->fullname_with_scope() << " index: " << index; } else { MS_LOG(WARNING) << "Index exceed size, node " << node->fullname_with_scope() << " index: " << index << " size: " << input_node->output_tensors_.size(); } } else { MS_LOG(WARNING) << "Can't find summary input node " << node->fullname_with_scope() << " index: " << index; } } MS_LOG(INFO) << "Special Tensor total size: SummaryNodes: " << total_summary_size; } void Somas::RefNodeProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); auto kernel_cnodes = graph->execution_order(); size_t total_output_size = 0; size_t total_input_size = 0; for (const auto &kernel : kernel_cnodes) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); if (kernel_mod == nullptr) { MS_LOG(WARNING) << "Kernel mode is NULL Of " << kernel->fullname_with_scope(); continue; } auto output_sizes = kernel_mod->GetOutputSizeList(); size_t output_index = 0; for (const auto &size : output_sizes) { auto out_index = output_index; output_index++; session::AnfWithOutIndex out_pair(kernel, out_index); if (graph->IsInRefOutputMap(out_pair)) { auto origin_pair = graph->GetRefCorrespondOutput(out_pair); MS_EXCEPTION_IF_NULL(origin_pair.first); auto &node = nodes_map_[kernel.get()].at(0); MS_EXCEPTION_IF_NULL(node); auto output_tensor = node->output_tensors_[out_index]; MS_EXCEPTION_IF_NULL(output_tensor); output_tensor->type_ = kRefNodeOutput; total_output_size += size; if (AnfAlgo::IsRealCNodeKernel(origin_pair.first)) { auto ori_node = origin_pair.first->cast(); auto ori_index = origin_pair.second; auto &repeat_node = nodes_map_[ori_node.get()].at(0); MS_EXCEPTION_IF_NULL(repeat_node); auto input_tensor = repeat_node->output_tensors_[ori_index]; MS_EXCEPTION_IF_NULL(input_tensor); input_tensor->type_ = kRefNodeInput; total_input_size += input_tensor->aligned_size_; std::vector refnode_input_output; refnode_input_output.push_back(input_tensor->GetId()); refnode_input_output.push_back(output_tensor->GetId()); ref_node_constraints_.push_back(refnode_input_output); MS_LOG(INFO) << "RefNode: input " << input_tensor->GetId() << " output " << output_tensor->GetId(); } } } } MS_LOG(INFO) << "Special Tensor total size: RefNode: input " << total_input_size << " output " << total_output_size; } void Somas::NonTaskSplitProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); auto kernel_cnodes = graph->execution_order(); for (const auto &kernel : kernel_cnodes) { auto op_name = AnfAlgo::GetCNodeName(kernel); if ((op_name == kSplitOpName || op_name == kSplitVOpName) && AnfAlgo::HasNodeAttr(kAttrNonTask, kernel)) { std::vector refnode_input_output; auto node = nodes_map_[kernel.get()].at(0); MS_EXCEPTION_IF_NULL(node); if (node->input_tensors_.size() == 0) { MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process."; } auto input_tensor = node->input_tensors_[0]; MS_EXCEPTION_IF_NULL(input_tensor); input_tensor->type_ = kRefNodeInput; refnode_input_output.push_back(input_tensor->GetId()); for (auto &output_tensor : node->output_tensors_) { MS_EXCEPTION_IF_NULL(output_tensor); output_tensor->type_ = kRefNodeOutput; refnode_input_output.push_back(output_tensor->GetId()); } ref_node_constraints_.push_back(refnode_input_output); } } } void Somas::UnReuseNodeProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); vector full_name_list = {}; if (full_name_list.size() == 0) { return; } auto kernel_cnodes = graph->execution_order(); for (const auto &kernel : kernel_cnodes) { MS_EXCEPTION_IF_NULL(kernel); auto full_name = kernel->fullname_with_scope(); auto iter = std::find(full_name_list.begin(), full_name_list.end(), full_name); if (iter != full_name_list.end()) { MS_LOG(INFO) << "Set UnReuse Node in somas, Node:" << full_name; auto key = kernel.get(); auto somas_node = nodes_map_[key].at(0); MS_EXCEPTION_IF_NULL(somas_node); // input auto inputs = somas_node->input_tensors_; for (auto &input : inputs) { MS_EXCEPTION_IF_NULL(input); input->lifelong_value_ = kLifeLongGraphAll; } // output auto outputs = somas_node->output_tensors_; MS_LOG(INFO) << "Output size of " << kernel->fullname_with_scope() << " is " << outputs.size(); for (auto &output : outputs) { MS_EXCEPTION_IF_NULL(output); output->lifelong_value_ = kLifeLongGraphAll; } // workspace auto workspaces = somas_node->workspace_tensors_; for (auto &workspace : workspaces) { MS_EXCEPTION_IF_NULL(workspace); workspace->lifelong_value_ = kLifeLongGraphAll; } } } } void Somas::GenContiguousList(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); for (const auto &node : nodes_list_) { MS_EXCEPTION_IF_NULL(node); if (node->GetType() != kCommunicationNode) { continue; } // Contiguous input if ((!node->input_tensors_.empty()) && (!node->input_tensors_[0]->contiguous_)) { if (node->input_tensors_[0]->aligned_size_) { node->input_tensors_[0]->aligned_size_ += kGapSize; } if (node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_) { node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += kGapSize; } std::vector inputs; for (const auto &input_tensor : node->input_tensors_) { MS_EXCEPTION_IF_NULL(input_tensor); comm_input_total_size_ += input_tensor->aligned_size_; input_tensor->contiguous_ = true; inputs.push_back(input_tensor->GetId()); } contiguous_tensors_list_.push_back(inputs); } // Contiguous output if ((!node->output_tensors_.empty()) && (!node->output_tensors_[0]->contiguous_)) { if (node->output_tensors_[0]->aligned_size_) { node->output_tensors_[0]->aligned_size_ += kGapSize; } if (node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_) { node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += kGapSize; } std::vector outputs; for (const auto &output_tensor : node->output_tensors_) { MS_EXCEPTION_IF_NULL(output_tensor); comm_output_total_size_ += output_tensor->aligned_size_; output_tensor->contiguous_ = true; outputs.push_back(output_tensor->GetId()); } contiguous_tensors_list_.push_back(outputs); } } } void Somas::ComputeConflictPairs() { if (tensors_list_.empty()) { MS_LOG(INFO) << "No Tensor for Conflict computing"; return; } MS_LOG(INFO) << "Start Conflict Computing (Bitset Model)"; auto start_conflict = std::chrono::system_clock::now(); std::sort(nodes_list_.begin(), nodes_list_.end(), NodeSort); UpdateTensorDestinations(); MS_LOG(INFO) << "Start Bitset"; std::vector nodes_dependency; size_t count = nodes_list_.back()->GetId() + 1; for (size_t i = 0; i < count; i++) { nodes_dependency.emplace_back(count); } MS_LOG(INFO) << "Start Path Computing"; // Loop to compute ancestor paths via bitset for time dependence for (const auto &node : nodes_list_) { for (const auto &ancestor : node->ancestor_nodes_) { nodes_dependency[node->GetId()].SetBitTrue(ancestor->GetId()); Union(&nodes_dependency[node->GetId()], &nodes_dependency[ancestor->GetId()]); } } MS_LOG(INFO) << "End Path Computing"; MS_LOG(INFO) << "Start Tensor Relation Computing"; count = tensors_list_.back()->GetId() + 1; for (size_t i = 0; i < count; i++) { reuse_matrix_.emplace_back(count); } if (tensors_list_.size() < kParallelComputeSizeThreshold) { ComputeMultiTensorConflicts(tensors_list_, tensors_list_, nodes_dependency, &reuse_matrix_); } else { MS_LOG(INFO) << "Tensor Num " << tensors_list_.size() << " is larger than " << kParallelComputeSizeThreshold; MS_LOG(INFO) << "Enter Multi-Thread Mode..."; size_t process_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); MS_LOG(INFO) << "Threads Num is " << process_num; size_t start_index = 0; size_t total_size = tensors_list_.size(); size_t job_size = total_size / process_num; if (job_size == 0) { job_size = total_size; } std::vector tasks; while (start_index < total_size) { size_t end_index = (start_index + job_size) > total_size ? total_size : start_index + job_size; auto jobs = std::vector(tensors_list_.begin() + start_index, tensors_list_.begin() + end_index); auto task = [this, jobs, &nodes_dependency]() { this->ComputeMultiTensorConflicts(jobs, tensors_list_, nodes_dependency, &reuse_matrix_); return common::SUCCESS; }; tasks.emplace_back(task); start_index += job_size; } common::ThreadPool::GetInstance().SyncRun(tasks); } MS_LOG(INFO) << "End Tensor Relation Computing"; auto end_conflict = std::chrono::system_clock::now(); MS_LOG(INFO) << "End Conflict Computing (Bitset Model)(time taken " << std::chrono::duration_cast(end_conflict - start_conflict).count() << "ms)"; } void Somas::UpdateTensorDestinations() { // Loop to add edges within each stream (node order within stream) for (const auto &stream : streams_list_) { MS_EXCEPTION_IF_NULL(stream); auto &nodes = stream->nodes_; std::sort(nodes.begin(), nodes.end(), NodeSort); for (size_t i = 1; i < nodes.size(); i++) { const auto &previous_node = nodes[i - 1]; const auto ¤t_node = nodes[i]; MS_EXCEPTION_IF_NULL(current_node); current_node->ancestor_nodes_.insert(previous_node); } } // Loop to add edges from end to beginning of next group for (const auto &group : streams_groups_) { for (size_t i = 1; i < group.size(); i++) { int64_t previous_stream = group[i - 1]; int64_t current_stream = group[i]; auto it = std::find_if(streams_list_.begin(), streams_list_.end(), [previous_stream](const SomasStreamPtr &stream) { return stream->GetId() == previous_stream; }); if (it == streams_list_.end()) { continue; } auto &last_node_in_prev_stream = (*it)->nodes_.back(); it = std::find_if(streams_list_.begin(), streams_list_.end(), [current_stream](const SomasStreamPtr &stream) { return stream->GetId() == current_stream; }); if (it == streams_list_.end()) { continue; } auto &first_node_in_cur_stream = (*it)->nodes_.front(); first_node_in_cur_stream->ancestor_nodes_.insert(last_node_in_prev_stream); } } // Loop to avoid tensors with empty destinations (add itself) for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); if (tensor->destinations_.size() == 0) { tensor->destinations_.insert(tensor->GetSourceNode()); } } // Loop to compute max destinations in each stream for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); tensor->ComputeMaxDestinationId(); } } void Somas::ComputeMultiTensorConflicts(const std::vector &calc_tensors_list, const std::vector &all_tensors_list, const vector &nodes_dependency, std::vector *tensor_relation) const { auto start = std::chrono::system_clock::now(); MS_LOG(INFO) << "Start Computing Conflicts Pairs, tensors list size is " << calc_tensors_list.size(); for (size_t i = 0; i < calc_tensors_list.size(); i++) { auto calc_tensor = calc_tensors_list[i]; MS_EXCEPTION_IF_NULL(calc_tensor); if (calc_tensor->IsLifelong() || calc_tensor->IsRefOverlap() || calc_tensor->GetAlignedSize() == 0) { continue; } ComputeOneTensorConflicts(calc_tensor, all_tensors_list, nodes_dependency, tensor_relation); } auto end = std::chrono::system_clock::now(); MS_LOG(INFO) << "End Computing Conflicts Pairs (time taken " << std::chrono::duration_cast(end - start).count() << "ms)"; } void Somas::ComputeOneTensorConflicts(const std::shared_ptr &calc_tensor, const std::vector &all_tensors_list, const vector &nodes_dependency, std::vector *tensor_relation) const { MS_EXCEPTION_IF_NULL(calc_tensor); for (size_t j = 0; j < all_tensors_list.size(); j++) { auto target_tensor = all_tensors_list[j]; MS_EXCEPTION_IF_NULL(target_tensor); if (calc_tensor == target_tensor || target_tensor->IsLifelong() || target_tensor->IsRefOverlap() || target_tensor->GetAlignedSize() == 0) { continue; } size_t calc_src_node = calc_tensor->GetSourceNode()->GetId(); size_t target_src_node = target_tensor->GetSourceNode()->GetId(); if (calc_src_node == target_src_node) { continue; } if ((*tensor_relation)[calc_tensor->GetId()].IsBitTrue(target_tensor->GetId()) || (*tensor_relation)[target_tensor->GetId()].IsBitTrue(calc_tensor->GetId())) { continue; } bool reuse = true; // check calc_tensor's all consumers is target_tensor's source node's dependency or not for (const auto &dst_map : calc_tensor->max_destinations_) { const auto &dst_node = dst_map.second; MS_EXCEPTION_IF_NULL(dst_node); if (nodes_dependency[target_src_node].IsBitTrue(dst_node->GetId()) == false) { // calc_tensor's consumer is not in target_tensor's source node's dependency, not sure this consumer is done or // not when target_tensor produced reuse = false; break; } else if (target_src_node == dst_node->GetId()) { // calc_tensor is target_tensor's source node's input, can't reuse reuse = false; break; } else { // calc_tensor's consumer is in target_tensor's source node's dependency, this consumer is done when // target_tensor produced reuse = true; } } if (reuse) { // calc_tensor and target_tensor have dependencies so they can reuse each other (*tensor_relation)[calc_tensor->GetId()].SetBitTrue(target_tensor->GetId()); (*tensor_relation)[target_tensor->GetId()].SetBitTrue(calc_tensor->GetId()); } } } bool Somas::NodeSort(SomasNodePtr node1, SomasNodePtr node2) { return node1->GetId() < node2->GetId(); } bool Somas::Assign(const session::KernelGraph *graph) { if (tensors_list_.empty()) { MS_LOG(INFO) << "No Tensor for Assigner"; return true; } // Ref Node Preprocessing UpdateRefTensorsConflict(); std::map contiguous_list_with_ref_index_map = GetContiguousListContainRefTensor(); vector> contiguous_tensors_list_removed = contiguous_tensors_list_; std::set> contiguous_tensors_list_to_remove; for (auto ref_list_pair : contiguous_list_with_ref_index_map) { contiguous_tensors_list_to_remove.insert(contiguous_tensors_list_[ref_list_pair.second]); } // remove the contiguous list which all tensors' align size is 0 for (auto contiguous_list : contiguous_tensors_list_) { bool all_outputs = true; for (auto tensor_id : contiguous_list) { auto tensor = tensors_list_[tensor_id]; MS_EXCEPTION_IF_NULL(tensor); if (tensor->aligned_size_ != 0) { all_outputs = false; break; } } if (all_outputs) { contiguous_tensors_list_to_remove.insert(contiguous_list); } } for (auto contiguous_list : contiguous_tensors_list_to_remove) { auto iterator = std::find(contiguous_tensors_list_removed.begin(), contiguous_tensors_list_removed.end(), contiguous_list); if (iterator != contiguous_tensors_list_removed.end()) { contiguous_tensors_list_removed.erase(iterator); } else { MS_LOG(WARNING) << "Could not find contiguous list to remove for ref"; } } MS_LOG(INFO) << "End Solving Preprocessing for Ref Node"; UpdateRefOverlapTensorsConflicts(); #ifdef SOMAS_DEBUG // Compute number of constraints for each tensor auto tensors_num = tensors_list_.size(); for (auto tensor1 : tensors_list_) { auto ones_num = reuse_matrix_[tensor1->GetId()].CountOnesNum(); tensor1->num_constraints_ = tensors_num - ones_num; } #endif // Prepare solver info MS_LOG(INFO) << "Start Loop to create solver info"; for (auto tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); if (tensor->GetSolverTensorDesc() != nullptr) { SomasSolverTensorDescPtr pSolverTensor = tensor->GetSolverTensorDesc(); solver_tensor_desc_map_.insert(std::pair(pSolverTensor->index_, pSolverTensor)); } } MS_LOG(INFO) << "End Loop to create solver info"; MS_LOG(INFO) << "Start Solving"; if (solver_tensor_desc_map_.empty()) { MS_LOG(INFO) << "solver_tensor_desc_list is empty."; return true; } somas_solver_ = std::make_shared(); auto status = somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, contiguous_tensors_list_removed, false); MS_LOG(INFO) << "End Solving"; if (status != SUCCESS) { GenGraphStatisticInfo(); MS_LOG(EXCEPTION) << "SOMAS Solving Failed."; } // Update solver_tensor_desc offset to tensors list for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); tensor->SetOffset(); } UpdateRefTensorsOffset(); UpdateContiguousTensorsOffset(contiguous_list_with_ref_index_map); // Set mem_offset_ value by solver result mem_offset_ = static_cast(somas_solver_->GetMaxOffset()); return true; } std::map Somas::GetContiguousListContainRefTensor() { // key: contiguous list index with ref node input; value: contiguous list index with ref node output std::map contiguous_list_with_ref_index_map; std::map ref_tensors_in_contiguous_map = GetRefTensorsInContiguousList(); std::map>> contiguous_ref_list_error_check_map; for (auto ref_pair : ref_tensors_in_contiguous_map) { size_t ref_first = ref_pair.first; size_t ref_second = ref_pair.second; bool found_first = false; bool found_second = false; size_t index_first = 0; size_t index_second = 0; size_t index_in_list_first = 0; size_t index_in_list_second = 0; for (size_t index = 0; index < contiguous_tensors_list_.size() && (!found_first || !found_second); index++) { if (!found_first) { auto iterator_first = std::find(contiguous_tensors_list_[index].begin(), contiguous_tensors_list_[index].end(), ref_first); if (iterator_first != contiguous_tensors_list_[index].end()) { index_first = index; index_in_list_first = iterator_first - contiguous_tensors_list_[index].begin(); found_first = true; } } if (!found_second) { auto iterator_second = std::find(contiguous_tensors_list_[index].begin(), contiguous_tensors_list_[index].end(), ref_second); if (iterator_second != contiguous_tensors_list_[index].end()) { index_second = index; index_in_list_second = iterator_second - contiguous_tensors_list_[index].begin(); found_second = true; } } } if (!found_first) { MS_LOG(WARNING) << "Contiguous ref tensor " << ref_first << " not found in any contiguous list"; } if (!found_second) { MS_LOG(WARNING) << "Contiguous ref tensor " << ref_second << " not found in any contiguous list"; } if (contiguous_list_with_ref_index_map.find(index_first) == contiguous_list_with_ref_index_map.end() || contiguous_list_with_ref_index_map[index_first] == index_second) { contiguous_list_with_ref_index_map[index_first] = index_second; // Checking for error cases if (index_in_list_first != index_in_list_second) { MS_LOG(WARNING) << "Inconsistency in contiguous ref: tensor " << ref_first << " in position " << index_in_list_first << " of contiguous list " << index_first << " and tensor " << ref_second << " in position " << index_in_list_second << " of contiguous list " << index_second; } contiguous_ref_list_error_check_map[index_first][index_second].insert(index_in_list_first); } else { MS_LOG(WARNING) << "Contiguous list " << index_first << " associated (ref node) with two other contiguous lists: " << contiguous_list_with_ref_index_map[index_first] << " and " << index_second; } } for (auto check_list_pair : contiguous_ref_list_error_check_map) { auto first_list = check_list_pair.first; auto index_set_map = check_list_pair.second; for (auto index_set : index_set_map) { auto second_list = index_set.first; if (contiguous_tensors_list_[first_list].size() != contiguous_tensors_list_[second_list].size()) { MS_LOG(WARNING) << "Contiguous lists " << first_list << " and " << second_list << " considered in ref do not have the same size"; } for (size_t x = 0; x < contiguous_tensors_list_[second_list].size(); x++) { if (contiguous_ref_list_error_check_map[first_list][second_list].count(x) == 0) { MS_LOG(WARNING) << "Contiguous lists " << first_list << " and " << second_list << " considered in ref: ref pair at in-lists index " << x << " has not been considered"; } } } } return contiguous_list_with_ref_index_map; } std::map Somas::GetRefTensorsInContiguousList() { // key: refnode input value: refnode output std::map ref_tensors_in_contiguous_map; for (auto ref_node_list : ref_node_constraints_) { // Count contiguous tensors in ref list size_t contiguous_in_ref_list = std::count_if(ref_node_list.begin(), ref_node_list.end(), [this](size_t tid) { return tensors_map_[tid]->contiguous_; }); // Keep info about contiguous and check for errors if (ref_node_list.size() > kRefNodeTensorNum && contiguous_in_ref_list > 0) { MS_LOG(WARNING) << "Ref node of size greater than two with at least one contiguous tensor in"; } if (ref_node_list.size() == kRefNodeTensorNum && contiguous_in_ref_list == 1) { MS_LOG(WARNING) << "Ref node of size two with only one contiguous tensor" << ref_node_list[0] << ":" << tensors_map_[ref_node_list[0]]->contiguous_ << ", " << ref_node_list[1] << ":" << tensors_map_[ref_node_list[1]]->contiguous_; } if (ref_node_list.size() == kRefNodeTensorNum && contiguous_in_ref_list == kRefNodeTensorNum) { ref_tensors_in_contiguous_map[ref_node_list[0]] = ref_node_list[1]; } } return ref_tensors_in_contiguous_map; } void Somas::UpdateContiguousTensorsOffset(const std::map &contiguous_ref_list_map) { // Handle contiguous ref node for (auto ref_list_pair : contiguous_ref_list_map) { size_t index_first = ref_list_pair.first; size_t index_second = ref_list_pair.second; for (size_t x = 0; x < contiguous_tensors_list_[index_second].size(); x++) { tensors_map_[contiguous_tensors_list_[index_second][x]]->offset_ = tensors_map_[contiguous_tensors_list_[index_first][x]]->offset_; } } // Contiguous gaps postprocessing for (auto list : contiguous_tensors_list_) { tensors_map_[list[0]]->offset_ += kGapSize; } } void Somas::UpdateRefTensorsOffset() { // Ref Node Postprocessing MS_LOG(INFO) << "\nStart Solving Postprocessing for Ref Node"; // Set offset for rest of ref node list (ignored by solver due to ref node preprocessing) for (auto ref_node_list : ref_node_constraints_) { for (size_t i = 1; i < ref_node_list.size(); ++i) { tensors_map_[ref_node_list[i]]->offset_ = tensors_map_[ref_node_list[0]]->offset_; } } } void Somas::UpdateRefOverlapTensorsConflicts() { // Ref Overlap Preprocessing MS_LOG(INFO) << "Start Solving Preprocessing for Ref Overlap"; // In ConflictComputing(), by use of ref_overlap_ flag, each tensor in a ref_overlap_list has all entries 1 in // cannot_reuse_ array Here, we allow reuse only among tensors in same list for (auto ref_overlap_list : ref_overlap_constraints_) { for (size_t tid_1 : ref_overlap_list) { for (size_t tid_2 : ref_overlap_list) { reuse_matrix_[tid_1].SetBitTrue(tid_2); reuse_matrix_[tid_2].SetBitTrue(tid_1); } } } MS_LOG(INFO) << "End Solving Preprocessing for Ref Overlap"; } void Somas::UpdateRefTensorsConflict() { // Keep all constraints for first tensor in list for (auto ref_node_list : ref_node_constraints_) { size_t tid_0 = ref_node_list[0]; for (SomasTensorPtr tensor : tensors_list_) { if (reuse_matrix_[tid_0].IsBitTrue(tensor->GetId()) == false) { continue; } for (size_t tid : ref_node_list) { if (reuse_matrix_[tid].IsBitTrue(tensor->GetId()) == false) { reuse_matrix_[tid_0].SetBitFalse(tensor->GetId()); reuse_matrix_[tensor->GetId()].SetBitFalse(tid_0); break; } } } // Set rest to size 0, so that solver ignores them (if not contiguous) for (size_t i = 1; i < ref_node_list.size(); ++i) { if (!tensors_map_[ref_node_list[i]]->contiguous_) { tensors_map_[ref_node_list[i]]->aligned_size_ = 0; } } } } std::string Somas::GetSplitName(const std::string &scope_name) const { auto index = scope_name.rfind('/'); if (index == std::string::npos) { return scope_name; } else { if (index < scope_name.size() - 1) { auto split_name = scope_name.substr(index + 1); return split_name; } return scope_name; } } std::string Somas::SomasInfo(bool calc_hash) const { std::ostringstream oss; if (!calc_hash) { DumpParameters(oss); } DumpTensors(oss); DumpNodes(oss); oss << "\n\nAll Stream Groups:\n\n"; for (const auto &stream_group : streams_groups_) { for (const auto &stream : stream_group) { oss << "stm" << stream << " "; } oss << "\n"; } if (!ref_node_constraints_.empty()) { oss << "\n\nAll Ref Node Info:\n\n"; for (const auto &ref_in_out : ref_node_constraints_) { oss << "refnode input-output:"; for (const auto &item : ref_in_out) { oss << "%" << item << "T "; } oss << "\n"; } } return oss.str(); } void Somas::DumpNodes(std::ostringstream &oss) const { oss << "\n\nAll Nodes:\n\n"; for (const auto &node : nodes_list_) { MS_EXCEPTION_IF_NULL(node); auto scope_name = node->scope_full_name_; std::string split_name = GetSplitName(scope_name); oss << "$" << node->GetId() << "\t" << split_name << "\t" << static_cast(node->GetType()) << "\t"; auto input_num = node->input_tensors_.size() + node->input_parameters_map_.size(); oss << "inputs["; size_t tensor_index = 0; for (size_t input_index = 0; input_index < input_num; input_index++) { auto iter = node->input_parameters_map_.find(input_index); if (iter != node->input_parameters_map_.end()) { oss << "%" << iter->second->id_ << "P" << ", "; } else { oss << "%" << node->input_tensors_[tensor_index]->GetId() << "T" << ", "; tensor_index++; } } oss << "]"; oss << "\toutputs["; for (const auto &out : node->output_tensors_) { MS_EXCEPTION_IF_NULL(out); oss << "%" << out->GetId() << "T" << ", "; } oss << "]"; oss << "\tworkspace["; for (const auto &wk : node->workspace_tensors_) { MS_EXCEPTION_IF_NULL(wk); oss << "%" << wk->GetId() << "T" << ", "; } oss << "]"; oss << "\tstreamID[" << "@" << node->GetStream()->GetId() << "]\n"; } } void Somas::DumpTensors(std::ostringstream &oss) const { oss << "\n\nAll Tensors:\n\n"; oss << "index:" << "\tsize:" << "\treal_size:" << "\toffset:" << "\taddr:" << "\ttype:" << "\tlifelong:" << "\tlife_start:" << "\tlife_end:" << "\tsource node name:\n"; for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); auto scope_name = tensor->GetSourceNode()->scope_full_name_; std::string split_name = GetSplitName(scope_name); oss << "%" << tensor->GetId() << "T" << "\t" << "#" << tensor->GetAlignedSize() << "S" << "\t" << "#" << tensor->GetOriginalSize() << "S" << "\t" << "&" << tensor->GetOffset() << "" << "\t" << "&" << static_cast(tensor->GetOffset() + mem_base_addr_) << "\t" << tensor_type_name_map[tensor->type_] << "\t" << tensor->IsLifelong() << "\t" << tensor->lifetime_.start_ << "\t" << tensor->lifetime_.end_ << "\t" << split_name << "\n"; } } void Somas::DumpParameters(std::ostringstream &oss) const { oss << "All Parameters:\n\n"; oss << "index:" << "\tsize:" << "\tstart_addr:" << "\tsource node name:" << "\tnode out index:\n"; for (const auto ¶m : parameters_list_) { MS_EXCEPTION_IF_NULL(param); oss << "%" << param->id_ << "P" << "\t" << "#" << param->size_ << "S" << "\t" << "&" << param->addr_ << "\t" << param->source_node_name_ << "\t" << param->output_index_ << "\n"; } } void Somas::DumpSomasInfoIR(const string filename) const { (void)Common::SaveStringToFile(filename, SomasInfo()); } std::string Somas::Offline() const { std::ostringstream oss; for (auto tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); if (tensor->IsOutputOnly() || tensor->type_ == TensorType::kRefNodeOutput) { oss << "Somas EDGE ERROR src=n" << tensor->GetSourceNode()->GetId() << ", srcstm=" << tensor->GetSourceStream()->GetId() << ", dst=nc" << ", dststm=nc" << ", workspace=0, size=" << tensor->GetOriginalSize() << ", lifelong=" << static_cast(tensor->lifelong_value_) << ", tid=" << tensor->GetId() << ", start=" << tensor->lifetime_.start_ << ", end=" << tensor->lifetime_.end_ << std::endl; } else { std::map dest_infos; for (SomasNodePtr dest_node : tensor->destinations_) { dest_infos.insert(std::make_pair(dest_node->GetId(), dest_node->GetStream()->GetId())); } for (auto dest_info : dest_infos) { oss << "Somas EDGE src=n" << tensor->GetSourceNode()->GetId() << ", srcstm=" << tensor->GetSourceStream()->GetId() << ", dst=n" << dest_info.first << ", dststm=" << dest_info.second << ", workspace=" << static_cast(tensor->type_ == kWorkspace) << ", size=" << tensor->GetOriginalSize() << ", lifelong=" << static_cast(tensor->lifelong_value_) << ", tid=" << tensor->GetId() << ", start=" << tensor->lifetime_.start_ << ", end=" << tensor->lifetime_.end_ << std::endl; } } } for (vector tList : contiguous_tensors_list_) { oss << "Somas CONTIGUOUS"; for (size_t tid : tList) { oss << " " << tid; } oss << std::endl; } for (const auto &group : streams_groups_) { oss << "Somas GROUP"; for (int64_t sid : group) { oss << " " << sid; } oss << std::endl; } return oss.str(); } void Somas::DumpOfflineIR(const string filename) const { MS_LOG(INFO) << "Printing somas-log-from-graph log: " << filename; (void)Common::SaveStringToFile(filename, Offline()); } std::string Somas::SomasMemory() const { std::ostringstream oss; std::map mem_map; for (auto tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); mem_map[tensor->GetOffset()] = 0; } size_t num = 0; for (auto iter = mem_map.begin(); iter != mem_map.end(); ++iter, ++num) { iter->second = num; } std::map> mem_list; for (const auto &output_tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(output_tensor); size_t key = output_tensor->offset_; auto iter = mem_list.find(key); if (iter == mem_list.end()) { std::map id_tensor_map; id_tensor_map[output_tensor->GetId()] = output_tensor; mem_list[key] = id_tensor_map; } else { iter->second[output_tensor->GetId()] = output_tensor; } } oss << "mem_id:" << "\tstart_offset:" << "\tend_offset:" << "\ttensor_id:" << "\torigin_size:" << "\talign_size:" << "\tstart_addr:" << "\tend_addr:" << "\ttype:" << "\tsrc_node:" << "\tsrc_stm_id:" << "lifetime_start\t" << "lifetime_end\n"; for (const auto &mem : mem_list) { auto id_tensor_map = mem.second; for (const auto &id_tensor : id_tensor_map) { auto place_tensor = id_tensor.second; MS_EXCEPTION_IF_NULL(place_tensor); std::string scope_name; size_t src_stm_id = 0xffff; if (place_tensor->GetSourceNode() != nullptr) { scope_name = place_tensor->GetSourceNode()->scope_full_name_; src_stm_id = place_tensor->GetSourceNode()->GetStream()->GetId(); } else { scope_name = "Somas Tensor"; } std::string split_name = GetSplitName(scope_name); oss << "#" << mem_map[place_tensor->GetOffset()] << "\t" << place_tensor->GetOffset() << "\t" << place_tensor->GetOffset() + place_tensor->GetAlignedSize() << "\t%" << place_tensor->GetId() << "T\t" << place_tensor->GetOriginalSize() << "\t" << place_tensor->GetAlignedSize() << "\t&" << static_cast(place_tensor->GetOffset() + mem_base_addr_) << "\t&" << static_cast(place_tensor->GetOffset() + mem_base_addr_ + place_tensor->GetAlignedSize()) << "\t" << tensor_type_name_map[place_tensor->type_] << "\t" << split_name << "\tstm" << src_stm_id << "\t" << place_tensor->lifetime_.start_ << "\t" << place_tensor->lifetime_.end_ << "\n"; } } return oss.str(); } void Somas::DumpSomasMemoryIR(const string filename) const { (void)Common::SaveStringToFile(filename, SomasMemory()); } size_t Somas::CalcLowerBound() const { size_t max_node_id = std::accumulate(tensors_list_.begin(), tensors_list_.end(), 0, [](size_t max_id, auto tensor) { return std::max(max_id, tensor->lifetime_.end_); }); std::map lifetime_lb; for (size_t time = 0; time <= max_node_id; time++) { lifetime_lb[time] = 0; } size_t lower, upper; for (auto tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); if (tensor->lifelong_value_ == kLifeLongGraphAll) { lower = 0; upper = max_node_id; } else { lower = tensor->lifetime_.start_; upper = tensor->lifetime_.end_; } for (size_t time = lower; time <= upper; time++) { lifetime_lb[time] += tensor->GetAlignedSize(); } } size_t max_lifetime = 0; for (size_t time = 0; time <= max_node_id; time++) { if (max_lifetime < lifetime_lb[time]) { max_lifetime = lifetime_lb[time]; } } return max_lifetime; } void Somas::GenGraphStatisticInfo() { lower_bound_ = CalcLowerBound(); for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); upper_bound_ += tensor->aligned_size_; if (tensor->type_ == kWorkspace) { workspace_total_size_ += tensor->aligned_size_; } if (tensor->lifelong_value_ == kLifeLongGraphAll) { lifelong_all_total_size_ += tensor->aligned_size_; } else if (tensor->lifelong_value_ == kLifeLongGraphStart) { lifelong_start_total_size_ += tensor->aligned_size_; } else if (tensor->lifelong_value_ == kLifeLongGraphEnd) { lifelong_end_total_size_ += tensor->aligned_size_; } } const double giga = 1024. * 1024. * 1024.; MS_LOG(INFO) << "Lower Bound: " << lower_bound_ << " (" << lower_bound_ / giga << " GB), Upper Bound: " << upper_bound_ << " (" << upper_bound_ / giga << " GB)"; MS_LOG(INFO) << "\nTotal Dynamic Size (Upper Bound):\t" << upper_bound_ << "\n" << "Theoretical Optimal Size (Lower Bound):\t" << lower_bound_ << "\n" << "Total Workspace Size:\t" << workspace_total_size_ << "\n" << "Total Communication Input Tensor Size:\t" << comm_input_total_size_ << "\n" << "Total Communication Output Tensor Size:\t" << comm_output_total_size_ << "\n" << "Total LifeLong All Tensor Size:\t" << lifelong_all_total_size_ << "\n" << "Total LifeLong Start Tensor Size:\t" << lifelong_start_total_size_ << "\n" << "Total LifeLong End Tensor Size:\t" << lifelong_end_total_size_ << "\n" << "Reused Size(Allocate Size):\t" << GetTotalMemSize() << "\n\n\n"; } uint8_t *Somas::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const { MS_EXCEPTION_IF_NULL(node); auto key = node.get(); auto iter = nodes_map_.find(key); uint8_t *ptr = nullptr; if (iter != nodes_map_.end()) { auto &somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(somas_node); if (index >= somas_node->output_tensors_.size()) { MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:[" << somas_node->output_tensors_.size() << "]"; } auto output_tensor = somas_node->output_tensors_[index]; ptr = mem_base_addr_ + output_tensor->offset_; } else { MS_LOG(EXCEPTION) << "node [" << AnfAlgo::GetCNodeName(node) << "] don't exist in nodes_map"; } return ptr; } uint8_t *Somas::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const { MS_EXCEPTION_IF_NULL(node); auto key = node.get(); auto iter = nodes_map_.find(key); uint8_t *ptr = nullptr; if (iter != nodes_map_.end()) { auto &somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(somas_node); if (index >= somas_node->workspace_tensors_.size()) { MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:[" << somas_node->workspace_tensors_.size() << "]"; } auto workspace_tensor = somas_node->workspace_tensors_[index]; ptr = mem_base_addr_ + workspace_tensor->offset_; } return ptr; } void Somas::ConvertToProfilingNode(uint32_t graph_id) { #ifdef ENABLE_D auto graph_node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); if (graph_node == nullptr) { graph_node = MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id; } for (const auto &tensor : tensors_list_) { TensorMemory tensor_memory; tensor_memory.SetTensorId(tensor->GetId()); tensor_memory.SetAlignedSize(tensor->GetAlignedSize()); tensor_memory.SetType(tensor_type_name_map[tensor->type_]); tensor_memory.SetLifeStart(tensor->lifetime_.start_); tensor_memory.SetLifeEnd(tensor->lifetime_.end_); tensor_memory.SetLifeLong(life_long_name_map[tensor->lifelong_value_]); graph_node->AddTensorMemory(tensor_memory); } for (const auto &node : nodes_list_) { NodeMemory node_memory; std::string name = GetSplitName(node->scope_full_name_); node_memory.SetNodeName(name); node_memory.SetNodeId(node->GetId()); for (const auto &input_tensor : node->input_tensors_) { node_memory.AddInputTensorId(input_tensor->GetId()); } for (const auto &output_tensor : node->output_tensors_) { node_memory.AddOutputTensorId(output_tensor->GetId()); } for (const auto &workspace_tensor : node->workspace_tensors_) { node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId()); } graph_node->AddNodeMemory(node_memory); } #endif } } // namespace somas } // namespace mindspore