
Pre Merge pull request !932 from TangQunzhang/development

Branch: pull/932/MERGE
Author: TangQunzhang (Gitee), 5 years ago
Parent commit: e2c5c66daf

6 changed files with 450 additions and 675 deletions
1. ge/graph/build/memory/binary_block_mem_assigner.cc   +2 / -2
2. ge/graph/build/memory/block_mem_assigner.cc          +162 / -87
3. ge/graph/build/memory/block_mem_assigner.h           +40 / -14
4. ge/graph/build/memory/graph_mem_assigner.cc          +244 / -554
5. ge/graph/build/memory/graph_mem_assigner.h           +2 / -18
6. ge/graph/build/memory/graph_mem_assigner.zip         BIN

ge/graph/build/memory/binary_block_mem_assigner.cc (+2 / -2)

@@ -69,8 +69,8 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
GELOGW("Vector all_memory_size is empty!"); GELOGW("Vector all_memory_size is empty!");
return SUCCESS; return SUCCESS;
} }
if ((all_memory_size.front() == 0) || (log(kLogBase) == 0)) {
GELOGE(FAILED, "dividend is 0!");
if ((all_memory_size.front() <= 0) || (log(kLogBase) == 0)) {
GELOGE(FAILED, "Memory size:%ld is invalid.", all_memory_size.front());
return FAILED; return FAILED;
} }
// Memory size is 512 aligned, so it is not necessary to take less than 512 // Memory size is 512 aligned, so it is not necessary to take less than 512
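Note: the strengthened guard matters because unknown-shape outputs can report a size of -1 (see the matching check added in block_mem_assigner.cc below), and this value is later used as a divisor. A minimal standalone sketch (not GE code) of the patched condition:

// Minimal sketch (not GE code): why the guard rejects non-positive sizes.
// Unknown-shape tensors report size -1, and the range computation divides
// by the smallest memory size, so anything <= 0 must be rejected up front.
#include <cmath>
#include <cstdint>
#include <cstdio>

bool CheckFirstMemorySize(int64_t first_size, double log_base) {
  // Mirrors the patched condition: reject 0 *and* negative sizes.
  if ((first_size <= 0) || (std::log(log_base) == 0)) {
    std::fprintf(stderr, "Memory size:%ld is invalid.\n", static_cast<long>(first_size));
    return false;
  }
  return true;
}

int main() {
  CheckFirstMemorySize(-1, 2.0);   // unknown-shape case, now rejected
  CheckFirstMemorySize(512, 2.0);  // normal 512-aligned size, accepted
  return 0;
}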


ge/graph/build/memory/block_mem_assigner.cc (+162 / -87)

@@ -65,10 +65,7 @@ void AlignMemOffset(size_t &mem_align_size) {
 }

 static bool CompareLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
-  auto left_node_op_desc = left.node->GetOpDesc();
-  auto right_node_op_desc = right.node->GetOpDesc();
-  if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)
-      && (left_node_op_desc->GetId() < right_node_op_desc->GetId())) {
+  if (left.GetLifeBegin() < right.GetLifeBegin()) {
     return true;
   }
   return false;
@@ -100,14 +97,14 @@ bool CrossLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
   auto left_node_op_desc = left.node->GetOpDesc();
   auto right_node_op_desc = right.node->GetOpDesc();
   if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)) {
-    if (left_node_op_desc->GetId() < right_node_op_desc->GetId()) {
-      if (left.life_time_end >= static_cast<size_t>(right_node_op_desc->GetId())) {
+    if (left.GetLifeBegin() < right.GetLifeBegin()) {
+      if (left.life_time_end >= right.GetLifeBegin()) {
         return true;
       }
-    } else if (left_node_op_desc->GetId() == right_node_op_desc->GetId()) {
+    } else if (left.GetLifeBegin() == right.GetLifeBegin()) {
       return true;
     } else {
-      if (right.life_time_end >= static_cast<size_t>(left_node_op_desc->GetId())) {
+      if (right.life_time_end >= left.GetLifeBegin()) {
         return true;
       }
     }
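Note: a minimal sketch (not GE code) of the interval test CrossLifeTime now performs once both lifetimes are expressed as [begin, end] pairs via GetLifeBegin():

// Illustrative sketch (not GE code): two lifetimes [begin, end] cross when
// the earlier-starting one is still alive at the other's begin, or when
// both start at the same topological id.
#include <cstddef>

struct Life {
  size_t begin;  // topological id where the tensor becomes live
  size_t end;    // topological id of its last use
};

bool Cross(const Life &left, const Life &right) {
  if (left.begin < right.begin) {
    return left.end >= right.begin;
  } else if (left.begin == right.begin) {
    return true;
  }
  return right.end >= left.begin;
}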
@@ -325,12 +322,7 @@ void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_
 size_t MemoryBlock::GetLifeBegin() {
   size_t life_time = 0;
   if (!node_type_index_list_.empty()) {
-    if (node_type_index_list_.front().node != nullptr) {
-      auto node_op_desc = node_type_index_list_.front().node->GetOpDesc();
-      if (node_op_desc != nullptr) {
-        life_time = node_op_desc->GetId();
-      }
-    }
+    life_time = node_type_index_list_.front().GetLifeBegin();
   }
   return life_time;
 }
@@ -417,7 +409,7 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_
   depend_stream_life_[stream_id_] = GetLifeBegin();
 }

-size_t MemoryBlock::GetLifeEnd() {
+size_t MemoryBlock::GetLifeEnd() const {
   if (!node_type_index_list_.empty()) {
     return node_type_index_list_.back().life_time_end;
   }
@@ -571,32 +563,29 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {


   for (auto &out_anchor : n->GetAllOutDataAnchors()) {
     GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx());
-    bool reuse_input = false;
-    GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInput(output_desc, reuse_input) != SUCCESS,
-                    GELOGI("Get reuse_input failed"));
-
-    if (!reuse_input) {
-      int64_t size = 0;
-      GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
-      batch_all_memory_size[batch_label].emplace_back(size);
-      if (batch_total_size.find(batch_label) == batch_total_size.end()) {
-        batch_total_size[batch_label] = size;
-      } else {
-        batch_total_size[batch_label] += size;
-      }
-
-      if (!anchor_to_symbol_.empty()) {
-        auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
-        if (iter1 == anchor_to_symbol_.end()) {
-          continue;
-        }
-        const std::string &symbol = iter1->second;
-        auto iter2 = symbol_size_.find(symbol);
-        if (iter2 == symbol_size_.end()) {
-          symbol_size_[symbol] = size;
-        } else if (size > static_cast<int64_t>(iter2->second)) {
-          iter2->second = size;
-        }
-      }
-    }
+    int64_t size = 0;
+    GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
+    GE_IF_BOOL_EXEC(size < 0, GELOGE(FAILED, "Node:%s size:%ld is invalid, maybe it is unknown shape node.",
+                    node_op_desc->GetName().c_str(), size);
+                    return;);
+    batch_all_memory_size[batch_label].emplace_back(size);
+    if (batch_total_size.find(batch_label) == batch_total_size.end()) {
+      batch_total_size[batch_label] = size;
+    } else {
+      batch_total_size[batch_label] += size;
+    }
+
+    if (!anchor_to_symbol_.empty()) {
+      auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
+      if (iter1 == anchor_to_symbol_.end()) {
+        continue;
+      }
+      const std::string &symbol = iter1->second;
+      auto iter2 = symbol_size_.find(symbol);
+      if (iter2 == symbol_size_.end()) {
+        symbol_size_[symbol] = size;
+      } else if (size > static_cast<int64_t>(iter2->second)) {
+        iter2->second = size;
+      }
+    }
   }
@@ -637,35 +626,17 @@ bool IsDirectOutputNode(const NodePtr &node, int idx) {
   return false;
 }

-void AddReusableBlockCount(const MemoryBlock &mem_block, map<string, uint64_t> &reusable_block_counts) {
-  string key = std::to_string(mem_block.Size());
-  key += "_" + std::to_string(mem_block.stream_id_);
-  key += "_" + std::to_string(mem_block.memory_type_);
-  auto it = reusable_block_counts.find(key);
-  if (it != reusable_block_counts.end()) {
-    it->second++;
-  } else {
-    reusable_block_counts[key] = 1;
-  }
-}
-
-void ReduceReusableBlockCount(const MemoryBlock &mem_block, map<string, uint64_t> &reusable_block_counts) {
-  string key = std::to_string(mem_block.Size());
-  key += "_" + std::to_string(mem_block.stream_id_);
-  key += "_" + std::to_string(mem_block.memory_type_);
-  auto it = reusable_block_counts.find(key);
-  if (it != reusable_block_counts.end()) {
-    if (it->second > 0) {
-      it->second--;
-    }
-  }
-}
-
-bool CanReuseBySize(const map<string, uint64_t> &reusable_block_counts, const MemoryBlock &reusable_block,
-                    size_t block_size, size_t real_size, bool continuous) {
+bool CanReuseBlock(size_t continuous_life_begin, const MemoryBlock &reusable_block, size_t block_size) {
   bool can_reuse = false;
   if (reusable_block.Size() == block_size) {
-    can_reuse = true;
+    // In some continuous-input cases, the first continuous input node is not the topologically first node.
+    if (continuous_life_begin > 0) {
+      if (continuous_life_begin > reusable_block.GetLifeEnd()) {
+        can_reuse = true;
+      }
+    } else {
+      can_reuse = true;
+    }
   }
   return can_reuse;
 }
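Note: a minimal sketch (not GE code) of the new reuse rule: equal size still gates reuse, but a continuous-input requester must additionally start strictly after the candidate block's last use:

// Illustrative sketch (not GE code) of the new reuse rule: a free block of
// equal size is reusable, but when the requester belongs to a continuous-input
// group its earliest life begin must lie strictly after the block's last use.
#include <cstddef>

struct FreeBlock {
  size_t size;
  size_t life_end;  // topological id of the block's last user
};

bool CanReuse(size_t continuous_life_begin, const FreeBlock &block, size_t wanted_size) {
  if (block.size != wanted_size) {
    return false;
  }
  // continuous_life_begin == 0 means "not in a continuous-input group".
  return (continuous_life_begin == 0) || (continuous_life_begin > block.life_end);
}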
@@ -676,6 +647,13 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
   if (n == nullptr || n->GetAllOutDataAnchors().size() <= 0) {
     return false;
   }
+  auto node_desc = n->GetOpDesc();
+  GE_IF_BOOL_EXEC(node_desc == nullptr, GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str());
+                  return false;);
+  std::vector<int64_t> offsets_for_fusion = {};
+  bool has_lx_fusion_attr =
+      AttrUtils::GetListInt(node_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);

   if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) {
     auto out_anchor = n->GetOutDataAnchor(out_index);
     GE_IF_BOOL_EXEC(out_anchor == nullptr,
@@ -698,16 +676,17 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
                     return false;);

     // If GetBool fail, is_input_continuous is false.
-    bool is_input_continuous_no_padding = false;
-    (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT,
-                                 is_input_continuous_no_padding);
-    if (is_input_continuous_no_padding) {
+    (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous);
+    if (is_input_continuous) {
       reset_zero_copy_flag = true;
-      return false;
+      has_lx_fusion_attr = true;
+    } else {
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
     }
-    (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);

-    GE_IF_BOOL_EXEC(is_input_continuous && CheckIsZeroMemNodeType(peer_node->GetType()),
+    // lx_fusion memory only assigns the first input; some of broadcast's inputs are variable and some
+    // are not, so the rest are reassigned later.
+    GE_IF_BOOL_EXEC(is_input_continuous &&
+                    (CheckIsZeroMemNodeType(peer_node->GetType()) || (has_lx_fusion_attr && (peer_in_anchor->GetIdx() != 0))),
                     GELOGI("Node[%s] output[%u] no_need_assign_memory.", n->GetName().c_str(), out_index);
                     no_need_assign_memory = true;
                     return false;);
@@ -721,6 +700,10 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
     // Only set attr once.
     if (node_continuous_input_blocks_[peer_in_node_desc->GetName()].size() == 0) {
       (void)ge::AttrUtils::SetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true);
+      // The lx fusion case assigns the max size to the first block, so it can be reused as non-continuous memory.
+      GE_IF_BOOL_EXEC(has_lx_fusion_attr,
+                      is_op_reuse_mem_ = IsContinuousMemoryReuse(n, peer_node, out_index);
+                      return false;);
       node_continuous_input_counts_[peer_in_node_desc->GetName()] = peer_node->GetAllInDataAnchorsSize();
     }
     peer_input_index = peer_in_anchor->GetIdx();
@@ -733,6 +716,95 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
   return false;
 }

+bool IsContinuousInputNodeMaxLife(const NodePtr &n, uint32_t out_index) {
+  if (n == nullptr) {
+    return false;
+  }
+
+  int64_t max_node_life_time = 0;
+  int64_t continuous_input_node_life_time = 0;
+  if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) {
+    auto out_anchor = n->GetOutDataAnchor(out_index);
+    if (out_anchor == nullptr) {
+      return false;
+    }
+
+    // The continuous input node's life time should be the max.
+    for (auto const &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) {
+      if ((peer_in_anchor == nullptr) || (peer_in_anchor->GetOwnerNode() == nullptr)) {
+        return false;
+      }
+      auto peer_in_node_desc = peer_in_anchor->GetOwnerNode()->GetOpDesc();
+      GE_IF_BOOL_EXEC(peer_in_node_desc == nullptr,
+                      GELOGE(FAILED, "Node[%s] output[%u] peer in node desc is null.", n->GetName().c_str(), out_index);
+                      return false;);
+
+      if (peer_in_node_desc->GetId() > max_node_life_time) {
+        max_node_life_time = peer_in_node_desc->GetId();
+      }
+
+      // If GetBool fail, is_input_continuous is false.
+      bool is_input_continuous = false;
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous);
+      if (!is_input_continuous) {
+        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
+      }
+      if (is_input_continuous) {
+        continuous_input_node_life_time = peer_in_node_desc->GetId();
+      }
+    }
+  }
+  return ((max_node_life_time != 0) && (continuous_input_node_life_time == max_node_life_time));
+}
+
+///
+/// @ingroup GE
+/// @brief Check whether continuous memory is reusable
+/// @return bool
+///
+bool BlockMemAssigner::IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index) {
+  // n and peer_node_desc have been checked
+  auto node_desc = n->GetOpDesc();
+  auto peer_node_desc = peer_node->GetOpDesc();
+  continuous_life_begin_ = static_cast<size_t>(node_desc->GetId());
+  // lx fusion case: check all continuous input nodes; the first input node's life time should be the min.
+  for (const auto &in_anchor : peer_node->GetAllInDataAnchors()) {
+    if ((in_anchor == nullptr) || (in_anchor->GetPeerOutAnchor() == nullptr) ||
+        (in_anchor->GetPeerOutAnchor()->GetOwnerNode() == nullptr) ||
+        (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr)) {
+      GELOGE(FAILED, "Node[%s] output[%u] peer input node desc is null.", n->GetName().c_str(), out_index);
+      return false;
+    }
+    auto peer_out_node_desc = in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc();
+    ///
+    ///  node2  node1  node3
+    ///    |    /   /    |
+    ///    node5      node6
+    /// The first input node's life time is not the min:
+    /// when node5's first input node2's life time is not the min (node2 > node1), use node1's life time for reuse.
+    ///
+    if (peer_out_node_desc->GetId() < continuous_life_begin_) {
+      continuous_life_begin_ = static_cast<size_t>(peer_out_node_desc->GetId());
+      GELOGI(
+          "Node[%s] life[%ld] output[%u] is not continuous input node[%s] life[%ld]'s min life time,"
+          "min is node[%s] life[%zu]",
+          n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(),
+          peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), continuous_life_begin_);
+    }
+    // When node3's output node5's life time is not the max (node6 > node5), do not reuse.
+    if (!IsContinuousInputNodeMaxLife(in_anchor->GetPeerOutAnchor()->GetOwnerNode(),
+                                      in_anchor->GetPeerOutAnchor()->GetIdx())) {
+      GELOGI(
+          "Node[%s] life[%ld] output[%u]'s continuous input node[%s] life[%ld]'s is not node[%s] output[%d]'s "
+          "max life node",
+          n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(),
+          peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), in_anchor->GetPeerOutAnchor()->GetIdx());
+      return false;
+    }
+  }
+  return true;
+}

 ///
 /// @ingroup GE
 /// @brief Check pre_reuse flag & post_reuse flag for each symbol
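Note: a minimal sketch (not GE code) of the lifetime rule the two helpers above enforce for an lx-fusion continuous-input group:

// Minimal sketch (not GE code): reuse is only safe when the group's life
// begins at its earliest producer, and the continuous consumer outlives
// every other consumer of each producer's output.
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

struct Producer {
  int64_t id;                         // topological id of the producer node
  std::vector<int64_t> consumer_ids;  // ids of all consumers of its output
  int64_t continuous_consumer_id;     // id of the continuous-input consumer
};

// Returns the group's life begin, or -1 when reuse must be disabled.
int64_t GroupLifeBegin(const std::vector<Producer> &group) {
  int64_t begin = std::numeric_limits<int64_t>::max();
  for (const auto &p : group) {
    if (p.consumer_ids.empty()) {
      return -1;
    }
    begin = std::min(begin, p.id);
    int64_t max_consumer = *std::max_element(p.consumer_ids.begin(), p.consumer_ids.end());
    if (p.continuous_consumer_id != max_consumer) {
      return -1;  // another consumer outlives the continuous one: not reusable
    }
  }
  return begin;
}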
@@ -1018,8 +1090,9 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
     GE_IF_BOOL_EXEC(reusable_block->batch_label_ != batch_label, continue);

     // A node can reuse blocks of the same stream and preorder streams
-    if (CanReuseBySize(reusable_block_counts_, *reusable_block, block_size, real_size, continuous)) {
-      reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false}, real_size, no_align_size);
+    if (CanReuseBlock(continuous_life_begin_, *reusable_block, block_size)) {
+      reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_},
+                                       real_size, no_align_size);
       if (mem_type == kOutput) {
         auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString());
         if (iter != anchor_to_symbol_.end()) {
@@ -1028,7 +1101,6 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
       }
       reusable_block->continuous_block_ = continuous;
       reusable_block->ref_count_++;
-      ReduceReusableBlockCount(*reusable_block, reusable_block_counts_);
       reusable_blocks_[memory_type][stream_id].erase((++it).base());
       return reusable_block;
     }
@@ -1041,8 +1113,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,


   // Data and netoutput need zero copy block
   block->is_zero_copy_ = IsZeroCopyBlock(n, continuous);
-
-  block->Init(real_size, mem_type, n, out_index, no_align_size, node_op_desc->GetStreamId());
+  block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_}, real_size, no_align_size);
   block->stream_id_ = node_op_desc->GetStreamId();
   block->ref_count_++;
   block->continuous_block_ = continuous;
@@ -1142,7 +1213,12 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
   std::string symbol;
   if (IsSymbolExist(node_index_io, symbol)) {
     block = symbol_blocks_[symbol];
-    block->AddNodeTypeIndex({n, kOutput, index, true}, size, no_align_size);
+    GE_IF_BOOL_EXEC(block == nullptr, GELOGE(FAILED, "Node %s ref block is nullptr.", node_op_desc->GetName().c_str());
+                    return nullptr);
+    auto block_size = GetBlockSize(size, ranges);
+    block->SetSize(block_size);
+    block->SetLifeTimeEnd(life_time_);
+    block->AddNodeTypeIndex({n, kOutput, index, true, continuous_life_begin_}, size, no_align_size);
     block->ref_count_++;
   } else {
     int64_t max_size = size;
@@ -1196,7 +1272,6 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
       GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInputIndex(*owner_node_op_desc, dst_reuse_input_index) != SUCCESS,
                       GELOGI("Get dst_reuse_input_index failed"));
       if (dst_reuse_input && (dst_reuse_input_index == static_cast<uint32_t>(in_anchor->GetIdx()))) {
-        block->AddNodeTypeIndex({owner_node, kOutput, i, true}, block->Size(), block->Size());
         out_count_reuse_input += 1;
         reuse_input = true;
       }
@@ -1237,7 +1312,7 @@ bool IsAtomicOutputMemory(const ge::NodePtr &node, uint32_t output_index, bool i
     if (static_cast<uint32_t>(index) == output_index) {
       if (node->GetOwnerComputeGraph() != nullptr) {
         string graph_name = node->GetOwnerComputeGraph()->GetName();
-        GELOGD("[IMAS]Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(),
+        GELOGD("Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(),
                op_desc->GetName().c_str(), index, op_desc->GetStreamId());
       }
       return true;
@@ -1275,7 +1350,6 @@ void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock
     if (to_release->same_stream_) {
       to_release->SetLifeTimeEnd(life_time_);
       reusable_memory.emplace_back(to_release);
-      AddReusableBlockCount(*to_release, reusable_block_counts_);
     }
   }
 }
@@ -1375,6 +1449,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
   }

   is_op_reuse_mem_ = true;
+  continuous_life_begin_ = 0;
   if (op_reuse_env_valid_ == true) {
     vector<string>::iterator it_name =
         std::find(op_no_reuse_mem_vec_.begin(), op_no_reuse_mem_vec_.end(), op_desc->GetName());
@@ -1426,7 +1501,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
       continue;
     }
     // atomic can't be reused
-    bool need_change = is_op_reuse_mem_ && out_node_set_continuous_input && is_atomic;
+    bool need_change = is_op_reuse_mem_ && is_atomic;
     if (need_change) {
       is_op_reuse_mem_ = false;
     }
@@ -1820,10 +1895,10 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
     op_desc->SetWorkspace(workspace_list);
   }
   GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu] noalignsize[%zu] "
-         "life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", graph_name.c_str(),
+         "life time begin[%s] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]", graph_name.c_str(),
          op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),
-         block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block_level, block->reuse_mem_,
-         block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input,
+         block->Size(), real_size, no_align_size, node_type.GetLifeBeginDesc().c_str(), end, child_block_level,
+         block->reuse_mem_, block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input,
          block->batch_label_.c_str());
 }




ge/graph/build/memory/block_mem_assigner.h (+40 / -14)

@@ -39,14 +39,15 @@ using DependStreamLife = std::map<int64_t, std::map<int64_t, size_t>>;
 enum OpMemoryType { kOutput, kWorkspace };

 struct NodeTypeIndex {
-  NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false)
-      : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input) {}
+  NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false, size_t begin = 0)
+      : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input), life_time_begin(begin) {}

   ge::NodePtr node = nullptr;
   OpMemoryType mem_type = kOutput;
   uint32_t index = 0;
-  size_t life_time_end = kMaxLifeTime;
   bool ref_input = false;
+  size_t life_time_begin = 0;
+  size_t life_time_end = kMaxLifeTime;
   const string GetMemType() const {
     if (mem_type == kOutput) {
       return "output";
@@ -55,6 +56,34 @@ struct NodeTypeIndex {
     }
     return "unknown";
   }

+  size_t GetLifeBegin() const {
+    if ((node == nullptr) || (node->GetOpDesc() == nullptr)) {
+      return 0;
+    }
+
+    if ((life_time_begin > 0) && (life_time_begin < static_cast<size_t>(node->GetOpDesc()->GetId()))) {
+      return life_time_begin;
+    } else {
+      return node->GetOpDesc()->GetId();
+    }
+  }
+
+  std::string GetLifeBeginDesc() const {
+    if (node == nullptr) {
+      return "";
+    }
+    auto node_op_desc = node->GetOpDesc();
+    if (node_op_desc != nullptr) {
+      auto life_begin = GetLifeBegin();
+      if (life_begin != static_cast<size_t>(node_op_desc->GetId())) {
+        return std::to_string(life_begin) + "-" + std::to_string(node_op_desc->GetId());
+      } else {
+        return std::to_string(node_op_desc->GetId());
+      }
+    }
+    return "";
+  }
 };
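Note: a hypothetical standalone illustration (not GE code) of what GetLifeBeginDesc() reports in the [IMAS] log line: "begin-id" when an earlier continuous-group begin overrides the node's own topological id, otherwise just the id:

#include <iostream>
#include <string>

// Mirrors NodeTypeIndex::GetLifeBeginDesc() with plain values.
std::string LifeBeginDesc(size_t life_time_begin, size_t node_id) {
  size_t life_begin = (life_time_begin > 0 && life_time_begin < node_id) ? life_time_begin : node_id;
  return life_begin != node_id ? std::to_string(life_begin) + "-" + std::to_string(node_id)
                               : std::to_string(node_id);
}

int main() {
  std::cout << LifeBeginDesc(3, 7) << "\n";  // prints "3-7"
  std::cout << LifeBeginDesc(0, 7) << "\n";  // prints "7"
  return 0;
}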


 class MemoryBlock {
@@ -86,16 +115,13 @@ class MemoryBlock {
     symbol_list_.clear();
   }

-  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size,
-            int64_t stream_id) {
-    real_size_list_.emplace_back(real_size);
-    no_align_size_list_.emplace_back(no_align_size);
-    node_type_index_list_.emplace_back(node, type, out_index, false);
-    if (stream_id != stream_id_) {
-      same_stream_ = false;
+  size_t Size() const { return block_size_; }
+
+  void SetSize(size_t size) {
+    if (size > block_size_) {
+      block_size_ = size;
     }
   }
-  size_t Size() const { return block_size_; }

   size_t AlignSize() const;
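Note: SetSize only ever grows a block, so a ref-reuse block resized for several symbols ends up at the max of all requested sizes rather than the last one. A tiny standalone sketch (not GE code):

#include <cstddef>

struct Block {
  size_t block_size = 0;
  void SetSize(size_t size) {
    if (size > block_size) {
      block_size = size;
    }
  }
};

int main() {
  Block b;
  b.SetSize(512);
  b.SetSize(256);  // smaller request: size stays 512
  return b.block_size == 512 ? 0 : 1;
}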


@@ -143,7 +169,7 @@ class MemoryBlock {


   size_t GetLifeBegin();

-  size_t GetLifeEnd();
+  size_t GetLifeEnd() const;

   void AddDependLifeBegin(DependStreamLife &node_depend_stream_life);


@@ -406,6 +432,7 @@ class BlockMemAssigner : public MemAssigner {
   bool IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name,
                                    uint32_t &peer_input_index, bool &no_need_assign_memory, bool &reset_zero_copy_flag);

+  bool IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index);
   ///
   /// @ingroup GE
   /// @|+++++++++block1++++++++|
@@ -425,8 +452,6 @@ class BlockMemAssigner : public MemAssigner {


   std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_;

-  std::map<std::string, uint64_t> reusable_block_counts_;
-
   std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> stream_workspace_blocks_;

   std::unordered_map<std::string, std::vector<MemoryBlock *>> node_out_blocks_;
@@ -456,6 +481,7 @@ class BlockMemAssigner : public MemAssigner {


   std::string max_batch_label_;

+  size_t continuous_life_begin_ = 0;
   ///
   /// @ [stream1][nodeid]
   /// @[nodeid] [stream2][nodeid]


ge/graph/build/memory/graph_mem_assigner.cc (+244 / -554)

@@ -35,10 +35,9 @@ namespace {
 const int kAllInputAddrIsAtomic = -1;
 const int kVirtualInputNodeMemoryReuse = 0;
 const int kVirtualOutputNodeMemoryReuse = 1;
-const size_t kVirtualInputNodeOutputSize = 1;
-const size_t kVirtualOutputNodeInputSize = 1;
-const size_t kVirtualNodeDataIndex = 0;
-const char *const kMbatchNodeNameFlag = "_ascend_mbatch_batch_";
+// One state per bit; the values must not overlap.
+enum ContinuousType { kNotContinuous = 0, kInput = 1, kInputNoPadding = 2, kOutput = 4, kOutputNoPadding = 8 };

 int64_t GetSymbolOutputOffset(const std::map<std::string, std::string> &anchor_to_symbol,
                               const std::map<std::string, std::list<ge::NodeIndexIO>> &symbol_to_anchors,
                               const ge::NodePtr &node, const uint32_t i) {
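Note: ContinuousType is a bit set, so a single node can carry input- and output-continuity at once; an illustrative sketch (not GE code) of the mask tests used throughout this file:

#include <cstdint>

enum ContinuousType : uint32_t { kNotContinuous = 0, kInput = 1, kInputNoPadding = 2, kOutput = 4, kOutputNoPadding = 8 };

int main() {
  uint32_t continuous_type = kInput | kOutputNoPadding;
  // Each property is tested with a mask, never with equality.
  bool is_continuous_input = ((continuous_type & kInput) != 0) || ((continuous_type & kInputNoPadding) != 0);
  bool is_continuous_output = ((continuous_type & kOutput) != 0) || ((continuous_type & kOutputNoPadding) != 0);
  return (is_continuous_input && is_continuous_output) ? 0 : 1;
}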
@@ -136,7 +135,7 @@ ge::Status GraphMemoryAssigner::AssignVarAttr2Nodes() {
   return ge::SUCCESS;
 }

-ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
+ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
                                              int64_t dim_index, int64_t &output_mem_size,
                                              int64_t &batch_dim_num, int64_t &out_size) {
   graphStatus graph_status = ge::TensorUtils::GetSize(*output_desc, out_size);
@@ -181,68 +180,6 @@ ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::Cons
   return SUCCESS;
 }

-Status GraphMemoryAssigner::GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
-                                             int32_t mem_reuse_model, string &max_batch_label) {
-  for (auto &i_map : mem_reuse_virtual_nodes_map) {
-    vector<NodePtr> virtual_nodes_list = i_map.second;
-    vector<int64_t> max_shape_dims;
-    size_t max_batch_dim = 0;
-    bool max_batch_dim_find = false;
-    for (size_t i = 0; i < virtual_nodes_list.size(); ++i) {
-      GE_CHECK_NOTNULL(virtual_nodes_list[i]);
-      OpDescPtr op_desc = virtual_nodes_list[i]->GetOpDesc();
-      GE_CHECK_NOTNULL(op_desc);
-
-      ge::ConstGeTensorDescPtr input_output_desc;
-      if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
-        input_output_desc = op_desc->GetOutputDescPtr(kVirtualNodeDataIndex);
-      } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
-        input_output_desc = op_desc->GetInputDescPtr(kVirtualNodeDataIndex);
-      } else {
-        std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GE_CHECK_NOTNULL(input_output_desc);
-
-      if (i == 0) {
-        // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
-        (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label);
-        max_shape_dims = input_output_desc->GetShape().GetDims();
-      } else {
-        vector<int64_t> current_shape_dims = input_output_desc->GetShape().GetDims();
-        if (current_shape_dims.size() != max_shape_dims.size()) {
-          std::string error = "The shape of several nodes between multiple batches does not match.";
-          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-          return FAILED;
-        }
-        for (size_t j = 0; j < current_shape_dims.size(); ++j) {
-          if (current_shape_dims[j] == max_shape_dims[j]) {
-            continue;
-          }
-          if (max_batch_dim_find && max_batch_dim != j) {
-            std::string error = "The shape of several nodes between multiple batches does not match.";
-            GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-            return FAILED;
-          }
-          max_batch_dim_find = true;
-          max_batch_dim = j;
-          if (current_shape_dims[j] > max_shape_dims[j]) {
-            max_shape_dims[j] = current_shape_dims[j];
-            // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
-            (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label);
-          }
-          // Only compare the first different dim in shape.
-          break;
-        }
-      }
-    }
-    // In every element of virtual_input_nodes_map, the label of the max batch node is the same.
-    break;
-  }
-  return SUCCESS;
-}
-
 Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size_t> &mem_type_to_offset) {
   if (memory_offset_.empty()) {
     GELOGE(FAILED, "memory_offset_ is empty.");
@@ -250,13 +187,6 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size
   }

   GE_CHK_STATUS_RET(ReAssignContinuousMemory(is_loop_graph), "ReAssignContinuousMemory Failed!");
-
-  GE_CHK_STATUS_RET(ReAssignReuseAndNoPaddingContinuousInputMemory(),
-                    "ReAssignReuseAndNoPaddingContinuousInputMemory Failed!");
-
-  GE_CHK_STATUS_RET(ReAssignReuseAndNoPaddingContinuousOutputMemory(),
-                    "ReAssignReuseAndNoPaddingContinuousOutputMemory Failed!");
-
   GE_CHK_STATUS_RET(ReAssignAtomicMemory(is_loop_graph), "ReAssignAtomicMemory Failed!");

   size_t total_mem_offset = 0;
@@ -313,22 +243,133 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(map<int64_t, size_t> &mem_offse
   return SUCCESS;
 }

+uint32_t GetContinuousMemoryType(const OpDescPtr &op_desc) {
+  if (op_desc == nullptr) {
+    return kNotContinuous;
+  }
+
+  bool is_continuous = false;
+  uint32_t continuous_type = kNotContinuous;
+  // If GetBool fail, is_continuous is false.
+  (void) ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_continuous);
+  if (is_continuous) {
+    continuous_type |= kInput;
+  } else {
+    (void) ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_continuous);
+    if (is_continuous) {
+      bool attr_reuse = false;
+      (void) ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
+      if (attr_reuse) {
+        continuous_type |= kInputNoPadding;
+      }
+    }
+  }
+
+  is_continuous = false;
+  (void) ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_continuous);
+  if (is_continuous) {
+    continuous_type |= kOutput;
+  } else {
+    (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, is_continuous);
+    if (is_continuous) {
+      bool attr_reuse = false;
+      (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
+      if (attr_reuse) {
+        continuous_type |= kOutputNoPadding;
+      }
+    }
+  }
+  return continuous_type;
+}
+
+Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &output_desc, uint32_t continuous_type,
+                     int64_t &tensor_size, int64_t &nopadding_size) {
+  if ((op_desc == nullptr) || (output_desc == nullptr)) {
+    GELOGE(FAILED, "Input para is nullptr.");
+    return FAILED;
+  }
+  tensor_size = 0;
+  nopadding_size = 0;
+  bool is_nopadding = ((continuous_type & kInputNoPadding) != 0) || ((continuous_type & kOutputNoPadding) != 0);
+  if (is_nopadding) {
+    int64_t attr_dim_index;
+    bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
+    if (!get_attr_dim_flag) {
+      GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
+      return FAILED;
+    }
+
+    // Calculate tensor real size of each piece of data and out size of complete data
+    int64_t batch_dim_num = 1;
+    if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, nopadding_size, batch_dim_num, tensor_size) !=
+        SUCCESS) {
+      GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s.", op_desc->GetName().c_str());
+      return FAILED;
+    }
+  } else {
+    if (ge::TensorUtils::GetSize(*output_desc, tensor_size) != ge::SUCCESS) {
+      GELOGE(FAILED, "GetSize failed.");
+      return FAILED;
+    }
+  }
+  if ((tensor_size < 0) || (nopadding_size < 0)) {
+    GELOGE(FAILED, "GetMemorySize for node %s failed.", op_desc->GetName().c_str());
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+void AlignMemOffset(int64_t &mem_align_size) {
+  if (mem_align_size <= 0) {
+    return;
+  }
+  mem_align_size = (mem_align_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
+}
+
+bool IsContinuousInputConflict(const ge::NodePtr &node, const OpDescPtr &peer_op_desc) {
+  bool is_peer_output_continuous = false;
+  // If GetBool fail, is_peer_output_continuous is false.
+  (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous);
+
+  // Get peer node output size. If size == 1 (the peer node has only one output), the continuous input of this
+  // node and the continuous output of the previous node are the same and we can support it. If size != 1,
+  // there may be a conflict between the two and we cannot support it.
+  auto peer_output_size = peer_op_desc->GetOutputsSize();
+  GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1),
+                  std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
+                      " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
+                      " requires continuous output. There may be conflict between the two." +
+                      "This node is not supported now.";
+                  GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+                  return true;);
+
+  bool is_peer_reference = false;
+  // If GetBool fail, is_peer_reference is false.
+  (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference);
+  GE_IF_BOOL_EXEC(is_peer_reference,
+                  std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
+                      " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
+                      " requires continuous output. There may be conflict between the two." +
+                      "This node is not supported now.";
+                  GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+                  return true;);
+  return false;
+}
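Note: a quick standalone check (not GE code) of the 512-byte rounding in the new AlignMemOffset overload, assuming MEM_ALIGN_SIZE is 512 as elsewhere in GE:

#include <cassert>
#include <cstdint>

constexpr int64_t kMemAlignSize = 512;  // stand-in for MEM_ALIGN_SIZE

int64_t Align(int64_t size) {
  if (size <= 0) {
    return size;  // non-positive sizes are left untouched, as in the diff
  }
  return (size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize;
}

int main() {
  assert(Align(1) == 512);
  assert(Align(512) == 512);
  assert(Align(513) == 1024);
  return 0;
}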

 Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
   Status ret;
   for (auto &node : compute_graph_->GetAllNodes()) {
-    // Get the continuous input type of the node, default is false
-    bool is_input_continuous = false;
-    GE_CHECK_NOTNULL(node->GetOpDesc());
-    // If GetBool fail, is_input_continuous is false.
-    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
+    GE_CHECK_NOTNULL(node);
+    auto continuous_type = GetContinuousMemoryType(node->GetOpDesc());

     // Assign continuous input memory
-    if (is_input_continuous) {
+    bool is_continuous_input = ((continuous_type & kInput) != 0) || ((continuous_type & kInputNoPadding) != 0);
+    if (is_continuous_input) {
       int64_t memory_type = RT_MEMORY_HBM;
       GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed.");
       int64_t mem_clean_start = 0;
       int64_t mem_clean_size = 0;
-      ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type);
+      ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type, continuous_type);
       if (ret != ge::SUCCESS) {
         GELOGE(ret, "Assign continuous input memory failed!");
         return ret;
@@ -338,7 +379,6 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
       vector<int32_t> input_indexes;
       // If GetListInt fail, input_indexes is empty.
       (void) ge::AttrUtils::GetListInt(node->GetOpDesc(), ATOMIC_ATTR_INPUT_INDEX, input_indexes);
-
       if (!input_indexes.empty() && input_indexes[0] == kAllInputAddrIsAtomic) {
         // check whether there is an atomic conflict between the current node and the peer out node
         if (!CheckInputIsSupportAtomic(node)) {
@@ -350,6 +390,7 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
         const auto &in_control_anchor = node->GetInControlAnchor();
         GE_CHECK_NOTNULL(in_control_anchor);
         for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) {
+          GE_CHECK_NOTNULL(peer_out_control_anchor);
           auto peer_out_node = peer_out_control_anchor->GetOwnerNode();
           if (peer_out_node->GetType() == ATOMICADDRCLEAN) {
             ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size});
@@ -362,23 +403,12 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
       }
     }

-    // Get the reference type of the node, default is false
-    bool is_ref = false;
-    // If GetBool fail, is_ref is false.
-    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref);
-
-    // Get the continuous output type of the node, default is false
-    bool is_output_continuous = false;
-    // If GetBool fail, is_output_continuous is false.
-    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous);
-
-    // If the output is ref type and refers to the ref of an input, the name of the output
-    // and the input are the same. Ge encounters ref type, finds matching relationship according
-    // to the names of input and output, and allocates the same memory address, eg: HCOMBroadcast
-    if (!is_ref && is_output_continuous) {  // Assign continuous output memory
-      ret = AssignContinuousOutputMemory(node);
+    // Assign continuous output memory
+    bool is_continuous_output = ((continuous_type & kOutput) != 0) || ((continuous_type & kOutputNoPadding) != 0);
+    if (is_continuous_output) {
+      ret = AssignContinuousOutputMemory(node, continuous_type);
       if (ret != ge::SUCCESS) {
-        GELOGE(ret, "Assign reference memory failed!");
+        GELOGE(ret, "Assign continuous output memory failed!");
         return ret;
       }
     }
@@ -391,94 +421,39 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
 }

 Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
-                                                        int64_t &continuous_mem_size, int64_t memory_type) {
+                                                        int64_t &continuous_mem_size, int64_t memory_type,
+                                                        uint32_t continuous_type) {
   GELOGI("Current node %s needs continuous input.", node->GetName().c_str());
-  bool continuous_input_alloc = false;
-  (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, continuous_input_alloc);
   auto iter = memory_offset_.find(memory_type);
   if (iter == memory_offset_.end()) {
     std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
     GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
     return FAILED;
   }
   // The head and tail of hcom continuous input should be added 512
   iter->second.mem_offset_ += MEM_ALIGN_SIZE;
   continuous_mem_start = iter->second.mem_offset_;
+  int64_t mem_offset = iter->second.mem_offset_;
+  int64_t extra_memory_size = 0;
+  bool is_continuous_input_allocated = false;
+  (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, is_continuous_input_allocated);
   for (auto &in_data_anchor : node->GetAllInDataAnchors()) {
+    GE_IF_BOOL_EXEC(in_data_anchor == nullptr, continue);
     auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
     GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, continue);

     auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
     GE_IF_BOOL_EXEC(peer_op_desc == nullptr, continue);
-    bool is_peer_output_continuous = false;
-    // If GetBool fail, is_peer_output_continuous is false.
-    (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous);
-
-    // Get peer node output size, if size == 1(peer node has only one output), continuous input of the node and
-    // continuous output of the previous node is the same, we can support it. If size != 1, there may be
-    // conflict between the two, we can not support it.
-    auto peer_output_size = peer_op_desc->GetOutputsSize();
-    GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1),
-                    std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
-                        " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
-                        " requires continuous output. There may be conflict between the two." +
-                        "This node is not supported now.";
-                    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-                    return PARAM_INVALID;);
-
-    bool is_peer_reference = false;
-    // If GetBool fail, is_peer_reference is false.
-    (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference);
-    GE_IF_BOOL_EXEC(is_peer_reference,
-                    std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
-                        " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
-                        " requires continuous output. There may be conflict between the two." +
-                        "This node is not supported now.";
-                    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-                    return PARAM_INVALID;);
-
-    vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
-    std::vector<int64_t> offsets_for_fusion = {};
-    bool has_offset_attr =
-        AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);
-    if (peer_out_data_anchor->GetIdx() < static_cast<int>(output_list.size())) {
-      if (continuous_input_alloc && !has_offset_attr) {
-        if (in_data_anchor->GetIdx() == 0) {
-          continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
-        }
-        // can not use else if, incase only one input
-        if (in_data_anchor->GetIdx() == static_cast<int>(node->GetAllInDataAnchors().size()) - 1) {
-          int64_t tensor_desc_size = 0;
-          Status ret = ge::TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())),
-                                                tensor_desc_size);
-          GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;);
-
-          tensor_desc_size = (tensor_desc_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
-          continuous_mem_size =
-              output_list.at(peer_out_data_anchor->GetIdx()) - continuous_mem_start + tensor_desc_size + MEM_ALIGN_SIZE;
-        }
-        GELOGI(
-            "[IMAS]Check Continuous input : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%u] "
-            "real_size[%u].",
-            node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
-            peer_out_data_anchor->GetIdx(), output_list.at(peer_out_data_anchor->GetIdx()),
-            peer_op_desc->GetStreamId(), 0, 0);
-        continue;
-      }
-
-      output_list.at(peer_out_data_anchor->GetIdx()) = iter->second.mem_offset_;
-    } else {
-      std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
-      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-      GELOGE(FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx());
-      return FAILED;
-    }
-    peer_op_desc->SetOutputOffset(output_list);
-    size_t pre_mem_offset = iter->second.mem_offset_;
+    GE_IF_BOOL_EXEC(IsContinuousInputConflict(node, peer_op_desc), return PARAM_INVALID;);

     int64_t tensor_desc_size = 0;
-    if (has_offset_attr) {
-      if (peer_out_data_anchor->GetIdx() < static_cast<int>(offsets_for_fusion.size())) {
-        auto offset_for_fusion = offsets_for_fusion[peer_out_data_anchor->GetIdx()];
-        iter->second.mem_offset_ += offset_for_fusion;
+    int64_t nopadding_size = 0;
+    int64_t real_size = 0;
+    std::vector<int64_t> offsets_of_fusion = {};
+    bool lx_fusion = AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_of_fusion);
+    lx_fusion = lx_fusion && !offsets_of_fusion.empty();
+    if (lx_fusion) {
+      if (peer_out_data_anchor->GetIdx() < static_cast<int>(offsets_of_fusion.size())) {
+        nopadding_size = offsets_of_fusion[peer_out_data_anchor->GetIdx()];
+        tensor_desc_size = nopadding_size;
       } else {
         std::string error = "fusion: peer node" + FmtToStr(peer_op_desc->GetName()) +
                             " index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
@@ -486,425 +461,140 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node,
         return FAILED;
       }
     } else {
-      Status ret =
-          TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size);
-      GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;);
-
-      iter->second.mem_offset_ += tensor_desc_size;
-    }
-
-    // If set tensor_actual_size, Memory alignment is not required.
-    int32_t is_tensor_actual_size = 0;
-    ge::AttrUtils::GetInt(peer_op_desc, ATTR_NAME_GET_TENSOR_ACTUAL_SIZE, is_tensor_actual_size);
-    if (is_tensor_actual_size == 0) {
-      AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
+      if (GetMemorySize(node->GetOpDesc(), peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()),
+                        continuous_type, tensor_desc_size, nopadding_size) != ge::SUCCESS) {
+        return FAILED;
+      }
     }
GELOGI(
"[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] "
"real_size[%ld].", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(),
(iter->second.mem_offset_ - pre_mem_offset), tensor_desc_size);
}


iter->second.mem_offset_ += MEM_ALIGN_SIZE;
if (!continuous_input_alloc) {
continuous_mem_size = iter->second.mem_offset_ - continuous_mem_start;
}
return SUCCESS;
}

Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node) {
GELOGI("Current node %s needs continuous output.", node->GetName().c_str());
auto out_op_desc = node->GetOpDesc();
GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED);
vector<int64_t> output_list = out_op_desc->GetOutputOffset();

if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) {
GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.",
out_op_desc->GetOutputsSize(), output_list.size());
return ge::FAILED;
}

size_t mem_offset = output_list[0];
for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
output_list[out_data_anchor->GetIdx()] = mem_offset;
int64_t tensor_desc_size = 0;
if (ge::TensorUtils::GetSize(*(out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx())), tensor_desc_size) !=
ge::SUCCESS) {
GELOGE(FAILED, "GetSize failed.");
return FAILED;
}
mem_offset += tensor_desc_size;
if (mem_offset <= 0) {
bool is_nopadding = ((continuous_type & kInputNoPadding) != 0) || lx_fusion;
vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_list.size())) {
std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED; return FAILED;
} }
mem_offset = (mem_offset + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
GELOGI(
"[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] "
"real_size[%ld].",
node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), tensor_desc_size, tensor_desc_size);
}
out_op_desc->SetOutputOffset(output_list);
return ge::SUCCESS;
}

Status GraphMemoryAssigner::ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse) {
OpDescPtr op_desc = node->GetOpDesc();
vector<int64_t> output_list = op_desc->GetOutputOffset();
if (output_list.empty()) {
GELOGE(FAILED, "Outputoffset is empty node name:%s", node->GetName().c_str());
return FAILED;
}
output_list.at(0) = mem_offset_reuse;
op_desc->SetOutputOffset(output_list);
GELOGI("Set virtual input node %s output offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse);


int64_t attr_dim_index;
bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
if (!get_attr_dim_flag) {
GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
return FAILED;
}

size_t extra_memory_size = 0;
for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
GE_CHECK_NOTNULL(peer_out_data_anchor);
auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
GE_CHECK_NOTNULL(peer_op_desc);
vector<int64_t> output_offsets = peer_op_desc->GetOutputOffset();
if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_offsets.size())) {
GELOGE(ge::FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx());
return ge::FAILED;
// when continuous input has been allocated first input is beginning offset
bool is_allocated_first_input = is_continuous_input_allocated && (in_data_anchor->GetIdx() == 0);
if (is_allocated_first_input) {
mem_offset = output_list.at(peer_out_data_anchor->GetIdx());
continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
} else {
// set offset for input
output_list.at(peer_out_data_anchor->GetIdx()) = mem_offset;
peer_op_desc->SetOutputOffset(output_list);
} }
output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse;
peer_op_desc->SetOutputOffset(output_offsets);
size_t pre_mem_offset = mem_offset_reuse;


// Calculate tensor real size of each piece of data and out size of complete data
ge::ConstGeTensorDescPtr output_desc = peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx());
GE_CHECK_NOTNULL(output_desc);
int64_t output_mem_size;
int64_t batch_dim_num = 1;
int64_t out_size;
if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) !=
SUCCESS) {
GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].",
peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx());
return FAILED;
int64_t align_size = tensor_desc_size;
if (is_nopadding) {
mem_offset += nopadding_size;
extra_memory_size += (tensor_desc_size - nopadding_size);
real_size = nopadding_size;
} else {
ge::AlignMemOffset(align_size);
mem_offset += align_size;
// The head and tail of hcom continuous input should be added 512
extra_memory_size = MEM_ALIGN_SIZE;
real_size = tensor_desc_size;
} }


mem_offset_reuse += output_mem_size;
extra_memory_size = extra_memory_size + out_size - output_mem_size;

GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] "
"real_size[%ld].",
node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(), out_size,
output_mem_size);
GELOGI("[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] realsize[%ld]"
" nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
peer_out_data_anchor->GetIdx(), output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(),
is_continuous_input_allocated ? 0UL : align_size, real_size, is_nopadding);
} }
mem_offset_reuse += extra_memory_size;
size_t after_mem_offset = mem_offset_reuse;
GELOGI("After reassign virtual input node[name: %s, type: %s] memory, memory offset = %zu.",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset);
return SUCCESS;
}


Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() {
map<string, vector<NodePtr>> mem_reuse_virtual_input_nodes_map;
int64_t memory_type = RT_MEMORY_HBM;
for (const auto &n : compute_graph_->GetAllNodes()) {
OpDescPtr op_desc = n->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
bool attr_continuous = false;
bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, attr_continuous);
GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
bool attr_reuse = false;
bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
GE_IF_BOOL_EXEC(!get_reuse_flag, continue);
if (attr_reuse && attr_continuous) {
if (op_desc->GetOutputsSize() != kVirtualInputNodeOutputSize) {
// When current virtual node has several outputs, can't directly determine which input is the tensor for reuse.
std::string error = "Only one output is supported, current virtual node" + FmtToStr(n->GetName()) +
" has " + FmtToStr(op_desc->GetOutputsSize()) + " outputs.";
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}
GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed.");
auto iter = memory_offset_.find(memory_type);
if (iter == memory_offset_.end()) {
std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}
GELOGD("Start to reassign memory for virtual input node, memory offset = %zu, memory type = %ld.",
iter->second.mem_offset_, memory_type);
string batch_label_string;
// Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter
(void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
if (batch_label_string.empty()) {
size_t node_mem_offset = iter->second.mem_offset_;
// No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
Status status = ReAssignVirtualInputNodeMemory(n, node_mem_offset);
if (status != SUCCESS) {
GELOGE(FAILED, "Reassign memory of virtual input node failed, node name: %s.", n->GetName().c_str());
return FAILED;
}

iter->second.mem_offset_ = node_mem_offset;
AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
GELOGD("After reassign memory for virtual input node, align memory = %zu, memory type = %ld.",
iter->second.mem_offset_, memory_type);
} else {
// Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory.
string current_node_full_name = op_desc->GetName();
size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
if (pos == string::npos) {
GELOGE(FAILED, "Cannot find key string [%s] of multi-batch in name of virtual input node, node name: %s.",
kMbatchNodeNameFlag, n->GetName().c_str());
return FAILED;
}
string fixed_name = current_node_full_name.substr(0, pos);
vector<NodePtr> parallel_virtual_input_nodes;
if (mem_reuse_virtual_input_nodes_map.count(fixed_name) != 0) {
parallel_virtual_input_nodes = mem_reuse_virtual_input_nodes_map[fixed_name];
}
parallel_virtual_input_nodes.emplace_back(n);
mem_reuse_virtual_input_nodes_map[fixed_name] = parallel_virtual_input_nodes;
}
}
}
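// Illustration (the flag's literal value is an assumption, not shown in this diff):
// if kMbatchNodeNameFlag were "_mbatch_", a node named "concat_mbatch_batch_0" would
// yield fixed_name "concat", so every per-batch copy of the same logical node is
// grouped under one entry of mem_reuse_virtual_input_nodes_map.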

int32_t mem_reuse_model = 0;
if (ReAssignVirtualNodesMemory(mem_reuse_virtual_input_nodes_map, mem_reuse_model) != SUCCESS) {
GELOGE(FAILED, "Reassign memory of virtual input nodes failed.");
return FAILED;
}
return SUCCESS;
}

mem_offset += extra_memory_size;
ge::AlignMemOffset(mem_offset);
continuous_mem_size = mem_offset - continuous_mem_start;
if (is_continuous_input_allocated) {
// Memory is not allocated here, so the extra 512 bytes at the header are not needed
iter->second.mem_offset_ -= MEM_ALIGN_SIZE;
} else {
iter->second.mem_offset_ = mem_offset;
}
return SUCCESS;
}
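// Worked example for the padded path above (sizes are illustrative; head padding
// applied before this hunk is ignored and continuous_mem_start is taken as 0):
// peer outputs with tensor_desc_size {1000, 24, 512} get offsets {0, 1024, 1536};
// mem_offset reaches 2048, extra_memory_size holds 512 for the tail, so after the
// final alignment continuous_mem_size = 2560.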


Status GraphMemoryAssigner::ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse) {
OpDescPtr op_desc = node->GetOpDesc();

// 1. set memory of the input tensor to be reused
auto in_data_anchor_list = node->GetAllInDataAnchors();
if (in_data_anchor_list.empty()) {
GELOGE(FAILED, "Node %s's in data anchor is empty.", node->GetName().c_str());
return FAILED;
}
auto peer_out_data_anchor = in_data_anchor_list.at(0)->GetPeerOutAnchor();
GE_CHECK_NOTNULL(peer_out_data_anchor);
auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
GE_CHECK_NOTNULL(peer_op_desc);
vector<int64_t> in_node_output_offsets = peer_op_desc->GetOutputOffset();
if (peer_out_data_anchor->GetIdx() >= static_cast<int>(in_node_output_offsets.size())) {
GELOGE(FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx());
return FAILED;
}
in_node_output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse;
peer_op_desc->SetOutputOffset(in_node_output_offsets);
GELOGI("Set virtual output node %s input data offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse);

// 2. set memory of output tensor
vector<int64_t> output_list = op_desc->GetOutputOffset();
if (output_list.empty()) {
GELOGE(FAILED, "Output offset is empty, node name: %s", node->GetName().c_str());
return FAILED;
}
if (op_desc->GetOutputsSize() > output_list.size()) {
GELOGE(FAILED, "The size %zu of op_desc is more than output_list's size %zu.", op_desc->GetOutputsSize(),
output_list.size());
return FAILED;
}
int64_t attr_dim_index;
bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
if (!get_attr_dim_flag) {
GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
return FAILED;
}

size_t extra_memory_size = 0;
for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
output_list[out_data_anchor->GetIdx()] = mem_offset_reuse;
size_t pre_mem_offset = mem_offset_reuse;

// Calculate tensor real size of each piece of data and out size of complete data
ge::ConstGeTensorDescPtr output_desc = op_desc->GetOutputDescPtr(out_data_anchor->GetIdx());
GE_CHECK_NOTNULL(output_desc);
int64_t output_mem_size;
int64_t batch_dim_num = 1;
int64_t out_size;
if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) !=
SUCCESS) {
GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].",
op_desc->GetName().c_str(), out_data_anchor->GetIdx());
return FAILED;
}

mem_offset_reuse += output_mem_size;
extra_memory_size = extra_memory_size + out_size - output_mem_size;

GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu], size[%ld], real_size[%ld].",
node->GetOwnerComputeGraph()->GetName().c_str(), op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
pre_mem_offset, out_size, output_mem_size);
}
op_desc->SetOutputOffset(output_list);
mem_offset_reuse += extra_memory_size;
size_t after_mem_offset = mem_offset_reuse;
GELOGI("After reassign virtual output node[name: %s, type: %s] memory, memory offset = %zu.",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset);
return SUCCESS;
}

Status GetFirstInputPeerOutOutputOffset(const ge::NodePtr &node, int64_t &mem_offset) {
auto in_data_anchor_list = node->GetAllInDataAnchors();
if (in_data_anchor_list.empty()) {
GELOGE(FAILED, "Node %s's in data anchor is empty.", node->GetName().c_str());
return FAILED;
}
auto peer_out_data_anchor = in_data_anchor_list.at(0)->GetPeerOutAnchor();
GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, GELOGE(ge::FAILED, "peer_out_data_anchor is null.");
return ge::FAILED);
auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
GE_IF_BOOL_EXEC(peer_op_desc == nullptr, GELOGE(ge::FAILED, "peer_op_desc is null."); return ge::FAILED);
vector<int64_t> in_node_output_offsets = peer_op_desc->GetOutputOffset();
if (peer_out_data_anchor->GetIdx() >= static_cast<int>(in_node_output_offsets.size())) {
GELOGE(FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx());
return FAILED;
}
mem_offset = in_node_output_offsets.at(peer_out_data_anchor->GetIdx());
return SUCCESS;
}
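// Usage in this file: AssignContinuousOutputMemory (below) calls this helper so that
// a nopadding continuous-output node starts its outputs exactly where the producer
// of its first input wrote its data:
//   int64_t mem_offset = 0;
//   if (GetFirstInputPeerOutOutputOffset(node, mem_offset) != SUCCESS) {
//     return ge::FAILED;
//   }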

Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() {
map<string, vector<NodePtr>> mem_reuse_virtual_output_nodes_map;
int64_t memory_type = RT_MEMORY_HBM;
for (const auto &n : compute_graph_->GetAllNodes()) {
OpDescPtr op_desc = n->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
bool attr_continuous = false;
bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, attr_continuous);
GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
bool attr_reuse = false;
bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
GE_IF_BOOL_EXEC(!get_reuse_flag, continue);

if (attr_reuse && attr_continuous) {
auto in_data_anchor_list = n->GetAllInDataAnchors();
if (in_data_anchor_list.size() != kVirtualOutputNodeInputSize) {
// When the current virtual node has several inputs, the tensor to reuse cannot be determined directly.
std::string error = "Only one input is supported, current virtual node" + FmtToStr(n->GetName()) +
" has " + FmtToStr(in_data_anchor_list.size()) + " inputs.";
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}
GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed.");
auto iter = memory_offset_.find(memory_type);
if (iter == memory_offset_.end()) {
std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}
GELOGD("Start to reassign memory for virtual output node, memory offset = %zu, memory type = %ld.",
iter->second.mem_offset_, memory_type);
string batch_label_string;
// Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter
(void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
if (batch_label_string.empty()) {
size_t node_mem_offset = iter->second.mem_offset_;
// No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
Status status = ReAssignVirtualOutputNodeMemory(n, node_mem_offset);
if (status != SUCCESS) {
GELOGE(FAILED, "Reassign memory of virtual output node failed, node name: %s.", n->GetName().c_str());
return FAILED;
}
iter->second.mem_offset_ = node_mem_offset;
AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
GELOGD("After reassign memory for virtual output node, align memory = %zu, memory type = %ld.",
iter->second.mem_offset_, memory_type);
} else {
// Has ATTR_NAME_BATCH_LABEL: a dynamic multi-batch node needs to reuse memory.
string current_node_full_name = op_desc->GetName();
size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
if (pos == string::npos) {
std::string error = "Cannot find key string" + FmtToStr(kMbatchNodeNameFlag) +
" of multi-batch in name of virtual output node, the node name is " + FmtToStr(n->GetName());
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}
string fixed_name = current_node_full_name.substr(0, pos);
vector<NodePtr> parallel_virtual_output_nodes;
if (mem_reuse_virtual_output_nodes_map.count(fixed_name) != 0) {
parallel_virtual_output_nodes = mem_reuse_virtual_output_nodes_map[fixed_name];
}
parallel_virtual_output_nodes.emplace_back(n);
mem_reuse_virtual_output_nodes_map[fixed_name] = parallel_virtual_output_nodes;
}
}
}

int32_t mem_reuse_model = 1;
if (ReAssignVirtualNodesMemory(mem_reuse_virtual_output_nodes_map, mem_reuse_model) != SUCCESS) {
GELOGE(FAILED, "Reassign memory of virtual output nodes failed.");
return FAILED;
}
return SUCCESS;
}

Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map,
int32_t mem_reuse_model) {
// Find max batch label value
string max_batch_label;
GE_CHK_STATUS_RET(GetMaxBatchLabel(mem_reuse_nodes_map, mem_reuse_model, max_batch_label),
"Get max batch label failed.");
PrintMemoryOffset();
vector<size_t> nodes_mem_offset_list;
for (auto &i_map : mem_reuse_nodes_map) {
vector<NodePtr> virtual_nodes_list = i_map.second;
int64_t memory_type = RT_MEMORY_HBM;
GE_CHK_STATUS_RET(GetNodeListMemoryType(virtual_nodes_list, mem_reuse_model, memory_type),
"Get node list memory type failed.");
auto iter = memory_offset_.find(memory_type);
if (iter == memory_offset_.end()) {
std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}
size_t max_batch_node_mem_offset = iter->second.mem_offset_;
nodes_mem_offset_list.emplace_back(max_batch_node_mem_offset);
for (auto &i_node : virtual_nodes_list) {
// Op_desc is not nullptr; it has already been checked.
OpDescPtr op_desc = i_node->GetOpDesc();
string batch_label_string;
// All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
(void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
if (batch_label_string == max_batch_label) {
Status status = SUCCESS;
if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
status = ReAssignVirtualInputNodeMemory(i_node, max_batch_node_mem_offset);
} else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
status = ReAssignVirtualOutputNodeMemory(i_node, max_batch_node_mem_offset);
} else {
std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}

if (status != SUCCESS) {
GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
return FAILED;
}
iter->second.mem_offset_ = max_batch_node_mem_offset;
AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
GELOGD("After reassign memory for virtual node, align memory = %zu, memory type = %ld.",
iter->second.mem_offset_, memory_type);
// Only assign memory of max batch nodes.
break;
}
}
}
PrintMemoryOffset();
size_t memory_reuse_index = 0;
for (auto &i_map : mem_reuse_nodes_map) {
vector<NodePtr> virtual_nodes_list = i_map.second;
for (auto &i_node : virtual_nodes_list) {
size_t remaining_batch_node_mem_offset = nodes_mem_offset_list[memory_reuse_index];
Status status = SUCCESS;
if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
status = ReAssignVirtualInputNodeMemory(i_node, remaining_batch_node_mem_offset);
} else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
status = ReAssignVirtualOutputNodeMemory(i_node, remaining_batch_node_mem_offset);
} else {
std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
return FAILED;
}

if (status != SUCCESS) {
GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
return FAILED;
}
}
memory_reuse_index++;
}
return SUCCESS;
}

Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node, uint32_t continuous_type) {
GELOGI("Current node %s needs continuous output.", node->GetName().c_str());
auto out_op_desc = node->GetOpDesc();
GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED);
vector<int64_t> output_list = out_op_desc->GetOutputOffset();
if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) {
GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.",
out_op_desc->GetOutputsSize(), output_list.size());
return ge::FAILED;
}

int64_t mem_offset = 0;
bool is_nopadding = ((continuous_type & kOutputNoPadding) != 0);
if (is_nopadding) {
// The output tensor memory must reuse the input tensor memory
if (GetFirstInputPeerOutOutputOffset(node, mem_offset) != SUCCESS) {
return ge::FAILED;
}
} else {
// Get the reference type of the node; the default is false
bool is_ref = false;
// If GetBool fails, is_ref stays false.
(void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref);

// If the output is ref type and refers to the ref of an input, the name of the output
// and the input are the same. When GE encounters a ref type, it finds the matching relationship according
// to the names of input and output, and allocates the same memory address, e.g. HCOMBroadcast
if (is_ref) {
GELOGI("Current node %s does not need continuous output assignment because it references an input by name.",
node->GetName().c_str());
return SUCCESS;
}
mem_offset = output_list[0];
}

for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
output_list[out_data_anchor->GetIdx()] = mem_offset;
int64_t tensor_desc_size = 0;
int64_t nopadding_size = 0;
if (GetMemorySize(out_op_desc, out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()), continuous_type,
tensor_desc_size, nopadding_size) != ge::SUCCESS) {
GELOGE(FAILED, "GetSize failed.");
return FAILED;
}
if (is_nopadding) {
mem_offset += nopadding_size;
} else {
mem_offset += tensor_desc_size;
ge::AlignMemOffset(mem_offset);
}
GELOGI("[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%ld] "
"realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(), output_list[out_data_anchor->GetIdx()],
out_op_desc->GetStreamId(), 0UL, is_nopadding ? nopadding_size : tensor_desc_size, is_nopadding);
}
out_op_desc->SetOutputOffset(output_list);
return ge::SUCCESS;
}
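The two-pass policy in ReAssignVirtualNodesMemory above can be summarized in a compressed standalone sketch (illustrative types, one pass per group; not the GE implementation): only the max-batch variant reserves space, and every batch variant of a group is laid out from the same start offset.

#include <cstddef>
#include <string>
#include <vector>

struct BatchNodeSketch {
  std::string batch_label;
  size_t size = 0;    // memory the node needs
  size_t offset = 0;  // assigned start offset
};

// All batch variants in a group share one start offset; only the variant carrying
// max_batch_label advances the global offset, since the smaller batches fit inside it.
void AssignGroupSketch(std::vector<BatchNodeSketch> &group, const std::string &max_batch_label,
                       size_t &global_offset) {
  const size_t group_start = global_offset;
  for (auto &node : group) {
    node.offset = group_start;
    if (node.batch_label == max_batch_label) {
      global_offset = group_start + node.size;
    }
  }
}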


Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {


+ 2
- 18
ge/graph/build/memory/graph_mem_assigner.h View File

@@ -119,31 +119,15 @@ class GraphMemoryAssigner {
///
ge::Status ReAssignContinuousMemory(bool is_loop_graph);


ge::Status ReAssignReuseAndNoPaddingContinuousInputMemory();

ge::Status ReAssignReuseAndNoPaddingContinuousOutputMemory();

ge::Status ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse);

ge::Status ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse);

ge::Status ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map, int32_t mem_reuse_model);

ge::Status GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
int32_t mem_reuse_model, string &max_batch_label);

ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, int64_t dim_index,
int64_t &output_mem_size, int64_t &batch_dim_num, int64_t &out_size);

ge::Status ReAssignAtomicMemory(bool is_loop_graph);


ge::Status FilterAtomicNodesForMemoryAssign(map<string, map<NodePtr, vector<NodePtr>>> &normal_atomic_nodes_map,
map<string, vector<NodePtr>> &connecting_output_atomic_nodes);


ge::Status AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
int64_t &continuous_mem_size, int64_t memory_type);
int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type);


ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node);
ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node, uint32_t continuous_type);


///
/// @brief check the input of node whether support atomic attr


BIN
ge/graph/build/memory/graph_mem_assigner.zip View File

