From e49e09d7a752d337bb1700365b51db732f8286e2 Mon Sep 17 00:00:00 2001 From: chuxing Date: Sat, 27 Mar 2021 22:45:28 +0800 Subject: [PATCH 1/9] fix hccl --- ge/hybrid/model/hybrid_model_builder.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 669fafb1..19d2ef49 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -1089,14 +1089,14 @@ Status HybridModelBuilder::LoadTask(NodeItem &node_item) { Status HybridModelBuilder::LoadTasks() { GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed."); - std::map ordered_partitioned_calls; + std::map ordered_partitioned_calls; for (auto &it : hybrid_model_.node_items_) { auto &node_item = it.second; if (node_item->node_type == NETOUTPUT) { continue; } if (node_item->node_type == PARTITIONEDCALL) { - ordered_partitioned_calls.emplace(node_item->node_id, node_item.get()); + ordered_partitioned_calls.emplace(node_item->node_name, node_item.get()); continue; } GE_CHK_STATUS_RET_NOLOG(LoadTask(*node_item)); @@ -2092,13 +2092,15 @@ Status HybridModelBuilder::ParseDependentByParallelGroup() { } if (nearest_dep_node != nullptr) { - GELOGD("Add dependency for nodes of same parallel group[%s], src = [%s], dst = [%s]", + GELOGD("Add dependency for nodes with the same parallel group[%s], src = [%s], dst = [%s]", parallel_group.c_str(), nearest_dep_node->NodeName().c_str(), node_item->NodeName().c_str()); auto &deps = node_item->dependents_for_execution; if (std::find(deps.begin(), deps.end(), nearest_dep_node->node) != deps.end()) { - GELOGD("Already has dependency, skip it"); + GELOGD("%s->%s Already has dependency, skip it", + nearest_dep_node->node->GetName().c_str(), + node_item->NodeName().c_str()); continue; } nearest_dep_node->has_observer = true; From 6eddcd2d95c19010b99ebfe73f224b02cc2c301d Mon Sep 17 00:00:00 2001 From: chuxing Date: Sun, 28 Mar 2021 13:47:26 +0800 Subject: [PATCH 2/9] update log --- ge/hybrid/model/hybrid_model_builder.cc | 29 +++++++++++++------------ 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 19d2ef49..7bd9d35c 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -1637,6 +1637,7 @@ Status HybridModelBuilder::LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem auto temp_graph = MakeShared("temp"); GE_CHECK_NOTNULL(temp_graph); auto wrapper_node = temp_graph->AddNode(wrapper_op_desc); + wrapper_op_desc->SetId(parent_node_item->node_id); GeModelPtr ge_model = subgraph_models_[subgraph_name]; GE_CHECK_NOTNULL(ge_model); hybrid_model_.known_shape_sub_models_.emplace(wrapper_node, ge_model); @@ -1916,7 +1917,6 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root NodeItem *node_item = nullptr; GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node, &node_item)); GE_CHK_STATUS_RET_NOLOG(BuildNodeItem(node, *node_item)); - GE_CHK_STATUS_RET_NOLOG(CollectParallelGroups(node_item)); GE_CHK_STATUS_RET_NOLOG(UpdateAnchorStatus(node)); // needed by FE generate task node_item->input_start = input_start; @@ -2069,22 +2069,17 @@ Status HybridModelBuilder::CollectParallelGroups(NodeItem *node_item) { } Status HybridModelBuilder::ParseDependentByParallelGroup() { + for (auto &it : hybrid_model_.node_items_) { + GE_CHK_STATUS_RET_NOLOG(CollectParallelGroups(it.second.get())); + } for (const auto &it : node_to_parallel_groups_) { auto node_item = it.first; - auto dst_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node_item->node); + auto dst_executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node_item->node); for (const auto ¶llel_group : it.second) { auto &dependent_nodes = parallel_group_to_nodes_[parallel_group]; NodeItem *nearest_dep_node = nullptr; int max_id = -1; for (auto &dep_node : dependent_nodes) { - if (node_item == dep_node) { - continue; - } - auto src_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*dep_node->node); - if (src_engine_type == dst_engine_type) { - continue; - } - if (dep_node->node_id < node_item->node_id && dep_node->node_id > max_id) { nearest_dep_node = dep_node; max_id = dep_node->node_id; @@ -2092,10 +2087,12 @@ Status HybridModelBuilder::ParseDependentByParallelGroup() { } if (nearest_dep_node != nullptr) { - GELOGD("Add dependency for nodes with the same parallel group[%s], src = [%s], dst = [%s]", - parallel_group.c_str(), - nearest_dep_node->NodeName().c_str(), - node_item->NodeName().c_str()); + GELOGD("[%s] Nearest node = [%s]", node_item->NodeName().c_str(), nearest_dep_node->NodeName().c_str()); + auto src_engine_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*nearest_dep_node->node); + if (src_engine_type == dst_executor_type) { + GELOGD("No need to add dependency for nodes with same executor type"); + continue; + } auto &deps = node_item->dependents_for_execution; if (std::find(deps.begin(), deps.end(), nearest_dep_node->node) != deps.end()) { GELOGD("%s->%s Already has dependency, skip it", @@ -2105,6 +2102,10 @@ Status HybridModelBuilder::ParseDependentByParallelGroup() { } nearest_dep_node->has_observer = true; deps.emplace_back(nearest_dep_node->node); + GELOGD("Add dependency for nodes with the same parallel group[%s], src = [%s], dst = [%s]", + parallel_group.c_str(), + nearest_dep_node->NodeName().c_str(), + node_item->NodeName().c_str()); } } } From a89298a0455fc8fb1f2075d76144efe4138d2004 Mon Sep 17 00:00:00 2001 From: chuxing Date: Sun, 28 Mar 2021 14:12:05 +0800 Subject: [PATCH 3/9] ensure order --- ge/hybrid/model/hybrid_model_builder.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 7bd9d35c..25dabd78 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -1089,14 +1089,14 @@ Status HybridModelBuilder::LoadTask(NodeItem &node_item) { Status HybridModelBuilder::LoadTasks() { GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed."); - std::map ordered_partitioned_calls; + std::map> ordered_partitioned_calls; for (auto &it : hybrid_model_.node_items_) { auto &node_item = it.second; if (node_item->node_type == NETOUTPUT) { continue; } if (node_item->node_type == PARTITIONEDCALL) { - ordered_partitioned_calls.emplace(node_item->node_name, node_item.get()); + ordered_partitioned_calls[node_item->node_id][node_item->node_name] = node_item.get(); continue; } GE_CHK_STATUS_RET_NOLOG(LoadTask(*node_item)); @@ -1104,7 +1104,9 @@ Status HybridModelBuilder::LoadTasks() { // HCCL operators need to be loaded in the same order across different processes for (auto &it : ordered_partitioned_calls) { - GE_CHK_STATUS_RET_NOLOG(LoadTask(*it.second)); + for (auto &it2 : it.second) { + GE_CHK_STATUS_RET_NOLOG(LoadTask(*it2.second)); + } } return SUCCESS; From 55dc62571fc862430ea76054aa2cae315a533709 Mon Sep 17 00:00:00 2001 From: chuxing Date: Sun, 28 Mar 2021 14:54:28 +0800 Subject: [PATCH 4/9] fix ut --- tests/ut/ge/hybrid/ge_hybrid_unittest.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index 2166b274..8e32dd4f 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -315,6 +315,11 @@ TEST_F(UtestGeHybrid, test_parse_parallel_group) { ASSERT_EQ(builder.parallel_group_to_nodes_["group_1"].size(), 2); ASSERT_EQ(builder.parallel_group_to_nodes_["group_2"].size(), 1); + builder.parallel_group_to_nodes_.clear(); + builder.node_ref_inputs_.clear(); + model.node_items_[node] = std::move(node_item); + model.node_items_[node_1] = std::move(node_item_1); + ASSERT_FALSE(node_item->has_observer); ASSERT_TRUE(node_item_1->dependents_for_execution.empty()); ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); From b932d0a718507ec33ddb47cd36473cbbdb689539 Mon Sep 17 00:00:00 2001 From: chuxing Date: Sun, 28 Mar 2021 14:56:31 +0800 Subject: [PATCH 5/9] fix ut --- tests/ut/ge/hybrid/ge_hybrid_unittest.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index 8e32dd4f..2878a177 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -320,12 +320,12 @@ TEST_F(UtestGeHybrid, test_parse_parallel_group) { model.node_items_[node] = std::move(node_item); model.node_items_[node_1] = std::move(node_item_1); - ASSERT_FALSE(node_item->has_observer); - ASSERT_TRUE(node_item_1->dependents_for_execution.empty()); + ASSERT_FALSE(model.node_items_[node]->has_observer); + ASSERT_TRUE(model.node_items_[node_1]->dependents_for_execution.empty()); ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); ASSERT_TRUE(node_item->has_observer); - ASSERT_EQ(node_item_1->dependents_for_execution.size(), 1); - ASSERT_EQ(node_item_1->dependents_for_execution[0], node); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution.size(), 1); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution[0], node); // repeat parse ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); From 171fda818ac71c9082590f8b553bc45990701145 Mon Sep 17 00:00:00 2001 From: chuxing Date: Sun, 28 Mar 2021 15:07:22 +0800 Subject: [PATCH 6/9] fix ut --- tests/ut/ge/hybrid/ge_hybrid_unittest.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index 2878a177..e54e1bd3 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -323,7 +323,7 @@ TEST_F(UtestGeHybrid, test_parse_parallel_group) { ASSERT_FALSE(model.node_items_[node]->has_observer); ASSERT_TRUE(model.node_items_[node_1]->dependents_for_execution.empty()); ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); - ASSERT_TRUE(node_item->has_observer); + ASSERT_TRUE(model.node_items_[node]->has_observer); ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution.size(), 1); ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution[0], node); From 91baaf2393d61136f2536fe15b65d090fb0af8a9 Mon Sep 17 00:00:00 2001 From: chuxing Date: Sun, 28 Mar 2021 15:09:21 +0800 Subject: [PATCH 7/9] fix ut --- tests/ut/ge/hybrid/ge_hybrid_unittest.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index e54e1bd3..f38037a0 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -329,7 +329,7 @@ TEST_F(UtestGeHybrid, test_parse_parallel_group) { // repeat parse ASSERT_EQ(builder.ParseDependentByParallelGroup(), SUCCESS); - ASSERT_TRUE(node_item->has_observer); - ASSERT_EQ(node_item_1->dependents_for_execution.size(), 1); - ASSERT_EQ(node_item_1->dependents_for_execution[0], node); + ASSERT_TRUE(model.node_items_[node]->has_observer); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution.size(), 1); + ASSERT_EQ(model.node_items_[node_1]->dependents_for_execution[0], node); } \ No newline at end of file From 07b9a48f11b4779d4e6480aeca73ff54791948e1 Mon Sep 17 00:00:00 2001 From: zhaozhixuan Date: Sun, 28 Mar 2021 15:27:49 +0800 Subject: [PATCH 8/9] Fix error of single_op memory free. --- ge/graph/manager/graph_caching_allocator.cc | 8 ++++++++ ge/graph/manager/graph_caching_allocator.h | 7 +++++++ ge/single_op/single_op_manager.cc | 4 ++++ .../ge/graph/manager/graph_caching_allocator_unittest.cc | 1 + 4 files changed, 20 insertions(+) diff --git a/ge/graph/manager/graph_caching_allocator.cc b/ge/graph/manager/graph_caching_allocator.cc index 5822056d..cc8bd90d 100644 --- a/ge/graph/manager/graph_caching_allocator.cc +++ b/ge/graph/manager/graph_caching_allocator.cc @@ -356,6 +356,14 @@ void CachingAllocator::FreeBlocks() { (void) FreeCachedBlocks(); } +void CachingAllocator::TryFreeBlocks() { + GELOGI("Try free blocks."); + std::lock_guard lock(mutex_); + if (allocated_blocks_.empty()) { + (void) FreeCachedBlocks(); + } +} + void CachingAllocator::FreeBlockBins() { GELOGI("Free block bins."); std::lock_guard lock(mutex_); diff --git a/ge/graph/manager/graph_caching_allocator.h b/ge/graph/manager/graph_caching_allocator.h index 27563c2d..a9c3202a 100644 --- a/ge/graph/manager/graph_caching_allocator.h +++ b/ge/graph/manager/graph_caching_allocator.h @@ -94,6 +94,13 @@ class CachingAllocator { /// Status Free(uint8_t *memory_addr, uint32_t device_id = 0); + /// + /// @ingroup ge_graph + /// @brief try to free memory when no memory is referenced + /// @return void + /// + void TryFreeBlocks(); + private: /// diff --git a/ge/single_op/single_op_manager.cc b/ge/single_op/single_op_manager.cc index 6569764c..6246d6a1 100644 --- a/ge/single_op/single_op_manager.cc +++ b/ge/single_op/single_op_manager.cc @@ -19,6 +19,9 @@ #include #include +#include "graph/manager/graph_mem_allocator.h" +#include "graph/manager/graph_caching_allocator.h" + namespace ge { FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOpManager::~SingleOpManager() { for (auto &it : stream_resources_) { @@ -69,6 +72,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOpManager::Release delete it->second; it->second = nullptr; (void)stream_resources_.erase(it); + MemManager::Instance().CachingInstance(RT_MEMORY_HBM).TryFreeBlocks(); return SUCCESS; } diff --git a/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc b/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc index 7863a70f..a754758b 100644 --- a/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc +++ b/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc @@ -58,6 +58,7 @@ TEST_F(UtestGraphCachingAllocatorTest, malloc_success) { EXPECT_EQ(MemManager::Instance().Initialize(mem_type), SUCCESS); uint8_t *ptr = MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Malloc(kMByteSize); EXPECT_NE(nullptr, ptr); + MemManager::Instance().CachingInstance(RT_MEMORY_HBM).TryFreeBlocks(); MemManager::Instance().Finalize(); } From cbfc89d6300a6e7005db44463f94bc9de77deb45 Mon Sep 17 00:00:00 2001 From: zhaozhixuan Date: Sun, 28 Mar 2021 15:57:43 +0800 Subject: [PATCH 9/9] No need add ut. --- tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc b/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc index a754758b..7863a70f 100644 --- a/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc +++ b/tests/ut/ge/graph/manager/graph_caching_allocator_unittest.cc @@ -58,7 +58,6 @@ TEST_F(UtestGraphCachingAllocatorTest, malloc_success) { EXPECT_EQ(MemManager::Instance().Initialize(mem_type), SUCCESS); uint8_t *ptr = MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Malloc(kMByteSize); EXPECT_NE(nullptr, ptr); - MemManager::Instance().CachingInstance(RT_MEMORY_HBM).TryFreeBlocks(); MemManager::Instance().Finalize(); }