You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

allreduce_node.cc 3.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "parallel/allreduce_fusion/allreduce_node.h"
  17. #include <queue>
  18. #include "parallel/tensor_layout/tensor_layout.h"
  19. #include "utils/log_adapter.h"
  20. namespace mindspore {
  21. namespace parallel {
  22. Status AllreduceNode::AddNext(const AllreduceNodePtr &next_node) {
  23. if (next_node == nullptr) {
  24. MS_LOG(ERROR) << "next_node is nullptr!";
  25. return FAILED;
  26. }
  27. next_.emplace_back(next_node);
  28. return SUCCESS;
  29. }
  30. Status AllreduceNode::AddPrev(const AllreduceNodePtr &prev_node, double dist, double *max) {
  31. if (prev_node == nullptr) {
  32. MS_LOG(ERROR) << "next_node is nullptr!";
  33. return FAILED;
  34. }
  35. if (dist <= 0) {
  36. MS_LOG(ERROR) << "dist must be positive! dist: " << dist;
  37. return FAILED;
  38. }
  39. prev_.emplace_back(prev_node);
  40. double add_dist = prev_node->depend_feat_size() + dist;
  41. depend_feat_size_ += add_dist;
  42. if (depend_feat_size_ > *max) {
  43. *max = depend_feat_size_;
  44. }
  45. std::queue<AllreduceNodePtr> next_queue;
  46. for (auto &next : next_) {
  47. next_queue.push(next);
  48. }
  49. while (!next_queue.empty()) {
  50. auto ele = next_queue.front();
  51. ele->AddDependFeatSize(add_dist);
  52. if (ele->depend_feat_size() > *max) {
  53. *max = ele->depend_feat_size();
  54. }
  55. for (auto &next : ele->next()) {
  56. next_queue.push(next);
  57. }
  58. next_queue.pop();
  59. }
  60. return SUCCESS;
  61. }
  62. Status AllreduceNode::Init(const CNodePtr &cnode_ptr) {
  63. if (cnode_ptr == nullptr) {
  64. MS_LOG(ERROR) << "cnode_ptr is nullptr!";
  65. return FAILED;
  66. }
  67. cnode_ptr_ = cnode_ptr;
  68. return SUCCESS;
  69. }
  70. Status AllreduceNode::AddPara(const AnfNodePtr &node_ptr) {
  71. if (node_ptr == nullptr) {
  72. MS_LOG(ERROR) << "node_ptr is nullptr!";
  73. return FAILED;
  74. }
  75. if (!node_ptr->isa<Parameter>()) {
  76. MS_LOG(ERROR) << "node_ptr is not a ParameterPtr!";
  77. return FAILED;
  78. }
  79. auto para_ptr = node_ptr->cast<ParameterPtr>();
  80. MS_EXCEPTION_IF_NULL(para_ptr);
  81. auto layout_ptr = para_ptr->tensor_layout();
  82. if (layout_ptr == nullptr) {
  83. MS_LOG(ERROR) << "layout_ptr is nullptr!";
  84. return FAILED;
  85. }
  86. auto emplace_return = paras_.emplace(node_ptr);
  87. if (emplace_return.second) {
  88. double para_size = static_cast<double>(layout_ptr->slice_shape().size());
  89. curr_para_size_ += para_size;
  90. para_size_map_[node_ptr] = para_size;
  91. } else {
  92. MS_LOG(INFO) << "node already exist!";
  93. }
  94. return SUCCESS;
  95. }
  96. Status AllreduceNode::RemovePara(const AnfNodePtr &node_ptr) {
  97. if (node_ptr == nullptr) {
  98. MS_LOG(ERROR) << "node_ptr is nullptr!";
  99. return FAILED;
  100. }
  101. auto erase_num = paras_.erase(node_ptr);
  102. if (erase_num == 0) {
  103. MS_LOG(ERROR) << "para not find!";
  104. return FAILED;
  105. }
  106. curr_para_size_ -= para_size_map_[node_ptr];
  107. return SUCCESS;
  108. }
// Logs (at INFO level) a human-readable summary of this node: the backing
// CNode's debug string, the number of parameters, each parameter's name and
// recorded slice size, and the accumulated depend/parameter sizes.
// NOTE(review): despite the name, this does not return a string — it only
// writes to the log. Assumes cnode_ptr_ was set via Init(); no null check here.
void AllreduceNode::ToString() const {
MS_LOG(INFO) << "cnode: " << cnode_ptr_->DebugString() << "para size: " << paras_.size();
for (auto &para : paras_) {
// para_size_map_.at() relies on the invariant that AddPara records a size
// for every element of paras_; it throws if that invariant is broken.
MS_LOG(INFO) << "para name: " << para->fullname_with_scope() << " size: " << para_size_map_.at(para);
}
MS_LOG(INFO) << "depend_feat_size: " << depend_feat_size_ << " curr_para_size: " << curr_para_size_;
}
  116. } // namespace parallel
  117. } // namespace mindspore