You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

step_parallel_utils.cc 7.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "frontend/parallel/step_parallel_utils.h"
  17. #include <inttypes.h>
  18. #include <sys/time.h>
  19. #include <algorithm>
  20. #include <map>
  21. #include <set>
  22. #include <string>
  23. #include <unordered_map>
  24. #include <utility>
  25. #include "base/core_ops.h"
  26. #include "frontend/operator/ops.h"
  27. #include "frontend/optimizer/optimizer.h"
  28. #include "frontend/parallel/context.h"
  29. #include "frontend/parallel/device_manager.h"
  30. #include "frontend/parallel/graph_util/generate_graph.h"
  31. #include "frontend/parallel/graph_util/graph_info.h"
  32. #include "frontend/parallel/graph_util/node_info.h"
  33. #include "frontend/parallel/node_check.h"
  34. #include "ir/param_info.h"
  35. #include "ir/tensor.h"
  36. #include "utils/trace_base.h"
  37. #include "utils/comm_manager.h"
  38. #include "utils/ms_context.h"
  39. #include "utils/symbolic.h"
  40. #include "mindspore/core/utils/parallel_node_check.h"
  41. namespace mindspore {
  42. namespace parallel {
  43. bool IsSomePrimitive(const CNodePtr &cnode, const std::string &name) {
  44. if (!cnode) {
  45. return false;
  46. }
  47. ValueNodePtr anf_node = cnode->input(0)->cast<ValueNodePtr>();
  48. MS_EXCEPTION_IF_NULL(anf_node);
  49. PrimitivePtr prim = anf_node->value()->cast<PrimitivePtr>();
  50. return (prim->name() == name);
  51. }
  52. bool IsParallelCareNode(const CNodePtr &cnode) {
  53. MS_EXCEPTION_IF_NULL(cnode);
  54. ValueNodePtr prim_node = cnode->input(0)->cast<ValueNodePtr>();
  55. if (prim_node == nullptr) {
  56. return false;
  57. }
  58. PrimitivePtr prim = prim_node->value()->cast<PrimitivePtr>();
  59. if (prim == nullptr) {
  60. return false;
  61. }
  62. if (IsInParallelBlackList(prim)) {
  63. MS_LOG(DEBUG) << "Parallel don't care node: " << prim->name();
  64. return false;
  65. }
  66. // get_next is not in the forward graph, we need mark the get_next as the forward node
  67. if (prim->name() == GET_NEXT || prim->name() == VIRTUAL_OUTPUT) {
  68. return true;
  69. }
  70. if ((prim->name() == CAST) && !cnode->has_user_data<OperatorInfo>()) {
  71. return false;
  72. }
  73. return cnode->in_forward_flag();
  74. }
  75. Shapes GetValueListShape(const AnfNodePtr &node) {
  76. Shapes shapes;
  77. std::vector<ValuePtr> inputs_seq;
  78. if (IsValueNode<ValueList>(node)) {
  79. inputs_seq = node->cast<ValueNodePtr>()->value()->cast<ValueListPtr>()->value();
  80. } else if (IsValueNode<ValueTuple>(node)) {
  81. inputs_seq = node->cast<ValueNodePtr>()->value()->cast<ValueTuplePtr>()->value();
  82. } else {
  83. MS_LOG(EXCEPTION) << "node is eigther ValueList or ValueTuple";
  84. }
  85. for (auto &ele : inputs_seq) {
  86. auto tensor = ele->cast<tensor::TensorPtr>();
  87. if (tensor == nullptr) {
  88. MS_LOG(WARNING) << "The value node is not a tensor";
  89. break;
  90. }
  91. auto one_shape = tensor->shape();
  92. shapes.push_back(one_shape);
  93. }
  94. return shapes;
  95. }
// Returns the output shape(s) of `node` as a list of dimension vectors.
// Resolution order:
//   - ValueList/ValueTuple value nodes: delegate to GetValueListShape.
//   - MAKEREF cnodes: resolve through the referenced key node's shape.
//   - CNodes whose op slot (input 0) is itself a CNode: use input(1)'s shape
//     instead (presumably a call-through node — the called graph's first
//     argument carries the shape; TODO confirm against callers).
//   - Otherwise: the node's own abstract Shape.
// A tuple-shaped result yields one entry per element; a plain shape yields a
// single entry. Throws when no shape information is attached to the node.
Shapes GetNodeShape(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  Shapes shapes;
  if (IsValueNode<ValueList>(node) || IsValueNode<ValueTuple>(node)) {
    return GetValueListShape(node);
  }
  BaseShapePtr base_shape_ptr = node->Shape();
  if (node->isa<CNode>()) {
    auto cnode = node->cast<CNodePtr>();
    if (IsValueNode<Primitive>(cnode->input(0))) {
      PrimitivePtr prim = GetValueNode<PrimitivePtr>(cnode->input(0));
      MS_EXCEPTION_IF_NULL(prim);
      if (prim->name() == MAKEREF) {
        // MakeRef: the shape comes from the referenced node, not this cnode.
        AnfNodePtr ref_node = cnode->input(1);
        auto func_graph = cnode->func_graph();
        MS_EXCEPTION_IF_NULL(ref_node);
        MS_EXCEPTION_IF_NULL(func_graph);
        return GetRefKeyNodeShape(ref_node, func_graph);
      }
    }
    if (cnode->input(0)->isa<CNode>()) {
      if (cnode->inputs().size() < 2) {
        MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " size is smaller than 2";
      }
      base_shape_ptr = cnode->input(1)->Shape();
    }
  }
  if (base_shape_ptr == nullptr) {
    MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " shape_ptr is nullptr, full name is "
                      << node->fullname_with_scope();
  }
  // SequeueShape (sic — upstream spelling) covers both tuple and list shapes.
  auto tuple_shape_ptr = dyn_cast<abstract::SequeueShape>(base_shape_ptr);
  if (tuple_shape_ptr != nullptr) {
    auto tuple_shape = tuple_shape_ptr->shape();
    for (auto &shape : tuple_shape) {
      auto each_shape = dyn_cast<abstract::Shape>(shape);
      MS_EXCEPTION_IF_NULL(each_shape);
      shapes.push_back(each_shape->shape());
    }
  } else {
    auto shape_ptr = dyn_cast<abstract::Shape>(base_shape_ptr);
    MS_EXCEPTION_IF_NULL(shape_ptr);
    shapes.push_back(shape_ptr->shape());
  }
  return shapes;
}
  142. std::string CreateInstanceName(const CNodePtr &node, size_t index) {
  143. MS_EXCEPTION_IF_NULL(node);
  144. if (!IsValueNode<Primitive>(node->input(0))) {
  145. MS_LOG(EXCEPTION) << "CreateInstanceName: " << node->ToString() << " doesn't have primitive";
  146. }
  147. std::string name_base = node->fullname_with_scope();
  148. std::string name = name_base + "_" + std::to_string(index);
  149. std::string instance_name = HashInstanceName(name);
  150. return instance_name;
  151. }
  152. void SetCommunicationOpGroupLabel(std::vector<AnfNodePtr> new_node_input) {
  153. if (new_node_input.empty()) {
  154. return;
  155. }
  156. auto prim_anf_node = new_node_input[0]->cast<ValueNodePtr>();
  157. auto prim = GetValueNode<PrimitivePtr>(prim_anf_node);
  158. MS_EXCEPTION_IF_NULL(prim);
  159. auto attrs = prim->attrs();
  160. auto iter = attrs.find(GROUP);
  161. if (iter != attrs.end()) {
  162. auto value = iter->second;
  163. MS_EXCEPTION_IF_NULL(value);
  164. if (value->isa<StringImm>()) {
  165. std::string hash_name = value->cast<StringImmPtr>()->value();
  166. MS_EXCEPTION_IF_NULL(g_device_manager);
  167. std::string rank_list_name = g_device_manager->FindRankListNameByHashName(hash_name);
  168. (void)prim->AddAttr(GROUP_RANKS, MakeValue(rank_list_name));
  169. }
  170. }
  171. }
// Builds the input list for a node that replaces `node` with `replace_op`:
// a fresh ValueNode for the op instance (named `instance_name`) followed by
// the data inputs, with the op's constant parameters spliced in at their
// declared positions. Also propagates the communication group label.
// Throws if the op instance cannot be created or `node` has no inputs at all.
std::vector<AnfNodePtr> ReplaceOpInput(const Operator &replace_op, const std::string &instance_name,
                                       const CNodePtr &node) {
  OperatorArgs arg_replace_op = replace_op.second;
  ValuePtr pyop_instance = CreatOpInstance(arg_replace_op.first, replace_op.first, instance_name);
  if (pyop_instance == nullptr) {
    MS_LOG(EXCEPTION) << "Failure: " << replace_op.first << " CreatOpInstance failed";
  }
  OperatorParams params = arg_replace_op.second;
  if (node->inputs().size() < 2) {
    // The GetNext operator has no data inputs: op value node only.
    if (node->inputs().size() == 1) {
      return {NewValueNode(pyop_instance)};
    }
    MS_LOG(EXCEPTION) << "Failure: " << node->ToString() << " size is smaller than 2";
  }
  std::vector<AnfNodePtr> replace_input = {NewValueNode(pyop_instance), node->input(1)};
  // EmbeddingLookup keeps two data inputs (NOTE(review): assumes node has at
  // least 3 inputs in this case — confirm against callers).
  if (replace_op.first == EMBEDDING_LOOKUP) {
    replace_input = {NewValueNode(pyop_instance), node->input(1), node->input(2)};
  }
  if (!params.empty()) {
    Param param_first = *(params.begin());
    int64_t first_position = param_first.second;
    // A first parameter at position 1 replaces the original data input
    // entirely, so drop it before splicing the constants in.
    if (first_position == 1) {
      replace_input.pop_back();
    }
    for (auto &param : params) {
      AnfNodePtr val = NewValueNode(param.first.second);
      if (val == nullptr) {
        MS_LOG(EXCEPTION) << "Failure:val is nullptr";
      }
      // Each constant parameter carries its target index in the input list.
      int64_t position = param.second;
      (void)replace_input.insert(replace_input.begin() + position, val);
    }
  } else if (replace_op.first == SYNC_BATCH_NORM) {
    // SyncBatchNorm forwards all remaining original inputs unchanged.
    for (size_t i = 2; i < node->inputs().size(); ++i) {
      replace_input.push_back(node->input(i));
    }
  }
  SetCommunicationOpGroupLabel(replace_input);
  return replace_input;
}
  213. } // namespace parallel
  214. } // namespace mindspore