You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

step_parallel_utils.cc 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "frontend/parallel/step_parallel_utils.h"
  17. #include <inttypes.h>
  18. #include <sys/time.h>
  19. #include <algorithm>
  20. #include <map>
  21. #include <set>
  22. #include <string>
  23. #include <utility>
  24. #include "utils/hash_map.h"
  25. #include "base/core_ops.h"
  26. #include "frontend/operator/ops.h"
  27. #include "frontend/optimizer/optimizer.h"
  28. #include "frontend/parallel/context.h"
  29. #include "frontend/parallel/device_manager.h"
  30. #include "frontend/parallel/graph_util/generate_graph.h"
  31. #include "frontend/parallel/graph_util/graph_info.h"
  32. #include "frontend/parallel/graph_util/node_info.h"
  33. #include "frontend/parallel/graph_util/pipeline_split_utils.h"
  34. #include "frontend/parallel/node_check.h"
  35. #include "frontend/parallel/parameter_manager.h"
  36. #include "ir/param_info.h"
  37. #include "ir/tensor.h"
  38. #include "utils/trace_base.h"
  39. #include "utils/comm_manager.h"
  40. #include "utils/ms_context.h"
  41. #include "utils/symbolic.h"
  42. #include "mindspore/core/utils/parallel_node_check.h"
  43. namespace mindspore {
  44. namespace parallel {
  45. bool IsSomePrimitive(const CNodePtr &cnode, const std::string &name) {
  46. if (!cnode) {
  47. return false;
  48. }
  49. ValueNodePtr anf_node = cnode->input(0)->cast<ValueNodePtr>();
  50. MS_EXCEPTION_IF_NULL(anf_node);
  51. PrimitivePtr prim = anf_node->value()->cast<PrimitivePtr>();
  52. return (prim->name() == name);
  53. }
  54. bool IsParallelCareNode(const CNodePtr &cnode) {
  55. MS_EXCEPTION_IF_NULL(cnode);
  56. ValueNodePtr prim_node = cnode->input(0)->cast<ValueNodePtr>();
  57. if (prim_node == nullptr) {
  58. return false;
  59. }
  60. PrimitivePtr prim = prim_node->value()->cast<PrimitivePtr>();
  61. if (prim == nullptr) {
  62. return false;
  63. }
  64. if (IsInParallelBlackList(prim)) {
  65. MS_LOG(DEBUG) << "Parallel don't care node: " << prim->name();
  66. return false;
  67. }
  68. // get_next is not in the forward graph, we need mark the get_next as the forward node
  69. if (prim->name() == GET_NEXT || prim->name() == VIRTUAL_OUTPUT) {
  70. return true;
  71. }
  72. if ((prim->name() == CAST) && !cnode->has_user_data<OperatorInfo>()) {
  73. return false;
  74. }
  75. return cnode->in_forward_flag();
  76. }
  77. Shapes GetValueListShape(const AnfNodePtr &node) {
  78. Shapes shapes;
  79. std::vector<ValuePtr> inputs_seq;
  80. if (IsValueNode<ValueList>(node)) {
  81. inputs_seq = node->cast<ValueNodePtr>()->value()->cast<ValueListPtr>()->value();
  82. } else if (IsValueNode<ValueTuple>(node)) {
  83. inputs_seq = node->cast<ValueNodePtr>()->value()->cast<ValueTuplePtr>()->value();
  84. } else {
  85. MS_LOG(EXCEPTION) << "node is eigther ValueList or ValueTuple";
  86. }
  87. for (auto &ele : inputs_seq) {
  88. auto tensor = ele->cast<tensor::TensorPtr>();
  89. if (tensor == nullptr) {
  90. MS_LOG(WARNING) << "The value node is not a tensor";
  91. break;
  92. }
  93. auto one_shape = tensor->shape();
  94. shapes.push_back(one_shape);
  95. }
  96. return shapes;
  97. }
// Returns the shape(s) of `node` as a list of dimension vectors.
// - ValueList/ValueTuple nodes delegate to GetValueListShape.
// - A MakeRef cnode delegates to GetRefKeyNodeShape on its first real input.
// - A cnode whose input(0) is itself a CNode (indirect call) uses input(1)'s shape instead.
// - A tuple-shaped result yields one entry per element; otherwise a single entry.
// Throws when the node has no shape pointer or an element is not a plain Shape.
Shapes GetNodeShape(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  Shapes shapes;
  if (IsValueNode<ValueList>(node) || IsValueNode<ValueTuple>(node)) {
    return GetValueListShape(node);
  }
  BaseShapePtr base_shape_ptr = node->Shape();
  if (node->isa<CNode>()) {
    auto cnode = node->cast<CNodePtr>();
    if (IsValueNode<Primitive>(cnode->input(0))) {
      PrimitivePtr prim = GetValueNode<PrimitivePtr>(cnode->input(0));
      MS_EXCEPTION_IF_NULL(prim);
      if (prim->name() == MAKEREF) {
        // MakeRef wraps a parameter-like node; resolve the shape through the ref key.
        AnfNodePtr ref_node = cnode->input(1);
        auto func_graph = cnode->func_graph();
        MS_EXCEPTION_IF_NULL(ref_node);
        MS_EXCEPTION_IF_NULL(func_graph);
        return GetRefKeyNodeShape(ref_node, func_graph);
      }
    }
    if (cnode->input(0)->isa<CNode>()) {
      if (cnode->inputs().size() < 2) {
        MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " size is smaller than 2";
      }
      // input(0) is a call target, so fall back to the first argument's shape.
      base_shape_ptr = cnode->input(1)->Shape();
    }
  }
  if (base_shape_ptr == nullptr) {
    MS_LOG(EXCEPTION) << "GetNodeShape: " << node->ToString() << " shape_ptr is nullptr, full name is "
                      << node->fullname_with_scope();
  }
  auto tuple_shape_ptr = dyn_cast<abstract::SequenceShape>(base_shape_ptr);
  if (tuple_shape_ptr != nullptr) {
    // Multi-output node: collect one shape per tuple element.
    auto tuple_shape = tuple_shape_ptr->shape();
    for (auto &shape : tuple_shape) {
      auto each_shape = dyn_cast<abstract::Shape>(shape);
      MS_EXCEPTION_IF_NULL(each_shape);
      shapes.push_back(each_shape->shape());
    }
  } else {
    auto shape_ptr = dyn_cast<abstract::Shape>(base_shape_ptr);
    MS_EXCEPTION_IF_NULL(shape_ptr);
    shapes.push_back(shape_ptr->shape());
  }
  return shapes;
}
  144. RankList FindCommonMirrorGroup(const FuncGraphPtr &root) {
  145. auto parameters = root->parameters();
  146. for (auto &parameter : parameters) {
  147. auto param_ptr = parameter->cast<ParameterPtr>();
  148. MS_EXCEPTION_IF_NULL(param_ptr);
  149. if (!(param_ptr->has_default() && ParameterRequireGrad(param_ptr))) {
  150. continue;
  151. }
  152. size_t allow_repeat_num = 1;
  153. if (ParallelContext::GetInstance()->enable_parallel_optimizer() &&
  154. (!param_ptr->param_info() || param_ptr->param_info()->parallel_optimizer())) {
  155. if (ParallelContext::GetInstance()->optimizer_weight_shard_size() == -1) {
  156. MS_LOG(WARNING) << "The parameter :" << param_ptr->fullname_with_scope()
  157. << " is fully shard by optimizer parallel,"
  158. " thus cannot find common data parallel group for this rank";
  159. return {g_device_manager->global_rank()};
  160. }
  161. allow_repeat_num = size_t(ParallelContext::GetInstance()->optimizer_weight_shard_size());
  162. }
  163. if (IsFullySplitParameter(param_ptr, allow_repeat_num)) {
  164. MS_LOG(WARNING) << "The parameter :" << param_ptr->fullname_with_scope()
  165. << " is fully shard, thus cannot find common data parallel group for this rank";
  166. return {g_device_manager->global_rank()};
  167. }
  168. }
  169. AnfNodePtr ret = root->get_return();
  170. MS_EXCEPTION_IF_NULL(ret);
  171. std::vector<int64_t> common_group_list;
  172. std::vector<AnfNodePtr> all_nodes = DeepScopedGraphSearch(ret);
  173. bool is_first_group = true;
  174. for (auto &node : all_nodes) {
  175. if (!IsPrimitiveCNode(node, prim::kPrimMirror) && !IsPrimitiveCNode(node, prim::kPrimMirrorMicroStep) &&
  176. !IsPrimitiveCNode(node, prim::kPrimMirrorMiniStep)) {
  177. continue;
  178. }
  179. auto prim = GetCNodePrimitive(node);
  180. if (!prim->HasAttr(GROUP)) {
  181. MS_LOG(EXCEPTION) << "The mirror operator dose not have group attr : " << node->DebugString();
  182. }
  183. std::string group_name = GetValue<std::string>(prim->GetAttr(GROUP));
  184. std::vector<int64_t> group_list = g_device_manager->FindRankListByHashName(group_name);
  185. if (is_first_group) {
  186. common_group_list = group_list;
  187. is_first_group = false;
  188. } else {
  189. std::vector<int64_t> new_comm_group_list;
  190. std::set_intersection(common_group_list.begin(), common_group_list.end(), group_list.begin(), group_list.end(),
  191. std::back_inserter(new_comm_group_list));
  192. common_group_list = new_comm_group_list;
  193. }
  194. }
  195. MS_LOG(INFO) << "The common mirror group is:" << common_group_list;
  196. return common_group_list;
  197. }
  198. std::string CreateInstanceName(const CNodePtr &node, size_t index) {
  199. MS_EXCEPTION_IF_NULL(node);
  200. if (!IsValueNode<Primitive>(node->input(0))) {
  201. MS_LOG(EXCEPTION) << "CreateInstanceName: " << node->ToString() << " doesn't have primitive";
  202. }
  203. std::string name_base = node->fullname_with_scope();
  204. std::string name = name_base + "_" + std::to_string(index);
  205. std::string instance_name = HashInstanceName(name);
  206. return instance_name;
  207. }
  208. void SetCommunicationOpGroupLabel(std::vector<AnfNodePtr> new_node_input) {
  209. if (new_node_input.empty()) {
  210. return;
  211. }
  212. auto prim_anf_node = new_node_input[0]->cast<ValueNodePtr>();
  213. auto prim = GetValueNode<PrimitivePtr>(prim_anf_node);
  214. MS_EXCEPTION_IF_NULL(prim);
  215. auto attrs = prim->attrs();
  216. auto iter = attrs.find(GROUP);
  217. if (iter != attrs.end()) {
  218. auto value = iter->second;
  219. MS_EXCEPTION_IF_NULL(value);
  220. if (value->isa<StringImm>()) {
  221. std::string hash_name = value->cast<StringImmPtr>()->value();
  222. MS_EXCEPTION_IF_NULL(g_device_manager);
  223. std::string rank_list_name = g_device_manager->FindRankListNameByHashName(hash_name);
  224. (void)prim->AddAttr(GROUP_RANKS, MakeValue(rank_list_name));
  225. }
  226. }
  227. }
// Builds the input list of the cnode that will replace `node` with `replace_op`.
// The result starts with a fresh value node holding the instantiated primitive,
// followed by the original node's data input(s), with any constant operator
// parameters inserted at the positions recorded in the operator's params.
// Throws when instantiation fails or the node has an unexpected input count.
std::vector<AnfNodePtr> ReplaceOpInput(const Operator &replace_op, const std::string &instance_name,
                                       const CNodePtr &node) {
  OperatorArgs arg_replace_op = replace_op.second;
  ValuePtr pyop_instance = CreateOpInstance(arg_replace_op.first, replace_op.first, instance_name);
  if (pyop_instance == nullptr) {
    MS_LOG(EXCEPTION) << "Failure: " << replace_op.first << " CreateOpInstance failed";
  }
  OperatorParams params = arg_replace_op.second;
  if (node->inputs().size() < 2) {
    // The GetNext operator does not have any input; it is replaced by the primitive alone.
    if (node->inputs().size() == 1) {
      return {NewValueNode(pyop_instance)};
    }
    MS_LOG(EXCEPTION) << "Failure: " << node->ToString() << " size is smaller than 2";
  }
  std::vector<AnfNodePtr> replace_input = {NewValueNode(pyop_instance), node->input(1)};
  // EmbeddingLookup keeps two data inputs (params and indices) instead of one.
  if (replace_op.first == EMBEDDING_LOOKUP) {
    replace_input = {NewValueNode(pyop_instance), node->input(1), node->input(2)};
  }
  if (!params.empty()) {
    Param param_first = *(params.begin());
    int64_t first_position = param_first.second;
    // A first param at position 1 replaces the data input rather than following it,
    // so drop the data input before the positional inserts below.
    if (first_position == 1) {
      replace_input.pop_back();
    }
    // Insert each constant parameter value node at its recorded input position.
    // NOTE(review): positions appear to be relative to the list as it grows —
    // presumably params are ordered by position; confirm against callers.
    for (auto &param : params) {
      AnfNodePtr val = NewValueNode(param.first.second);
      if (val == nullptr) {
        MS_LOG(EXCEPTION) << "Failure:val is nullptr";
      }
      int64_t position = param.second;
      (void)replace_input.insert(replace_input.begin() + position, val);
    }
  } else if (replace_op.first == SYNC_BATCH_NORM) {
    // SyncBatchNorm forwards every remaining original input (scale, bias, mean, var, ...).
    for (size_t i = 2; i < node->inputs().size(); ++i) {
      replace_input.push_back(node->input(i));
    }
  }
  SetCommunicationOpGroupLabel(replace_input);
  return replace_input;
}
  269. void SetStridedSliceSplitStrategy(const std::vector<AnfNodePtr> &all_nodes) {
  270. for (auto &node : all_nodes) {
  271. if (!node->isa<CNode>()) {
  272. continue;
  273. }
  274. auto cnode = node->cast<CNodePtr>();
  275. MS_EXCEPTION_IF_NULL(cnode);
  276. if (!IsPrimitiveCNode(cnode, prim::kPrimStridedSlice)) {
  277. continue;
  278. }
  279. auto slice_prim = GetCNodePrimitive(cnode);
  280. MS_EXCEPTION_IF_NULL(slice_prim);
  281. if (slice_prim->HasAttr(FUNC_GRAPH_FLAG_STRIDED_SLICE)) {
  282. SetStridedSliceStrategy(cnode);
  283. }
  284. }
  285. }
  286. } // namespace parallel
  287. } // namespace mindspore