You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

parameter_manager.cc 26 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "frontend/parallel/parameter_manager.h"
  17. #include <inttypes.h>
  18. #include <sys/time.h>
  19. #include <algorithm>
  20. #include <map>
  21. #include <memory>
  22. #include <set>
  23. #include <string>
  24. #include <utility>
  25. #include "utils/hash_map.h"
  26. #include "base/core_ops.h"
  27. #include "frontend/operator/ops.h"
  28. #include "frontend/optimizer/optimizer.h"
  29. #include "frontend/parallel/context.h"
  30. #include "frontend/parallel/device_manager.h"
  31. #include "frontend/parallel/graph_util/generate_graph.h"
  32. #include "frontend/parallel/graph_util/graph_info.h"
  33. #include "frontend/parallel/graph_util/node_info.h"
  34. #include "frontend/parallel/graph_util/pipeline_split_utils.h"
  35. #include "frontend/parallel/node_check.h"
  36. #include "ir/param_info.h"
  37. #include "ir/tensor.h"
  38. #include "utils/trace_base.h"
  39. #include "utils/comm_manager.h"
  40. #include "utils/ms_context.h"
  41. #include "utils/symbolic.h"
  42. #include "mindspore/core/utils/parallel_node_check.h"
  43. #include "frontend/parallel/step_parallel_utils.h"
  44. namespace mindspore {
  45. namespace parallel {
  46. static ParameterUsersInfo FindRefKeyNodeUsers(const RefKeyPair &ref_key_pair, bool (*IsCareNode)(const CNodePtr &)) {
  47. // Dealing with the RefKey case
  48. ParameterUsersInfo parameter_user_info;
  49. auto refkeys = ref_key_pair.second;
  50. auto cnode = ref_key_pair.first;
  51. auto cnode_ptr = cnode->cast<CNodePtr>();
  52. if ((cnode_ptr == nullptr) || !IsValueNode<Primitive>(cnode_ptr->input(0)) || !IsCareNode(cnode_ptr)) {
  53. return parameter_user_info;
  54. }
  55. if (refkeys.size() > 1) {
  56. MS_LOG(EXCEPTION) << "CNode: " << cnode->fullname_with_scope() << "'s inputs have more than 1 RefKeys";
  57. }
  58. MS_EXCEPTION_IF_NULL(cnode->func_graph());
  59. auto cnode_func_graph = cnode->func_graph();
  60. MS_EXCEPTION_IF_NULL(cnode->func_graph()->manager());
  61. // Find the RefKey being used
  62. auto candidate_set_by_refkey = cnode_func_graph->manager()->node_users()[refkeys[0]];
  63. for (auto &candidate : candidate_set_by_refkey) {
  64. auto candidate_node = candidate.first;
  65. auto c = candidate_node->cast<CNodePtr>();
  66. if ((c == nullptr) || !IsValueNode<Primitive>(c->input(0)) || !IsCareNode(c)) {
  67. continue;
  68. }
  69. parameter_user_info.second.second.insert(candidate);
  70. }
  71. // Find the corresponding Parameter being used
  72. std::vector<AnfNodePtr> parameters = FindParameterByRefKeyNode(refkeys[0], cnode_func_graph);
  73. if (parameters.size() != 1) {
  74. MS_LOG(EXCEPTION) << "Find parameter by ref key node failed";
  75. }
  76. parameter_user_info.first = parameters[0]->cast<ParameterPtr>()->name();
  77. parameter_user_info.second.first = parameters[0];
  78. auto candidate_set_by_para = cnode_func_graph->manager()->node_users()[parameters[0]];
  79. for (auto &candidate : candidate_set_by_para) {
  80. auto candidate_node = candidate.first;
  81. auto c = candidate_node->cast<CNodePtr>();
  82. if ((c == nullptr) || !IsValueNode<Primitive>(c->input(0)) || !IsCareNode(c)) {
  83. continue;
  84. }
  85. parameter_user_info.second.second.insert(candidate);
  86. }
  87. return parameter_user_info;
  88. }
  89. static ParameterUsersInfo FindParameterNodeUsers(const AnfNodePtr &node) {
  90. // In this case, node is a Parameter
  91. ParameterUsersInfo parameter_user_info;
  92. MS_EXCEPTION_IF_NULL(node->func_graph());
  93. MS_EXCEPTION_IF_NULL(node->func_graph()->manager());
  94. auto candidate_set = node->func_graph()->manager()->node_users()[node];
  95. for (auto &candidate : candidate_set) {
  96. auto candidate_node = candidate.first;
  97. if (IsPrimitiveCNode(candidate_node, prim::kPrimLoad)) {
  98. if (candidate.second != 1) {
  99. continue;
  100. }
  101. auto load_node_users = node->func_graph()->manager()->node_users()[candidate_node];
  102. for (auto &node_user : load_node_users) {
  103. auto cnode = node_user.first->cast<CNodePtr>();
  104. if (cnode == nullptr || !cnode->has_user_data<OperatorInfo>() || IsSomePrimitive(cnode, RECEIVE)) {
  105. continue;
  106. }
  107. parameter_user_info.second.second.insert(node_user);
  108. }
  109. } else {
  110. auto c = candidate_node->cast<CNodePtr>();
  111. if (c == nullptr || !c->has_user_data<OperatorInfo>() || IsSomePrimitive(c, RECEIVE)) {
  112. continue;
  113. }
  114. parameter_user_info.second.second.insert(candidate);
  115. }
  116. }
  117. parameter_user_info.first = node->cast<ParameterPtr>()->name();
  118. parameter_user_info.second.first = node;
  119. return parameter_user_info;
  120. }
  121. static RefKeyPair CNodeWithRefKeys(const AnfNodePtr &cnode) {
  122. MS_EXCEPTION_IF_NULL(cnode);
  123. std::vector<AnfNodePtr> refkeys;
  124. if (cnode->isa<CNode>()) {
  125. auto cnode_ptr = cnode->cast<CNodePtr>();
  126. auto inputs = cnode_ptr->inputs();
  127. for (auto &one_input : inputs) {
  128. if (IsValueNode<RefKey>(one_input)) {
  129. refkeys.push_back(one_input);
  130. }
  131. }
  132. if (refkeys.size() >= 1) {
  133. return std::make_pair(cnode, refkeys);
  134. }
  135. }
  136. return {nullptr, refkeys};
  137. }
  138. ParameterUsersInfo FindParameterUsers(const AnfNodePtr &node, bool (*IsCareNode)(const CNodePtr &)) {
  139. ParameterUsersInfo parameter_users_info;
  140. auto cnode_with_refkeys = CNodeWithRefKeys(node);
  141. if (cnode_with_refkeys.first != nullptr) {
  142. // the node is a ref key node
  143. return FindRefKeyNodeUsers(cnode_with_refkeys, IsCareNode);
  144. } else if (node->isa<Parameter>()) {
  145. // the node is a parameter node
  146. return FindParameterNodeUsers(node);
  147. }
  148. return parameter_users_info;
  149. }
// Return true if `parameter` (a formal parameter of `graph`) is actually used by
// some computation. When the parameter is only forwarded into a called sub-graph
// (direct func-graph call, or J(fg) application), the check recurses on the
// callee's corresponding formal parameter (caller input i maps to formal i - 1).
// `max_depth` guards against unbounded recursion.
// NOTE(review): the loop returns the recursive verdict of the FIRST forwarding
// user it visits instead of combining all users — confirm this is intended.
static bool IsUsedParameter(const FuncGraphPtr &graph, const AnfNodePtr &parameter, size_t max_depth) {
  if (max_depth > MAX_RECURSIVE_DEPTH) {
    MS_LOG(EXCEPTION) << "Recursive call is larger than 100000.";
  }
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(parameter);
  auto manager = graph->manager();
  auto node_users = manager->node_users()[parameter];
  if (node_users.empty()) {
    // No user at all: definitely unused.
    return false;
  }
  for (auto node_user : node_users) {
    auto use_node = node_user.first->cast<CNodePtr>();
    if (IsValueNode<FuncGraph>(use_node->input(0))) {
      // Passed as an argument of a direct func-graph call: follow it into the callee.
      auto graph_sub = GetValueNode<FuncGraphPtr>(use_node->input(0));
      auto parameters = graph_sub->parameters();
      auto parameter_sub = parameters[IntToSize(node_user.second - 1)];
      return IsUsedParameter(graph_sub, parameter_sub, max_depth + 1);
    }
    if (use_node->input(0)->isa<CNode>()) {
      auto cnode = use_node->input(0)->cast<CNodePtr>();
      if (!IsSomePrimitive(cnode, J) || !IsValueNode<FuncGraph>(cnode->input(1))) {
        // Used by some other computed callee: treat as used.
        return true;
      }
      // J(fg)(args...): follow the parameter into the grad-transformed graph.
      auto graph_sub = GetValueNode<FuncGraphPtr>(cnode->input(1));
      auto parameters = graph_sub->parameters();
      auto parameter_sub = parameters[IntToSize(node_user.second - 1)];
      return IsUsedParameter(graph_sub, parameter_sub, max_depth + 1);
    }
    // Used directly by an ordinary cnode.
    return true;
  }
  return true;
}
  183. static RankList GetGroupByTensorInfo(const TensorInfo &tensor_info) {
  184. CheckGlobalDeviceManager();
  185. int64_t rank = g_device_manager->global_rank();
  186. RankList stage_device_list = g_device_manager->GetDeviceListInThisStage();
  187. Shape dev_matrix_shape = tensor_info.tensor_layout().device_arrangement().array();
  188. Shape tensor_map = tensor_info.tensor_layout().tensor_map().array();
  189. DeviceMatrix dev_matrix(rank, stage_device_list, dev_matrix_shape);
  190. RankList group_devices;
  191. if (dev_matrix.GetDevicesByTensorMap(tensor_map, &group_devices) != SUCCESS) {
  192. MS_LOG(EXCEPTION) << "Get devices by tensor map failed";
  193. }
  194. std::sort(group_devices.begin(), group_devices.end());
  195. return group_devices;
  196. }
// For one user (cnode, input-index) of a parameter, read the operator's tensor
// info to determine the slice shape of the parameter on this rank and the rank
// group sharing that slice.
static ParameterSliceInfo GetParameterSliceInfo(const std::pair<AnfNodePtr, int64_t> &param_info) {
  auto user_cnode = param_info.first->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(user_cnode);
  auto user_input_index = param_info.second;
  OperatorInfoPtr op_info = user_cnode->user_data<OperatorInfo>();
  MS_EXCEPTION_IF_NULL(op_info);
  TensorInfo tensor_info;
  if (IsPrimitiveCNode(user_cnode, prim::kPrimSend)) {
    // For Send the parameter's position is recorded in the PARAM_INDEX primal
    // attribute rather than implied by the cnode input index.
    auto param_index = IntToSize(GetValue<int>(user_cnode->GetPrimalAttr(PARAM_INDEX)));
    tensor_info = op_info->inputs_tensor_info()[param_index];
  } else {
    // cnode input i corresponds to inputs_tensor_info()[i - 1] (input 0 is the primitive).
    size_t input_tensor_info_size = op_info->inputs_tensor_info().size();
    if (SizeToLong(input_tensor_info_size) <= user_input_index - 1) {
      MS_LOG(EXCEPTION) << op_info->name() << ": the size of inputs tensor info is " << input_tensor_info_size
                        << ", but the index is " << (user_input_index - 1);
    }
    tensor_info = op_info->inputs_tensor_info()[LongToSize(user_input_index - 1)];
  }
  ParameterSliceInfo parameter_slice_info;
  parameter_slice_info.slice_shape = tensor_info.slice_shape();
  parameter_slice_info.group_ranks = GetGroupByTensorInfo(tensor_info);
  MS_LOG(DEBUG) << "The op name is " << op_info->name() << ", the parameter index is " << (user_input_index - 1)
                << ", the slice shape is " << tensor_info.slice_shape() << ", the origin shape is "
                << tensor_info.shape() << ", the group rank list is " << parameter_slice_info.group_ranks;
  return parameter_slice_info;
}
// Verify that every user of a shared parameter agrees on the parameter's slice
// shape, and — when pipeline parallelism is off (a single stage) — on the rank
// group holding that slice. Disagreement means an inconsistent parallel
// strategy, so compilation is aborted with an exception.
void CheckParameterSplit(const std::vector<AnfNodePtr> &all_nodes) {
  for (auto &node : all_nodes) {
    ParameterUsersInfo parameter_users_info = FindParameterUsers(node, IsParallelCareNode);
    auto &users_set = parameter_users_info.second.second;
    if (users_set.size() <= 1) {
      // Zero or one user: nothing to cross-check.
      continue;
    }
    auto parameter_name = parameter_users_info.first;
    MS_LOG(INFO) << "The parameter: " << parameter_name << " has " << users_set.size() << " users";
    // Compare every other user against the first one.
    auto &first_user = users_set.front();
    ParameterSliceInfo parameter_slice_info = GetParameterSliceInfo(first_user);
    Shape first_user_slice_shape = parameter_slice_info.slice_shape;
    RankList first_user_group_list = parameter_slice_info.group_ranks;
    for (auto iter = users_set.begin() + 1; iter != users_set.end(); ++iter) {
      auto &user = *iter;
      ParameterSliceInfo user_slice_info = GetParameterSliceInfo(user);
      Shape user_slice_shape = user_slice_info.slice_shape;
      RankList user_group_list = user_slice_info.group_ranks;
      if (first_user_slice_shape != user_slice_shape) {
        MS_LOG(EXCEPTION) << "The parameter: " << parameter_name
                          << " has multiple users, but the slice shapes are different";
      }
      // Across pipeline stages the groups may legitimately differ, so equality
      // is enforced only when there is a single stage.
      if (ParallelContext::GetInstance()->pipeline_stage_split_num() == 1 && first_user_group_list != user_group_list) {
        MS_LOG(EXCEPTION) << "The parameter: " << parameter_name
                          << " has multiple users, but the group rank list are different, "
                          << "the group rank list for first user is " << first_user_group_list
                          << ", and the group rank list for this user is " << user_group_list;
      }
    }
  }
}
namespace {
// Replace a SymbolicKeyInstance value node that refers to a root parameter with
// an embed(parameter) cnode, making the parameter reference explicit again for
// later passes. `node` is the value node holding the SymbolicKeyInstance.
void RevertSymbolicKeyInstance(const FuncGraphPtr &root, const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(root);
  MS_EXCEPTION_IF_NULL(node);
  auto symbolic_key = GetValueNode<SymbolicKeyInstancePtr>(node);
  MS_EXCEPTION_IF_NULL(symbolic_key);
  // Copy of the user set: Replace() below mutates the real node-users map.
  auto all_upstream_node = root->manager()->node_users()[node];
  for (auto &upstream_node : all_upstream_node) {
    FuncGraphPtr fg = upstream_node.first->func_graph();
    if (symbolic_key->node()->isa<Parameter>()) {
      for (auto &param : root->parameters()) {
        // Match the root parameter the symbolic key points at (value comparison).
        if (*param == *symbolic_key->node()) {
          AnfNodePtr reverted_node = root->NewCNode({NewValueNode(prim::kPrimEmbed), param});
          MS_EXCEPTION_IF_NULL(reverted_node);
          MS_LOG(DEBUG) << "before replace " << node->ToString() << " to node " << reverted_node->DebugString();
          (void)fg->manager()->Replace(node, reverted_node);
          MS_LOG(DEBUG) << "revert node " << node->ToString() << " to node " << reverted_node->DebugString();
        }
      }
    }
  }
}
}  // namespace
  277. void HandleSymbolicKeyInstance(const FuncGraphPtr &root, const std::vector<AnfNodePtr> &all_nodes) {
  278. MS_EXCEPTION_IF_NULL(root);
  279. for (auto &node : all_nodes) {
  280. // revert back SymbolicKeyInstance to embed() primitive
  281. if (IsValueNode<SymbolicKeyInstance>(node)) {
  282. RevertSymbolicKeyInstance(root, node);
  283. continue;
  284. }
  285. }
  286. }
  287. bool ParameterIsCloned(const AnfNodePtr &parameter_node) {
  288. MS_EXCEPTION_IF_NULL(parameter_node);
  289. auto cloned_parameter = parameter_node->cast<ParameterPtr>();
  290. MS_EXCEPTION_IF_NULL(cloned_parameter);
  291. // find the clone parameter
  292. if (!cloned_parameter->has_default()) {
  293. return false;
  294. }
  295. auto param_value = cloned_parameter->param_info();
  296. if (param_value == nullptr) {
  297. return false;
  298. }
  299. bool cloned = param_value->cloned();
  300. if (!cloned) {
  301. return false;
  302. }
  303. MS_LOG(INFO) << "The parameter: " << cloned_parameter->name() << " is cloned";
  304. return true;
  305. }
  306. void HandleNoUsedParameter(const FuncGraphPtr &root) {
  307. MS_EXCEPTION_IF_NULL(root);
  308. bool full_batch = ParallelContext::GetInstance()->full_batch();
  309. if (full_batch) {
  310. return;
  311. }
  312. // in grad accumulation mode, if use dynamic lr, it has some parameters in optimizer which no used for first graph,
  313. // but used for second graph(such as global_step), so can not change their shapes
  314. int64_t grad_accumulation_step = ParallelContext::GetInstance()->grad_accumulation_step();
  315. if (grad_accumulation_step > 1) {
  316. MS_LOG(INFO) << "In grad accumulation mode, do not handle no used parameters";
  317. return;
  318. }
  319. auto dev_num = g_device_manager->stage_device_num();
  320. auto parameters = root->parameters();
  321. for (auto &parameter : parameters) {
  322. if (IsUsedParameter(root, parameter, 0)) {
  323. continue;
  324. }
  325. auto parameter_shape = GetNodeShape(parameter);
  326. if (parameter_shape.empty()) {
  327. continue;
  328. }
  329. Shape slice_shape = parameter_shape[0];
  330. if (slice_shape.empty()) {
  331. continue;
  332. }
  333. slice_shape[0] = slice_shape[0] / dev_num;
  334. auto slice_shape_ptr = std::make_shared<abstract::Shape>(slice_shape);
  335. auto abstract = parameter->abstract();
  336. MS_EXCEPTION_IF_NULL(abstract);
  337. auto abstract_cloned = abstract->Clone();
  338. MS_EXCEPTION_IF_NULL(abstract_cloned);
  339. abstract_cloned->set_shape(slice_shape_ptr);
  340. parameter->set_abstract(abstract_cloned);
  341. }
  342. }
  343. bool IsFullySplitParameter(const ParameterPtr &param_ptr, size_t allow_repeat_num) {
  344. auto tensor_layout = param_ptr->user_data<parallel::TensorLayout>();
  345. if (tensor_layout == nullptr) {
  346. return false;
  347. }
  348. auto dev_mat_shape = tensor_layout->device_arrangement().array();
  349. auto tensor_map = tensor_layout->tensor_map().array();
  350. int64_t rank = g_device_manager->global_rank();
  351. RankList rank_list = g_device_manager->GetDeviceListInThisStage();
  352. DeviceMatrix dev_matrix(rank, rank_list, dev_mat_shape);
  353. RankList group_devices;
  354. if (dev_matrix.GetDevicesByTensorMap(tensor_map, &group_devices) != SUCCESS) {
  355. MS_LOG(WARNING) << "Get devices by tensor map failed, invalid tensor layout";
  356. return false;
  357. }
  358. if (group_devices.size() <= allow_repeat_num) {
  359. MS_LOG(INFO) << "The parameter: " << param_ptr->name() << " is fully split";
  360. return true;
  361. }
  362. return false;
  363. }
  364. static void InsertFullySplitParamGradAccu(const std::pair<AnfNodePtr, int> &node_user,
  365. const FuncGraphManagerPtr &manager, const AnfNodePtr &accu_parameter) {
  366. auto cnode = node_user.first->cast<CNodePtr>();
  367. auto prim = GetCNodePrimitive(cnode);
  368. if (prim == nullptr) {
  369. MS_LOG(WARNING) << cnode->DebugString() << " can not insert fully split param grad accumulation node";
  370. return;
  371. }
  372. OperatorAttrs attrs;
  373. auto py_instance = CreatOpInstance(attrs, "_VirtualAdd", "grad_accu");
  374. auto value_node = NewValueNode(py_instance);
  375. std::vector<AnfNodePtr> virtual_node_input = {value_node, cnode->input(IntToSize(node_user.second)), accu_parameter};
  376. auto graph = cnode->func_graph();
  377. auto virtual_node = graph->NewCNode(virtual_node_input);
  378. manager->SetEdge(cnode, node_user.second, virtual_node);
  379. }
  380. void HandleFullySplitParameters(const FuncGraphPtr &root) {
  381. int64_t grad_accumulation_step = ParallelContext::GetInstance()->grad_accumulation_step();
  382. if ((grad_accumulation_step <= 1) || root->has_flag(ACCUMULATION)) {
  383. return;
  384. }
  385. auto parameters = root->parameters();
  386. auto node_users_map = root->manager()->node_users();
  387. for (auto &parameter : parameters) {
  388. auto param_ptr = parameter->cast<ParameterPtr>();
  389. MS_EXCEPTION_IF_NULL(param_ptr);
  390. if (!IsFullySplitParameter(param_ptr)) {
  391. continue;
  392. }
  393. auto accu_parameter = FindGradAccuParameter(parameters, param_ptr->name());
  394. if (!accu_parameter) {
  395. continue; // some parameters no need to handle, such as itself or lr
  396. }
  397. auto node_users = node_users_map[parameter];
  398. for (auto &user : node_users) {
  399. auto node = user.first;
  400. auto cnode = node->cast<CNodePtr>();
  401. MS_EXCEPTION_IF_NULL(cnode);
  402. if (!cnode->in_forward_flag()) {
  403. continue;
  404. }
  405. InsertFullySplitParamGradAccu(user, root->manager(), accu_parameter);
  406. MS_LOG(INFO) << "Insert full split assign add node for " << param_ptr->name();
  407. break; // only need to insert once, if the parameter has many users
  408. }
  409. }
  410. }
// For every cloned parameter (optimizer states such as momentum or accu_grads),
// find the parameter it was cloned from and copy that source's tensor layout,
// sliced shape, and comm-fusion tag onto the clone, so optimizer states are
// sharded consistently with the original weight. Throws when a clone's source
// cannot be found.
void SetClonedTensorShapeForOptimizer(const FuncGraphPtr &root) {
  MS_EXCEPTION_IF_NULL(root);
  auto grad_accumulation_shard = ParallelContext::GetInstance()->grad_accumulation_shard();
  for (auto &cloned_parameter_node : root->parameters()) {
    MS_EXCEPTION_IF_NULL(cloned_parameter_node);
    auto cloned_parameter = cloned_parameter_node->cast<ParameterPtr>();
    MS_EXCEPTION_IF_NULL(cloned_parameter);
    if (!ParameterIsCloned(cloned_parameter_node)) {
      continue;
    }
    auto param_value = cloned_parameter->param_info();
    if (param_value == nullptr) {
      continue;
    }
    // get the cloned index
    int64_t cloned_index = param_value->cloned_index();
    // find the be cloned parameter
    bool found_be_cloned_parameter = false;
    ParameterPtr cloned_from_parameter = nullptr;
    AnfNodePtr cloned_from_node = nullptr;
    for (auto &be_cloned_parameter_node : root->parameters()) {
      MS_EXCEPTION_IF_NULL(be_cloned_parameter_node);
      auto be_cloned_parameter = be_cloned_parameter_node->cast<ParameterPtr>();
      MS_EXCEPTION_IF_NULL(be_cloned_parameter);
      if (!be_cloned_parameter->has_default()) {
        continue;
      }
      auto param_value_in = be_cloned_parameter->param_info();
      if (param_value_in == nullptr) {
        continue;
      }
      if (!param_value_in->be_cloned()) {
        continue;
      }
      // get the be cloned index; a source parameter may have several clones,
      // so match this clone's index against the source's recorded indices.
      auto &be_cloned_index = param_value_in->be_cloned_index();
      if (std::find(be_cloned_index.begin(), be_cloned_index.end(), cloned_index) != be_cloned_index.end()) {
        found_be_cloned_parameter = true;
        cloned_from_parameter = be_cloned_parameter;
        cloned_from_node = be_cloned_parameter_node;
      }
    }
    if (found_be_cloned_parameter) {
      // set the shape and tensor layout for cloned parameter
      std::string param_name = cloned_parameter_node->cast<ParameterPtr>()->name();
      if (cloned_from_parameter->user_data<TensorLayout>() == nullptr) {
        MS_LOG(WARNING) << "The parameter " << param_name << " has not tensor layout, skip it";
        continue;
      }
      auto tensor_layout = cloned_from_parameter->user_data<TensorLayout>();
      MS_EXCEPTION_IF_NULL(cloned_parameter_node->abstract());
      MS_EXCEPTION_IF_NULL(cloned_from_node->abstract());
      auto cloned_abstract = cloned_parameter_node->abstract()->Clone();
      MS_EXCEPTION_IF_NULL(cloned_abstract);
      // from pipeline or grad accumulation
      if (param_name.find(ACCU_GRADS) != std::string::npos) {
        auto slice_shape = cloned_from_parameter->user_data<TensorLayout>()->slice_shape().array();
        auto opt_shard_group = tensor_layout->opt_shard_group();
        auto opt_shard_shape = cloned_from_parameter->user_data<TensorLayout>()->opt_shard_slice_shape();
        std::shared_ptr<abstract::BaseShape> parallel_shape = nullptr;
        // set opt shard shape if the pipeline sharding is set
        if (grad_accumulation_shard && !opt_shard_group.empty()) {
          parallel_shape = std::make_shared<abstract::Shape>(opt_shard_shape);
        } else {
          parallel_shape = std::make_shared<abstract::Shape>(slice_shape);
        }
        MS_EXCEPTION_IF_NULL(parallel_shape);
        cloned_abstract->set_shape(parallel_shape);
        // in opt shard, accu_grad's shape is different from the original param's shape
        // if the grad_accumulation_shard is enabled, the accu_grads will be a opt-sharded shape
        if (!grad_accumulation_shard && ParallelContext::GetInstance()->enable_parallel_optimizer()) {
          // Give the clone a layout without the opt-shard group so it keeps the
          // plain sliced shape rather than the opt-sharded one.
          TensorLayout new_layout = *tensor_layout;
          new_layout.set_opt_shard_group("");
          tensor_layout = std::make_shared<TensorLayout>(new_layout);
        }
      } else {
        cloned_abstract->set_shape(cloned_from_node->abstract()->GetShapeTrack());
      }
      cloned_parameter->set_user_data<TensorLayout>(tensor_layout);
      cloned_parameter_node->set_abstract(cloned_abstract);
      // copy the fusion tag
      auto cloned_param_info = cloned_parameter->param_info();
      MS_EXCEPTION_IF_NULL(cloned_param_info);
      auto cloned_from_param_info = cloned_from_parameter->param_info();
      MS_EXCEPTION_IF_NULL(cloned_from_param_info);
      cloned_param_info->set_comm_fusion(cloned_from_param_info->comm_fusion());
      MS_LOG(INFO) << "The parameter: " << cloned_parameter->name()
                   << " is cloned, the be cloned parameter is: " << cloned_from_parameter->name()
                   << ", clone index is: " << cloned_index;
    } else {
      MS_LOG(EXCEPTION) << "The parameter: " << cloned_parameter->name() << " is cloned, cloned index is "
                        << cloned_index << ", but not found the be cloned parameter";
    }
  }
}
// Derive the layouts of AdaFactor optimizer states from their weight's layout:
// exp_avg_sq_row_<w> drops the weight's last dimension, exp_avg_sq_col_<w>
// drops the second-to-last dimension, and exp_avg_sq_<w> keeps the full layout
// (only applied when the weight is 1-D). Each matched state parameter gets a
// new TensorLayout and a correspondingly sliced abstract shape.
void HandleAdaFactorOpt(const FuncGraphPtr &root) {
  MS_EXCEPTION_IF_NULL(root);
  for (auto &param_node : root->parameters()) {
    MS_EXCEPTION_IF_NULL(param_node);
    auto param = param_node->cast<ParameterPtr>();
    MS_EXCEPTION_IF_NULL(param);
    std::string param_name = param->name();
    // Skip the optimizer-state parameters themselves; only weights drive the
    // derivation below.
    if (param_name.find(EXP_AVG) != std::string::npos) {
      continue;
    }
    auto tensor_layout = param->user_data<TensorLayout>();
    if (tensor_layout == nullptr) {
      continue;
    }
    // Track how many of this weight's states were handled so the inner scan
    // can stop early (two row/col states, or one exp_avg_sq state).
    int64_t row_col_count = 0;
    int64_t exp_avg_sq_count = 0;
    for (auto &row_col_node : root->parameters()) {
      MS_EXCEPTION_IF_NULL(row_col_node);
      auto row_col_param = row_col_node->cast<ParameterPtr>();
      MS_EXCEPTION_IF_NULL(row_col_param);
      std::string row_col_param_name = row_col_param->name();
      std::string exp_row_name = EXP_AVG_SQ_ROW + param_name;
      std::string exp_col_name = EXP_AVG_SQ_COL + param_name;
      std::string exp_avg_name = EXP_AVG_SQ + param_name;
      if ((row_col_param_name != exp_row_name) && (row_col_param_name != exp_col_name) &&
          (row_col_param_name != exp_avg_name)) {
        continue;
      }
      auto slice_shape = tensor_layout->slice_shape().array();
      auto shape_size = slice_shape.size();
      // Row/col states only exist for >= 2-D weights; exp_avg_sq only for 1-D.
      bool is_row_or_col_param = (row_col_param_name == exp_row_name) || (row_col_param_name == exp_col_name);
      if (is_row_or_col_param && shape_size <= 1) {
        continue;
      }
      if (row_col_param_name == exp_avg_name && shape_size != 1) {
        continue;
      }
      auto origin_shape = tensor_layout->tensor_shape().array();
      auto dev_mat = tensor_layout->device_arrangement().array();
      auto tensor_map = tensor_layout->tensor_map().array();
      if (row_col_param_name == exp_row_name) {
        // Row state: drop the last dimension of shape and tensor map.
        slice_shape.pop_back();
        origin_shape.pop_back();
        tensor_map.pop_back();
        row_col_count++;
      } else if (row_col_param_name == exp_col_name) {
        // Col state: drop the second-to-last dimension.
        // NOTE(review): 'different_type' is presumably a project typedef for a
        // signed difference type — confirm against the parallel headers.
        (void)slice_shape.erase(slice_shape.begin() + static_cast<different_type>(SECOND_FROM_END(shape_size)));
        (void)origin_shape.erase(origin_shape.begin() + static_cast<different_type>(SECOND_FROM_END(shape_size)));
        (void)tensor_map.erase(tensor_map.begin() + static_cast<different_type>(SECOND_FROM_END(shape_size)));
        row_col_count++;
      } else {
        exp_avg_sq_count++;
      }
      TensorLayout new_tensor_layout;
      if (new_tensor_layout.InitFromVector(dev_mat, tensor_map, origin_shape) != SUCCESS) {
        MS_LOG(EXCEPTION) << "Init tensor layout failed";
      }
      auto cloned_abstract = row_col_node->abstract()->Clone();
      MS_EXCEPTION_IF_NULL(cloned_abstract);
      std::shared_ptr<abstract::BaseShape> parallel_shape = std::make_shared<abstract::Shape>(slice_shape);
      MS_EXCEPTION_IF_NULL(parallel_shape);
      cloned_abstract->set_shape(parallel_shape);
      row_col_param->set_user_data<TensorLayout>(std::make_shared<TensorLayout>(new_tensor_layout));
      row_col_node->set_abstract(cloned_abstract);
      MS_LOG(INFO) << "Set the slice shape for " << row_col_param_name << ", origin shape is " << origin_shape
                   << ", new slice shape is " << slice_shape;
      if (row_col_count == 2 || exp_avg_sq_count == 1) {
        break;
      }
    }
  }
}
  578. } // namespace parallel
  579. } // namespace mindspore