You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

device_matrix.cc 5.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "frontend/parallel/device_matrix.h"
  17. #include <algorithm>
  18. #include <cstdint>
  19. #include <functional>
  20. #include <numeric>
  21. #include <utility>
  22. #include <vector>
  23. #include "frontend/parallel/ops_info/operator_info.h"
  24. #include "frontend/parallel/status.h"
  25. #include "utils/log_adapter.h"
  26. namespace mindspore {
  27. namespace parallel {
  28. DeviceMatrix::DeviceMatrix(int32_t rank, RankList dev_list, Shape dev_shape)
  29. : rank_(rank), dev_list_(std::move(dev_list)), dev_shape_(std::move(dev_shape)) {
  30. if (!std::any_of(dev_list_.begin(), dev_list_.end(), [rank](int32_t a) { return a == rank; })) {
  31. MS_LOG(EXCEPTION) << "Rank " << rank << " is not in the current stage!";
  32. }
  33. int32_t total = std::accumulate(dev_shape_.begin(), dev_shape_.end(), 1, std::multiplies<int>());
  34. if (IntToSize(total) != dev_list_.size()) {
  35. MS_LOG(EXCEPTION) << "Device shape does not match the size of the device list!";
  36. }
  37. }
  38. Status DeviceMatrix::CreateGroupList() {
  39. size_t size = dev_shape_.size();
  40. RankList group;
  41. for (size_t i = 0; i < size; i++) {
  42. Status status = GetDevicesAlongDim(SizeToUint(i), &group);
  43. group_list_.push_back(group);
  44. if (status == Status::FAILED) {
  45. return Status::FAILED;
  46. }
  47. }
  48. return Status::SUCCESS;
  49. }
  50. Status DeviceMatrix::GetDevicesAlongDim(const uint32_t &dim, RankList *devices) {
  51. if (dim >= dev_shape_.size()) {
  52. MS_LOG(EXCEPTION) << "The dimension " << dim << " is out of the size of the device shape!";
  53. }
  54. if (dev_shape_[dim] == 1) {
  55. *devices = {rank_};
  56. return Status::SUCCESS;
  57. }
  58. RankList group;
  59. std::vector<RankList> local_group_list;
  60. // lower than dim
  61. int32_t step = 1;
  62. for (uint32_t i = dim + 1; i < dev_shape_.size(); i++) {
  63. step = step * dev_shape_[i];
  64. }
  65. int32_t num = *dev_list_.begin();
  66. for (int32_t i = 0; i < dev_shape_[dim]; i++) {
  67. group.push_back(num);
  68. num += step;
  69. }
  70. for (int32_t i = 0; i < step; i++) {
  71. local_group_list.push_back(group);
  72. (void)std::for_each(group.begin(), group.end(), [](int32_t &a) { a++; });
  73. }
  74. // higher than dim
  75. step = step * dev_shape_[dim];
  76. int32_t len = SizeToInt(dev_list_.size()) / step;
  77. // search rank
  78. int32_t target = rank_;
  79. for (int32_t i = 0; i < len; i++) {
  80. for (RankList &temp : local_group_list) {
  81. if (std::any_of(temp.begin(), temp.end(), [target](int32_t a) { return a == target; })) {
  82. *devices = temp;
  83. return Status::SUCCESS;
  84. }
  85. (void)std::for_each(temp.begin(), temp.end(), [step](int32_t &a) { a = a + step; });
  86. }
  87. }
  88. MS_LOG(ERROR) << "Can't find groups for rank" << rank_ << " in device list!";
  89. return Status::FAILED;
  90. }
  91. Shape ConvertRankToCoordinate(int32_t rank, const Shape &dev_shape) {
  92. Shape dev_coordinate;
  93. for (size_t i = 0; i < dev_shape.size(); ++i) {
  94. int32_t size = dev_shape[dev_shape.size() - i - 1];
  95. if (size == 0) {
  96. MS_LOG(EXCEPTION) << "Invalid dev shape: " << ShapeToString(dev_shape);
  97. } else {
  98. int32_t index = rank % size;
  99. (void)dev_coordinate.insert(dev_coordinate.begin(), index);
  100. rank = rank / size;
  101. }
  102. }
  103. return dev_coordinate;
  104. }
  105. Status DeviceMatrix::GetDevicesByTensorMap(const Shape &tensor_map, RankList *rank_list) {
  106. for (auto &element : tensor_map) {
  107. // -1 means the corresponding dimension is not split.
  108. if (element == MAP_NONE) {
  109. continue;
  110. } else if ((element < 0) || (IntToSize(element) >= dev_shape_.size())) {
  111. MS_LOG(ERROR) << "create group by tensor map: the tensor map is invalid";
  112. return FAILED;
  113. }
  114. }
  115. // Convert the global rank to the local rank(The index of the array) to compute the coordinate
  116. uint32_t local_rank = 0;
  117. for (auto &tmp_rank : dev_list_) {
  118. if (tmp_rank == rank_) {
  119. break;
  120. }
  121. ++local_rank;
  122. }
  123. if (local_rank == dev_list_.size()) {
  124. MS_LOG(ERROR) << "Rank id: " << local_rank << "is not in the device list.";
  125. return FAILED;
  126. }
  127. Shape current_rank_coordinate = ConvertRankToCoordinate((int32_t)local_rank, dev_shape_);
  128. for (uint32_t loop_local_rank = 0; loop_local_rank < dev_list_.size(); ++loop_local_rank) {
  129. Shape tmp_rank_coordinate = ConvertRankToCoordinate(loop_local_rank, dev_shape_);
  130. bool matched = true;
  131. for (auto &map : tensor_map) {
  132. if (map == MAP_NONE) {
  133. continue;
  134. }
  135. size_t index = dev_shape_.size() - IntToSize(map) - 1;
  136. if (current_rank_coordinate[index] != tmp_rank_coordinate[index]) {
  137. matched = false;
  138. break;
  139. }
  140. }
  141. if (matched) {
  142. rank_list->push_back(dev_list_[loop_local_rank]);
  143. }
  144. }
  145. return SUCCESS;
  146. }
  147. std::string ShapeToString(const Shape &shape) {
  148. std::string str = "[";
  149. for (size_t i = 0; i < shape.size(); ++i) {
  150. str += std::to_string(shape[i]);
  151. if (i < shape.size() - 1) {
  152. str += ", ";
  153. }
  154. }
  155. return str + "]";
  156. }
  157. std::string ListToString(const RankList &list) {
  158. std::string str = "[";
  159. for (auto &element : list) {
  160. str += std::to_string(element) + ", ";
  161. }
  162. return str + "]";
  163. }
  164. } // namespace parallel
  165. } // namespace mindspore