
parameter_aggregator.cc

/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "ps/server/parameter_aggregator.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include <algorithm>

namespace mindspore {
namespace ps {
namespace server {
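// Prepares this aggregator for the given kernel node: registers the memory it needs, and creates the
// aggregation kernels and, except in federated learning mode, the optimizer kernel. threshold_count is
// used as both the push and pull threshold.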
bool ParameterAggregator::Init(const CNodePtr &cnode, size_t threshold_count) {
  MS_EXCEPTION_IF_NULL(cnode);
  memory_register_ = std::make_shared<MemoryRegister>();
  MS_EXCEPTION_IF_NULL(memory_register_);
  required_push_count_ = threshold_count;
  // The required_pull_count_ is the count for Pull, which should be the same as required_push_count_.
  // required_pull_count_ is normally used in parameter server training mode.
  required_pull_count_ = threshold_count;
  MS_LOG(DEBUG) << "Start initializing kernels for " << AnfAlgo::GetCNodeName(cnode);
  InitAggregationKernels(cnode);
  InitOptimizerKernels(cnode);
  return true;
}
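// Copies the incoming buffers into the server-side memory registered under the same names; entries whose
// names are not registered are skipped.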
bool ParameterAggregator::UpdateData(const std::map<std::string, Address> &new_data) {
  std::map<std::string, AddressPtr> &name_to_addr = memory_register_->addresses();
  for (const auto &data : new_data) {
    const std::string &name = data.first;
    if (name_to_addr.count(name) == 0) {
      continue;
    }
    MS_LOG(DEBUG) << "Update data for " << name << ". Destination size: " << name_to_addr[name]->size
                  << ". Source size: " << data.second.size;
    int ret = memcpy_s(name_to_addr[name]->addr, name_to_addr[name]->size, data.second.addr, data.second.size);
    if (ret != 0) {
      MS_LOG(ERROR) << "memcpy_s error, errno(" << ret << ")";
      return false;
    }
  }
  return true;
}
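// Launches every registered aggregation kernel with its prepared input/workspace/output addresses.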
bool ParameterAggregator::LaunchAggregators() {
  for (auto &aggregator_with_params : aggregation_kernel_parameters_) {
    KernelParams &params = aggregator_with_params.second;
    std::shared_ptr<kernel::AggregationKernel> aggr_kernel = aggregator_with_params.first;
    RETURN_IF_NULL(aggr_kernel, false);
    bool ret = aggr_kernel->Launch(params.inputs, params.workspace, params.outputs);
    if (!ret) {
      MS_LOG(ERROR) << "Launching aggregation kernel " << typeid(aggr_kernel.get()).name() << " failed.";
      continue;
    }
  }
  return true;
}
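// Launches every registered optimizer kernel; once all of them have been launched, optimizing is marked as done.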
bool ParameterAggregator::LaunchOptimizers() {
  for (auto &optimizer_with_params : optimizer_kernel_parameters_) {
    KernelParams &params = optimizer_with_params.second;
    std::shared_ptr<kernel::OptimizerKernel> optimizer_kernel = optimizer_with_params.first;
    RETURN_IF_NULL(optimizer_kernel, false);
    bool ret = optimizer_kernel->Launch(params.inputs, params.workspace, params.outputs);
    if (!ret) {
      MS_LOG(ERROR) << "Launching optimizer kernel " << typeid(optimizer_kernel.get()).name() << " failed.";
      continue;
    }
  }
  // Once all the optimizer kernels have been launched, optimizing for this ParameterAggregator is considered done.
  optimizing_done_ = true;
  return true;
}
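// Returns the aggregated weight buffer and advances the pull count; pulling is marked done once the pull
// threshold is reached.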
AddressPtr ParameterAggregator::Pull() {
  if (memory_register_ == nullptr) {
    MS_LOG(ERROR)
      << "The memory register of ParameterAggregator is nullptr. Please initialize ParameterAggregator first.";
    return nullptr;
  }
  current_pull_count_++;
  if (current_pull_count_ == required_pull_count_) {
    pulling_done_ = true;
  }
  MS_LOG(DEBUG) << "Pull count: " << current_pull_count_ << ". Pulling done status: " << pulling_done_;
  std::map<std::string, AddressPtr> &name_to_addr = memory_register_->addresses();
  return name_to_addr["weight"];
}
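// Returns the aggregated weight buffer without changing the pulling status.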
AddressPtr ParameterAggregator::GetWeight() {
  if (memory_register_ == nullptr) {
    MS_LOG(ERROR)
      << "The memory register of ParameterAggregator is nullptr. Please initialize ParameterAggregator first.";
    return nullptr;
  }
  std::map<std::string, AddressPtr> &name_to_addr = memory_register_->addresses();
  return name_to_addr["weight"];
}
void ParameterAggregator::ResetAggregationStatus() {
  for (auto &aggregator_with_params : aggregation_kernel_parameters_) {
    std::shared_ptr<kernel::AggregationKernel> aggr_kernel = aggregator_with_params.first;
    if (aggr_kernel == nullptr) {
      MS_LOG(ERROR) << "The aggregation kernel is nullptr.";
      continue;
    }
    aggr_kernel->Reset();
  }
  return;
}
void ParameterAggregator::ResetOptimizingStatus() { optimizing_done_ = false; }
void ParameterAggregator::ResetPullingStatus() {
  pulling_done_ = false;
  current_pull_count_ = 0;
}
bool ParameterAggregator::IsAggregationDone() const {
  // Only consider aggregation done after each aggregation kernel is done.
  for (auto &aggregator_with_params : aggregation_kernel_parameters_) {
    std::shared_ptr<kernel::AggregationKernel> aggr_kernel = aggregator_with_params.first;
    RETURN_IF_NULL(aggr_kernel, false);
    if (!aggr_kernel->IsAggregationDone()) {
      return false;
    }
  }
  return true;
}
bool ParameterAggregator::IsOptimizingDone() const { return optimizing_done_; }
bool ParameterAggregator::IsPullingDone() const { return pulling_done_; }
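// Creates and initializes the aggregation kernels selected for this node, assigns their memory and prepares
// their launch parameters.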
bool ParameterAggregator::InitAggregationKernels(const CNodePtr &cnode) {
  MS_EXCEPTION_IF_NULL(cnode);
  std::vector<std::string> aggr_kernel_names = SelectAggregationAlgorithm(cnode);
  for (const std::string &name : aggr_kernel_names) {
    auto aggr_kernel = kernel::AggregationKernelFactory::GetInstance().Create(name, cnode);
    if (aggr_kernel == nullptr) {
      MS_LOG(EXCEPTION) << "Failed to create aggregation kernel " << name << " for " << AnfAlgo::GetCNodeName(cnode);
      return false;
    }
    // set_done_count must be called before InitKernel because InitKernel may use this count.
    aggr_kernel->set_done_count(required_push_count_);
    aggr_kernel->InitKernel(cnode);
    const ReuseKernelNodeInfo &reuse_kernel_node_inputs_info = aggr_kernel->reuse_kernel_node_inputs_info();
    if (!AssignMemory(aggr_kernel, cnode, reuse_kernel_node_inputs_info, memory_register_)) {
      MS_LOG(EXCEPTION) << "Assigning memory for kernel " << name << " failed.";
      return false;
    }
    if (!GenerateAggregationKernelParams(aggr_kernel, memory_register_)) {
      MS_LOG(EXCEPTION) << "Generating aggregation kernel parameters for " << name << " failed.";
      return false;
    }
  }
  return true;
}
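// Creates and initializes the optimizer kernel for this node, assigns its memory and prepares its launch
// parameters; skipped in federated learning mode.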
bool ParameterAggregator::InitOptimizerKernels(const CNodePtr &cnode) {
  if (PSContext::instance()->server_mode() == kServerModeFL) {
    MS_LOG(DEBUG) << "Federated learning mode doesn't need optimizer kernels.";
    return false;
  }
  MS_EXCEPTION_IF_NULL(cnode);
  const std::string &name = AnfAlgo::GetCNodeName(cnode);
  auto optimizer_kernel = kernel::OptimizerKernelFactory::GetInstance().Create(name, cnode);
  if (optimizer_kernel == nullptr) {
    MS_LOG(EXCEPTION) << "Failed to create optimizer kernel for " << name;
    return false;
  }
  optimizer_kernel->InitKernel(cnode);
  const ReuseKernelNodeInfo &reuse_kernel_node_inputs_info = optimizer_kernel->reuse_kernel_node_inputs_info();
  if (!AssignMemory(optimizer_kernel, cnode, reuse_kernel_node_inputs_info, memory_register_)) {
    MS_LOG(EXCEPTION) << "Assigning memory for kernel " << name << " failed.";
    return false;
  }
  if (!GenerateOptimizerKernelParams(optimizer_kernel, memory_register_)) {
    MS_LOG(ERROR) << "Generating optimizer kernel parameters failed.";
    return false;
  }
  return true;
}
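// Registers memory for every input of the kernel: inputs listed in reuse_kernel_node_inputs_info reuse the
// memory already assigned to the corresponding parameter node, while the rest get newly allocated buffers.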
template <typename K>
bool ParameterAggregator::AssignMemory(K server_kernel, const CNodePtr &cnode,
                                       const ReuseKernelNodeInfo &reuse_kernel_node_inputs_info,
                                       std::shared_ptr<MemoryRegister> memory_register) {
  MS_EXCEPTION_IF_NULL(server_kernel);
  MS_EXCEPTION_IF_NULL(cnode);
  const std::vector<std::string> &input_names = server_kernel->input_names();
  const std::vector<size_t> &input_size_list = server_kernel->GetInputSizeList();
  if (input_names.size() != input_size_list.size()) {
    MS_LOG(EXCEPTION) << "Server kernel " << typeid(server_kernel.get()).name()
                      << " input counts do not match: input_names size is " << input_names.size()
                      << ", input_size_list size is " << input_size_list.size();
    return false;
  }
  if (reuse_kernel_node_inputs_info.size() > input_names.size()) {
    MS_LOG(EXCEPTION) << "The number of reuse kernel node information entries is invalid: got "
                      << reuse_kernel_node_inputs_info.size() << ", but input_names size is " << input_names.size();
    return false;
  }
  for (size_t i = 0; i < input_names.size(); i++) {
    const std::string &name = input_names[i];
    if (memory_register->addresses().count(name) != 0) {
      MS_LOG(DEBUG) << "The memory for " << name << " is already assigned.";
      continue;
    }
    if (reuse_kernel_node_inputs_info.count(name) != 0) {
      // Reusing memory of the kernel node means the memory of this input is already assigned by the front end,
      // that is, the input node is a parameter node.
      size_t index = reuse_kernel_node_inputs_info.at(name);
      MS_LOG(INFO) << "Try to reuse memory of kernel node " << AnfAlgo::GetCNodeName(cnode) << " for parameter "
                   << name << ", kernel node index " << index;
      AddressPtr input_addr = GenerateParameterNodeAddrPtr(cnode, index);
      MS_EXCEPTION_IF_NULL(input_addr);
      memory_register->RegisterAddressPtr(name, input_addr);
    } else {
      MS_LOG(INFO) << "Assign new memory for " << name;
      auto input_addr = std::make_unique<char[]>(input_size_list[i]);
      MS_EXCEPTION_IF_NULL(input_addr);
      memory_register->RegisterArray(name, &input_addr, input_size_list[i]);
    }
  }
  return true;
}
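// Collects the registered addresses of the kernel's inputs, workspace and outputs into a KernelParams entry.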
bool ParameterAggregator::GenerateAggregationKernelParams(const std::shared_ptr<kernel::AggregationKernel> aggr_kernel,
                                                          const std::shared_ptr<MemoryRegister> memory_register) {
  RETURN_IF_NULL(aggr_kernel, false);
  RETURN_IF_NULL(memory_register, false);
  KernelParams aggr_params = {};
  const std::vector<std::string> &input_names = aggr_kernel->input_names();
  std::transform(input_names.begin(), input_names.end(), std::back_inserter(aggr_params.inputs),
                 [&](const std::string &name) { return memory_register->addresses()[name]; });
  const std::vector<std::string> &workspace_names = aggr_kernel->workspace_names();
  std::transform(workspace_names.begin(), workspace_names.end(), std::back_inserter(aggr_params.workspace),
                 [&](const std::string &name) { return memory_register->addresses()[name]; });
  const std::vector<std::string> &output_names = aggr_kernel->output_names();
  std::transform(output_names.begin(), output_names.end(), std::back_inserter(aggr_params.outputs),
                 [&](const std::string &name) { return memory_register->addresses()[name]; });
  aggr_kernel->SetParameterAddress(aggr_params.inputs, aggr_params.workspace, aggr_params.outputs);
  aggregation_kernel_parameters_.push_back(std::make_pair(aggr_kernel, aggr_params));
  return true;
}
bool ParameterAggregator::GenerateOptimizerKernelParams(const std::shared_ptr<kernel::OptimizerKernel> optimizer_kernel,
                                                        const std::shared_ptr<MemoryRegister> memory_register) {
  RETURN_IF_NULL(optimizer_kernel, false);
  RETURN_IF_NULL(memory_register, false);
  KernelParams optimizer_params = {};
  const std::vector<std::string> &input_names = optimizer_kernel->input_names();
  std::transform(input_names.begin(), input_names.end(), std::back_inserter(optimizer_params.inputs),
                 [&](const std::string &name) { return memory_register->addresses()[name]; });
  const std::vector<std::string> &workspace_names = optimizer_kernel->workspace_names();
  std::transform(workspace_names.begin(), workspace_names.end(), std::back_inserter(optimizer_params.workspace),
                 [&](const std::string &name) { return memory_register->addresses()[name]; });
  const std::vector<std::string> &output_names = optimizer_kernel->output_names();
  std::transform(output_names.begin(), output_names.end(), std::back_inserter(optimizer_params.outputs),
                 [&](const std::string &name) { return memory_register->addresses()[name]; });
  optimizer_kernel_parameters_.push_back(std::make_pair(optimizer_kernel, optimizer_params));
  return true;
}
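// Maps the server mode to the aggregation algorithm: FedAvg for federated/hybrid mode, DenseGradAccum for
// parameter server mode.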
std::vector<std::string> ParameterAggregator::SelectAggregationAlgorithm(const CNodePtr &cnode) {
  std::vector<std::string> aggregation_algorithm = {};
  if (PSContext::instance()->server_mode() == kServerModeFL ||
      PSContext::instance()->server_mode() == kServerModeHybrid) {
    aggregation_algorithm.push_back("FedAvg");
  } else if (PSContext::instance()->server_mode() == kServerModePS) {
    aggregation_algorithm.push_back("DenseGradAccum");
  } else {
    MS_LOG(ERROR) << "Server doesn't support mode " << PSContext::instance()->server_mode();
  }
  MS_LOG(INFO) << "Aggregation algorithm selection result: " << aggregation_algorithm;
  return aggregation_algorithm;
}
template bool ParameterAggregator::AssignMemory(std::shared_ptr<kernel::OptimizerKernel> server_kernel,
                                                const CNodePtr &cnode,
                                                const ReuseKernelNodeInfo &reuse_kernel_node_inputs_info,
                                                std::shared_ptr<MemoryRegister> memory_register);
template bool ParameterAggregator::AssignMemory(std::shared_ptr<kernel::AggregationKernel> server_kernel,
                                                const CNodePtr &cnode,
                                                const ReuseKernelNodeInfo &reuse_kernel_node_inputs_info,
                                                std::shared_ptr<MemoryRegister> memory_register);
}  // namespace server
}  // namespace ps
}  // namespace mindspore
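
A minimal sketch of how one server round might drive this class, assuming ParameterAggregator is default-constructible as declared in parameter_aggregator.h, and with `RunOneRound`, `cnode`, `worker_data` and `kThreshold` as hypothetical placeholders supplied by the caller (none of them are part of this file):

// Hypothetical driver, for illustration only.
#include <map>
#include <memory>
#include <string>
#include "ps/server/parameter_aggregator.h"

namespace mindspore {
namespace ps {
namespace server {
bool RunOneRound(const CNodePtr &cnode, const std::map<std::string, Address> &worker_data, size_t kThreshold) {
  auto aggregator = std::make_shared<ParameterAggregator>();
  if (!aggregator->Init(cnode, kThreshold)) {
    return false;
  }
  // Each push copies the worker's buffers into the registered server-side memory.
  if (!aggregator->UpdateData(worker_data)) {
    return false;
  }
  if (!aggregator->LaunchAggregators()) {
    return false;
  }
  // After kThreshold pushes the aggregation kernels report done; then the optimizer (PS mode) can run.
  if (aggregator->IsAggregationDone()) {
    (void)aggregator->LaunchOptimizers();
  }
  // Clients pull the aggregated weight; the status flags are reset once pulling is done.
  AddressPtr weight = aggregator->Pull();
  if (aggregator->IsPullingDone()) {
    aggregator->ResetAggregationStatus();
    aggregator->ResetOptimizingStatus();
    aggregator->ResetPullingStatus();
  }
  return weight != nullptr;
}
}  // namespace server
}  // namespace ps
}  // namespace mindspore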