You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

ps_context.h 7.9 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_PS_CONTEXT_H_
  17. #define MINDSPORE_CCSRC_PS_CONTEXT_H_
  18. #include <map>
  19. #include <string>
  20. #include <memory>
  21. #include "ps/constants.h"
  22. #include "ps/core/cluster_metadata.h"
  23. namespace mindspore {
  24. namespace ps {
// Server operating modes (legal values of the `server_mode` context attribute).
constexpr char kServerModePS[] = "PARAMETER_SERVER";
constexpr char kServerModeFL[] = "FEDERATED_LEARNING";
constexpr char kServerModeHybrid[] = "HYBRID_TRAINING";
// Name of the environment variable that carries this process's cluster role.
constexpr char kEnvRole[] = "MS_ROLE";
// Legal values of MS_ROLE.
constexpr char kEnvRoleOfPServer[] = "MS_PSERVER";
constexpr char kEnvRoleOfServer[] = "MS_SERVER";
constexpr char kEnvRoleOfWorker[] = "MS_WORKER";
constexpr char kEnvRoleOfScheduler[] = "MS_SCHED";
// Default role for a process that is not part of a parameter-server cluster.
constexpr char kEnvRoleOfNotPS[] = "MS_NOT_PS";
// Use binary data to represent federated learning server's context so that we can judge which round resets the
// iteration. From right to left, each bit stands for:
// 0: Server is in parameter server mode.
// 1: Server is in federated learning mode.
// 2: Server is in mixed (hybrid) training mode.
// 3: Server enables secure aggregation.
// 4: Server needs worker to overwrite weights.
// For example: 01010 stands for that the server is in federated learning mode and secure aggregation is enabled.
// The round whose completion resets the iteration counter.
// NOTE(review): "kReconstructSeccrets" is a typo for "kReconstructSecrets"; kept unchanged because callers
// elsewhere reference this enumerator by name.
enum class ResetterRound { kNoNeedToReset, kUpdateModel, kReconstructSeccrets, kWorkerOverwriteWeights };
// Maps a server-context bitmask (encoding described above) to the round that resets the iteration.
const std::map<uint32_t, ResetterRound> kServerContextToResetRoundMap = {
  {0b00010, ResetterRound::kUpdateModel},             // federated learning mode only
  {0b01010, ResetterRound::kReconstructSeccrets},     // federated learning + secure aggregation
  {0b11100, ResetterRound::kWorkerOverwriteWeights},  // hybrid + secure aggregation + worker overwrite
  {0b10100, ResetterRound::kWorkerOverwriteWeights},  // hybrid + worker overwrite
  {0b00100, ResetterRound::kUpdateModel}};            // hybrid mode only
  49. class PSContext {
  50. public:
  51. ~PSContext() = default;
  52. PSContext(PSContext const &) = delete;
  53. PSContext &operator=(const PSContext &) = delete;
  54. static std::shared_ptr<PSContext> instance();
  55. void SetPSEnable(bool enabled);
  56. bool is_ps_mode() const;
  57. void Reset();
  58. std::string ms_role() const;
  59. bool is_worker() const;
  60. bool is_server() const;
  61. bool is_scheduler() const;
  62. uint32_t initial_worker_num();
  63. uint32_t initial_server_num();
  64. std::string scheduler_host();
  65. uint16_t scheduler_port();
  66. void SetPSRankId(int rank_id);
  67. int ps_rank_id() const;
  68. void InsertHashTableSize(const std::string &param_name, size_t cache_vocab_size, size_t embedding_size,
  69. size_t vocab_size) const;
  70. void ReInsertHashTableSize(const std::string &new_param_name, const std::string &cur_param_name,
  71. size_t cache_vocab_size, size_t embedding_size) const;
  72. void InsertWeightInitInfo(const std::string &param_name, size_t global_seed, size_t op_seed) const;
  73. void InsertAccumuInitInfo(const std::string &param_name, float init_val) const;
  74. void CloneHashTable(const std::string &dest_param_name, const std::string &src_param_name) const;
  75. void set_cache_enable(bool cache_enable) const;
  76. void set_rank_id(int rank_id) const;
  77. bool enable_ssl() const;
  78. void set_enable_ssl(bool enabled);
  79. // In new server framework, process role, worker number, server number, scheduler ip and scheduler port should be set
  80. // by ps_context.
  81. void set_server_mode(const std::string &server_mode);
  82. const std::string &server_mode() const;
  83. void set_ms_role(const std::string &role);
  84. void set_worker_num(uint32_t worker_num);
  85. uint32_t worker_num() const;
  86. void set_server_num(uint32_t server_num);
  87. uint32_t server_num() const;
  88. void set_scheduler_ip(const std::string &sched_ip);
  89. std::string scheduler_ip() const;
  90. void set_scheduler_port(uint16_t sched_port);
  91. uint16_t scheduler_port() const;
  92. // Methods federated learning.
  93. // Generate which round should reset the iteration.
  94. void GenerateResetterRound();
  95. ResetterRound resetter_round() const;
  96. void set_fl_server_port(uint16_t fl_server_port);
  97. uint16_t fl_server_port() const;
  98. // Set true if this process is a federated learning worker in cross-silo scenario.
  99. void set_fl_client_enable(bool enabled);
  100. bool fl_client_enable();
  101. void set_start_fl_job_threshold(size_t start_fl_job_threshold);
  102. size_t start_fl_job_threshold() const;
  103. void set_fl_name(const std::string &fl_name);
  104. const std::string &fl_name() const;
  105. // Set the iteration number of the federated learning.
  106. void set_fl_iteration_num(uint64_t fl_iteration_num);
  107. uint64_t fl_iteration_num() const;
  108. // Set the training epoch number of the client.
  109. void set_client_epoch_num(uint64_t client_epoch_num);
  110. uint64_t client_epoch_num() const;
  111. // Set the data batch size of the client.
  112. void set_client_batch_size(uint64_t client_batch_size);
  113. uint64_t client_batch_size() const;
  114. // Set true if worker will overwrite weights on server. Used in hybrid training.
  115. void set_worker_upload_weights(uint64_t worker_upload_weights);
  116. uint64_t worker_upload_weights() const;
  117. // Set true if using secure aggregation for federated learning.
  118. void set_secure_aggregation(bool secure_aggregation);
  119. bool secure_aggregation() const;
  120. private:
  121. PSContext()
  122. : ps_enabled_(false),
  123. is_worker_(false),
  124. is_pserver_(false),
  125. is_sched_(false),
  126. enable_ssl_(false),
  127. rank_id_(-1),
  128. worker_num_(0),
  129. server_num_(0),
  130. scheduler_host_(""),
  131. scheduler_port_(0),
  132. role_(kEnvRoleOfNotPS),
  133. server_mode_(""),
  134. resetter_round_(ResetterRound::kNoNeedToReset),
  135. fl_server_port_(0),
  136. fl_client_enable_(false),
  137. fl_name_(""),
  138. start_fl_job_threshold_(0),
  139. fl_iteration_num_(0),
  140. client_epoch_num_(0),
  141. client_batch_size_(0),
  142. secure_aggregation_(false),
  143. worker_upload_weights_(false) {}
  144. bool ps_enabled_;
  145. bool is_worker_;
  146. bool is_pserver_;
  147. bool is_sched_;
  148. bool enable_ssl_;
  149. int rank_id_;
  150. uint32_t worker_num_;
  151. uint32_t server_num_;
  152. std::string scheduler_host_;
  153. uint16_t scheduler_port_;
  154. // The server process's role.
  155. std::string role_;
  156. // Server mode which could be Parameter Server, Federated Learning and Hybrid Training mode.
  157. std::string server_mode_;
  158. // The round which will reset the iteration. Used in federated learning for now.
  159. ResetterRound resetter_round_;
  160. // Http port of federated learning server.
  161. uint16_t fl_server_port_;
  162. // Whether this process is the federated client. Used in cross-silo scenario of federated learning.
  163. bool fl_client_enable_;
  164. // Federated learning job name.
  165. std::string fl_name_;
  166. // The threshold count of startFLJob round. Used in federated learning for now.
  167. size_t start_fl_job_threshold_;
  168. // Iteration number of federeated learning, which is the number of interactions between client and server.
  169. uint64_t fl_iteration_num_;
  170. // Client training epoch number. Used in federated learning for now.
  171. uint64_t client_epoch_num_;
  172. // Client training data batch size. Used in federated learning for now.
  173. uint64_t client_batch_size_;
  174. // Whether to use secure aggregation algorithm. Used in federated learning for now.
  175. bool secure_aggregation_;
  176. // Whether there's a federated learning worker uploading weights to federated learning server. Used in hybrid training
  177. // mode for now.
  178. bool worker_upload_weights_;
  179. };
  180. } // namespace ps
  181. } // namespace mindspore
  182. #endif // MINDSPORE_CCSRC_PS_CONTEXT_H_