You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

_ps_context.py 4.9 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Context for parameter server training mode"""
  16. import os
  17. from mindspore._c_expression import PSContext
  18. _ps_context = None
  19. def ps_context():
  20. """
  21. Get the global _ps_context, if it is not created, create a new one.
  22. Returns:
  23. _ps_context, the global parameter server training mode context.
  24. """
  25. global _ps_context
  26. if _ps_context is None:
  27. _ps_context = PSContext.get_instance()
  28. return _ps_context
# Dispatch table: attribute name -> setter bound on the PSContext singleton.
# Consulted by _set_ps_context(); unknown keys raise ValueError there.
_set_ps_context_func_map = {
    "enable_ps": ps_context().set_ps_enable
}
# Dispatch table: attribute name -> getter bound on the PSContext singleton.
# Consulted by _get_ps_context(); unknown keys raise ValueError there.
_get_ps_context_func_map = {
    "enable_ps": ps_context().is_ps_mode
}
  35. def _get_ps_mode_rank():
  36. ps_rank = ps_context().ps_rank_id()
  37. if ps_rank == -1:
  38. raise RuntimeError("The parameter server mode training is not enabled yet.")
  39. return ps_rank
  40. def _set_ps_context(**kwargs):
  41. """
  42. Set parameter server training mode context.
  43. Note:
  44. Some other environment variables should also be set for parameter server training mode.
  45. These environment variables are listed below:
  46. .. code-block::
  47. MS_SERVER_NUM # Server number
  48. MS_WORKER_NUM # Worker number
  49. MS_SCHED_HOST # Scheduler IP address
  50. MS_SCHED_PORT # Scheduler port
  51. MS_ROLE # The role of this process:
  52. # MS_SCHED represents the scheduler,
  53. # MS_WORKER represents the worker,
  54. # MS_PSERVER represents the Server
  55. Args:
  56. enable_ps (bool): Whether to enable parameter server training mode.
  57. Only after enable_ps is set True, the environment variables will be effective.
  58. Default: False.
  59. Raises:
  60. ValueError: If input key is not the attribute in parameter server training mode context.
  61. Examples:
  62. >>> context.set_ps_context(enable_ps=True)
  63. """
  64. for key, value in kwargs.items():
  65. if key not in _set_ps_context_func_map:
  66. raise ValueError("Set PS context keyword %s is not recognized!" % key)
  67. set_func = _set_ps_context_func_map[key]
  68. set_func(value)
  69. def _get_ps_context(attr_key):
  70. """
  71. Get parameter server training mode context attribute value according to the key.
  72. Args:
  73. attr_key (str): The key of the attribute.
  74. Returns:
  75. Returns attribute value according to the key.
  76. Raises:
  77. ValueError: If input key is not attribute in auto parallel context.
  78. """
  79. if attr_key not in _get_ps_context_func_map:
  80. raise ValueError("Get PS context keyword %s is not recognized!" % attr_key)
  81. get_func = _get_ps_context_func_map[attr_key]
  82. value = get_func()
  83. return value
def _reset_ps_context():
    """
    Reset parameter server training mode context attributes to the default values:

    - enable_ps: False.
    """
    # Delegates to the C++ backend singleton's reset.
    ps_context().reset()
def _is_role_worker():
    """Return True if this process runs as a PS worker (backend-reported role)."""
    return ps_context().is_worker()
def _is_role_pserver():
    """Return True if this process runs as a PS server (backend-reported role)."""
    return ps_context().is_server()
def _is_role_sched():
    """Return True if this process runs as the PS scheduler (backend-reported role)."""
    return ps_context().is_scheduler()
def _insert_hash_table_size(name, cache_vocab_size, embedding_size, vocab_size):
    """Register an embedding hash table's sizes with the PS context.

    Thin wrapper over the backend call; presumably used by the embedding
    cache for parameter server training — semantics live in the C++ side.
    """
    ps_context().insert_hash_table_size(name, cache_vocab_size, embedding_size, vocab_size)
def _reinsert_hash_table_size(new_name, cur_name, cache_vocab_size, embedding_size):
    """Re-register an existing hash table under a new name in the PS context.

    Thin wrapper over the backend call; exact rename semantics are defined
    in the C++ side.
    """
    ps_context().reinsert_hash_table_size(new_name, cur_name, cache_vocab_size, embedding_size)
def _insert_weight_init_info(name, global_seed, op_seed):
    """Record the random seeds used to initialize parameter `name` in the PS context."""
    ps_context().insert_weight_init_info(name, global_seed, op_seed)
def _insert_accumu_init_info(name, init_val):
    """Record the initial value for accumulation parameter `name` in the PS context."""
    ps_context().insert_accumu_init_info(name, init_val)
def _clone_hash_table(dest_param_name, src_param_name):
    """Clone the hash table registered for `src_param_name` to `dest_param_name`."""
    ps_context().clone_hash_table(dest_param_name, src_param_name)
  106. def _set_cache_enable(cache_enable):
  107. # Environment variables are used to specify a maximum number of OpenBLAS threads:
  108. # In ubuntu(GPU) environment, numpy will use too many threads for computing,
  109. if cache_enable:
  110. os.environ['OPENBLAS_NUM_THREADS'] = '2'
  111. os.environ['GOTO_NUM_THREADS'] = '2'
  112. os.environ['OMP_NUM_THREADS'] = '2'
  113. ps_context().set_cache_enable(cache_enable)
def _set_rank_id(rank_id):
    """Set this process's rank id in the PS context (backend stores it)."""
    ps_context().set_rank_id(rank_id)