You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

_utils.py 8.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Utils of auto parallel"""
  16. from mindspore._c_expression import reset_op_id
  17. from mindspore.communication.management import get_group_size, get_rank
  18. from mindspore.parallel._auto_parallel_context import auto_parallel_context, _set_auto_parallel_context,\
  19. _reset_auto_parallel_context
  20. def _get_parallel_mode():
  21. return auto_parallel_context().get_parallel_mode()
  22. def _get_mirror_mean():
  23. return auto_parallel_context().get_mirror_mean()
  24. def _get_device_num():
  25. """Get the device num."""
  26. parallel_mode = auto_parallel_context().get_parallel_mode()
  27. if parallel_mode == "stand_alone":
  28. device_num = 1
  29. return device_num
  30. if auto_parallel_context().get_device_num_is_set() is False:
  31. device_num = get_group_size()
  32. else:
  33. device_num = auto_parallel_context().get_device_num()
  34. return device_num
  35. def _get_global_rank():
  36. """Get the global rank."""
  37. parallel_mode = auto_parallel_context().get_parallel_mode()
  38. if parallel_mode == "stand_alone":
  39. global_rank = 0
  40. return global_rank
  41. if auto_parallel_context().get_global_rank_is_set() is False:
  42. global_rank = get_rank()
  43. else:
  44. global_rank = auto_parallel_context().get_global_rank()
  45. return global_rank
  46. def _get_parameter_broadcast():
  47. """Get the parameter broadcast."""
  48. parallel_mode = auto_parallel_context().get_parallel_mode()
  49. if parallel_mode == "stand_alone":
  50. parameter_broadcast = False
  51. return parameter_broadcast
  52. if auto_parallel_context().get_parameter_broadcast_is_set() is True:
  53. parameter_broadcast = auto_parallel_context().get_parameter_broadcast()
  54. elif parallel_mode in ("data_parallel", "hybrid_parallel"):
  55. parameter_broadcast = True
  56. else:
  57. parameter_broadcast = False
  58. return parameter_broadcast
  59. def _device_number_check(parallel_mode, device_number):
  60. """
  61. Check device num.
  62. Args:
  63. parallel_mode (str): The parallel mode.
  64. device_number (int): The device number.
  65. """
  66. if parallel_mode == "stand_alone" and device_number != 1:
  67. raise ValueError("If parallel_mode is stand_alone, device_number must be 1, "
  68. "device_number: {0}, parallel_mode:{1}".format(device_number, parallel_mode))
  69. def _parameter_broadcast_check(parallel_mode, parameter_broadcast):
  70. """
  71. Check parameter broadcast.
  72. Note:
  73. If parallel mode is semi_auto_parallel or auto_parallel, parameter broadcast is not supported. Using the same
  74. random seed to make sure parameters on multiple devices are the same.
  75. Args:
  76. parallel_mode (str): The parallel mode.
  77. parameter_broadcast (bool): The parameter broadcast.
  78. Raises:
  79. ValueError: If parameter is broadcasted
  80. but the parallel mode is "stand_alone" or "semi_auto_parallel" or "auto_parallel").
  81. """
  82. if parameter_broadcast is True and parallel_mode in ("stand_alone", "semi_auto_parallel", "auto_parallel"):
  83. raise ValueError("stand_alone, semi_auto_parallel and auto_parallel "
  84. "do not support parameter broadcast, parallel_mode: {0}, parameter_broadcast:{1}"
  85. .format(parallel_mode, parameter_broadcast))
  86. _parallel_mode = None
  87. _device_num = None
  88. _global_rank = None
  89. _parameter_broadcast = None
  90. _mirror_mean = None
  91. _cast_before_mirror = None
  92. _loss_repeated_mean = None
  93. _communication_backend = None
  94. _has_checkpointed = False
  95. _enable_all_reduce_fusion = None
  96. def _checkpoint_auto_parallel_context():
  97. """checkpoint auto parallel context"""
  98. global _has_checkpointed
  99. if _has_checkpointed is True:
  100. return
  101. global _parallel_mode
  102. global _device_num
  103. global _global_rank
  104. global _parameter_broadcast
  105. global _mirror_mean
  106. global _cast_before_mirror
  107. global _loss_repeated_mean
  108. global _communication_backend
  109. global _enable_all_reduce_fusion
  110. _parallel_mode = auto_parallel_context().get_parallel_mode()
  111. _device_num = _get_device_num()
  112. _global_rank = _get_global_rank()
  113. _parameter_broadcast = auto_parallel_context().get_parameter_broadcast()
  114. _mirror_mean = auto_parallel_context().get_mirror_mean()
  115. _cast_before_mirror = auto_parallel_context().get_cast_before_mirror()
  116. _loss_repeated_mean = auto_parallel_context().get_loss_repeated_mean()
  117. _communication_backend = auto_parallel_context().get_communication_backend()
  118. _enable_all_reduce_fusion = auto_parallel_context().get_enable_all_reduce_fusion()
  119. _has_checkpointed = True
  120. def _restore_auto_parallel_context():
  121. """restore auto parallel context"""
  122. global _parallel_mode
  123. global _device_num
  124. global _global_rank
  125. global _parameter_broadcast
  126. global _mirror_mean
  127. global _cast_before_mirror
  128. global _loss_repeated_mean
  129. global _communication_backend
  130. global _enable_all_reduce_fusion
  131. _set_auto_parallel_context(parallel_mode=_parallel_mode, device_num=_device_num, global_rank=_global_rank,
  132. parameter_broadcast=_parameter_broadcast, mirror_mean=_mirror_mean,
  133. cast_before_mirror=_cast_before_mirror, loss_repeated_mean=_loss_repeated_mean)
  134. auto_parallel_context().set_communication_backend(_communication_backend)
  135. auto_parallel_context().set_enable_all_reduce_fusion(_enable_all_reduce_fusion)
  136. def _reset_checkpoint_auto_parallel_context():
  137. """reset the _has_checkpointed"""
  138. global _has_checkpointed
  139. _has_checkpointed = False
  140. def _callback_wrapper(list_callback, run_context, callback_type):
  141. """
  142. reset the context for callback of model train
  143. Raises:
  144. ValueError: If the type keyword is not recognized
  145. """
  146. _callback_func_map = {
  147. "begin": list_callback.begin,
  148. "epoch_begin": list_callback.epoch_begin,
  149. "step_begin": list_callback.step_begin,
  150. "step_end": list_callback.step_end,
  151. "epoch_end": list_callback.epoch_end,
  152. "end": list_callback.end}
  153. if callback_type not in _callback_func_map:
  154. raise ValueError("Get type keyword %s is not recognized!" % callback_type)
  155. func = _callback_func_map[callback_type]
  156. if callback_type == "begin":
  157. _reset_checkpoint_auto_parallel_context()
  158. _checkpoint_auto_parallel_context()
  159. global _parallel_mode
  160. if _parallel_mode == "stand_alone":
  161. func(run_context)
  162. return
  163. _reset_auto_parallel_context()
  164. func(run_context)
  165. _restore_auto_parallel_context()
  166. PARAMETER_CLONED_INDEX = 0
  167. class _CloneInfo():
  168. """
  169. The clone info of parameter.
  170. Attributes:
  171. be_cloned (bool): Whether the parameter is cloned.
  172. cloned (bool): Whether the parameter clone from other parameter.
  173. be_cloned_index (tuple): If the parameter is cloned, generate one index per clone.
  174. cloned_index (int): If the parameter clone from other parameter, it has a unique index.
  175. """
  176. def __init__(self):
  177. self.be_cloned = False
  178. self.cloned = False
  179. self.be_cloned_index = []
  180. self.cloned_index = None
  181. def _set_clone_info(clone_from, clone_to):
  182. """
  183. Set the clone info.
  184. Args:
  185. clone_from (_CloneInfo): The clone info of be_cloned parameter.
  186. clone_to (_CloneInfo): The clone info of cloned parameter.
  187. """
  188. global PARAMETER_CLONED_INDEX
  189. clone_to.be_cloned = False
  190. clone_to.cloned = True
  191. clone_to.be_cloned_index = []
  192. clone_to.cloned_index = PARAMETER_CLONED_INDEX
  193. clone_from.be_cloned = True
  194. clone_from.be_cloned_index.append(PARAMETER_CLONED_INDEX)
  195. PARAMETER_CLONED_INDEX = PARAMETER_CLONED_INDEX + 1
  196. def _get_python_op(op_name, op_path, instance_name, arglist):
  197. """Get python operator."""
  198. module = __import__(op_path, fromlist=["None"])
  199. cls = getattr(module, op_name)
  200. op = cls(*arglist)
  201. op.set_prim_instance_name(instance_name)
  202. return op
  203. def _reset_op_id():
  204. """Reset op id."""
  205. reset_op_id()