You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

test_pipeline_opt_detection.py 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. # Copyright 2022 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. import numpy as np
  16. import mindspore as ms
  17. import mindspore.nn as nn
  18. from mindspore import context
  19. from mindspore import Tensor
  20. from mindspore.ops import operations as P
  21. from mindspore.common.parameter import Parameter
  22. from mindspore.common.initializer import initializer
  23. from mindspore.train.model import Model
  24. from mindspore.nn.wrap.cell_wrapper import PipelineCell, MicroBatchInterleaved
  25. class DatasetLenet():
  26. def __init__(self, data, label, length=3):
  27. self.data = data
  28. self.label = label
  29. self.index = 1
  30. self.length = length
  31. def __iter__(self):
  32. return self
  33. def __next__(self):
  34. if self.index >= self.length:
  35. raise StopIteration
  36. self.index += 1
  37. return self.data, self.label
  38. def reset(self):
  39. self.index = 0
  40. @staticmethod
  41. def get_dataset_size():
  42. return 32
  43. @staticmethod
  44. def get_repeat_count():
  45. return 1
  46. @staticmethod
  47. def get_batch_size():
  48. return 32
  49. def create_tuple_iterator(self, num_epochs=1, do_copy=True):
  50. return self
  51. class MatMulCell(nn.Cell):
  52. def __init__(self, strategy1, strategy2, param=None, dtype=ms.float32):
  53. super().__init__()
  54. self.param = Parameter(initializer("zeros", [64, 64]), name="param")
  55. if param is not None:
  56. self.param = param
  57. self.param1 = Parameter(initializer("zeros", [64, 64]), name="param1")
  58. self.matmul = P.MatMul().shard(strategy1)
  59. self.matmul1 = P.MatMul().shard(strategy2)
  60. self.cast = P.Cast()
  61. self.dtype = dtype
  62. def construct(self, x):
  63. out = self.matmul(self.cast(x, self.dtype), self.cast(self.param, self.dtype))
  64. out = self.matmul1(out, self.cast(self.param1, self.dtype))
  65. return out
  66. class Net(nn.Cell):
  67. def __init__(self, strategy1, strategy2, param=None, dtype=ms.float32):
  68. super().__init__()
  69. self.block = nn.CellList()
  70. for i in range(2):
  71. cell = MatMulCell(strategy1, strategy2, param, dtype)
  72. cell.pipeline_stage = i
  73. self.block.append(cell)
  74. def construct(self, x):
  75. for i in range(2):
  76. x = self.block[i](x)
  77. return x
  78. class PipelineSplit(nn.Cell):
  79. def __init__(self, strategy1, strategy2, dtype=ms.float32):
  80. super().__init__()
  81. self.cell = Net(strategy1, strategy2, dtype=dtype)
  82. def construct(self, x, label):
  83. x = self.cell(x)
  84. return x
  85. class PipelineSplitSharedParam(nn.Cell):
  86. def __init__(self, strategy1, strategy2, dtype=ms.float32):
  87. super().__init__()
  88. self.param = Parameter(initializer("zeros", [64, 64]), name="param")
  89. self.cell = Net(strategy1, strategy2, self.param, dtype)
  90. def construct(self, x, label):
  91. x = self.cell(x)
  92. return x
  93. def test_pipeline_split_stage0():
  94. """
  95. Feature:pipeline stage0 + opt detection
  96. Description:pipeline opt detection
  97. Expectation:success
  98. """
  99. context.set_auto_parallel_context(device_num=8, global_rank=0, pipeline_stages=2)
  100. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  101. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  102. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  103. strategy1 = ((4, 1), (1, 1))
  104. strategy2 = ((2, 1), (1, 1))
  105. net = PipelineCell(PipelineSplit(strategy1, strategy2), 4)
  106. params = net.trainable_params()
  107. dataset = DatasetLenet(data, label, 3)
  108. optimizer = nn.Lamb(params, learning_rate=0.01)
  109. model = Model(net, optimizer=optimizer)
  110. model.train(2, dataset, dataset_sink_mode=False)
  111. for _, param in model._train_network.parameters_and_names():
  112. assert param.name != "cell.block.1.param"
  113. assert param.name != "cell.block.1.param1"
  114. def test_pipeline_split_stage1():
  115. """
  116. Feature:pipeline stage1 + opt detection
  117. Description:pipeline opt detection
  118. Expectation:success
  119. """
  120. context.set_auto_parallel_context(device_num=8, global_rank=4, pipeline_stages=2)
  121. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  122. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  123. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  124. strategy1 = ((4, 1), (1, 1))
  125. strategy2 = ((2, 1), (1, 1))
  126. net = PipelineCell(PipelineSplit(strategy1, strategy2), 4)
  127. params = net.trainable_params()
  128. dataset = DatasetLenet(data, label, 4)
  129. optimizer = nn.Lamb(params, learning_rate=0.001)
  130. model = Model(net, optimizer=optimizer)
  131. model.train(2, dataset, dataset_sink_mode=False)
  132. for _, param in model._train_network.parameters_and_names():
  133. assert param.name != "cell.block.0.param"
  134. assert param.name != "cell.block.0.param1"
  135. def test_pipeline_split_shared_parameter_stage0():
  136. """
  137. Feature:pipeline stage0 + opt detection + shared parameter
  138. Description:pipeline opt detection
  139. Expectation:success
  140. """
  141. context.set_auto_parallel_context(device_num=8, global_rank=0, pipeline_stages=2)
  142. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  143. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  144. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  145. strategy1 = ((4, 1), (1, 1))
  146. strategy2 = ((2, 1), (1, 1))
  147. net = PipelineCell(PipelineSplitSharedParam(strategy1, strategy2), 4)
  148. params = net.trainable_params()
  149. dataset = DatasetLenet(data, label, 6)
  150. optimizer = nn.Lamb(params, learning_rate=0.03)
  151. model = Model(net, optimizer=optimizer)
  152. model.train(2, dataset, dataset_sink_mode=False)
  153. def test_pipeline_split_shared_parameter_stage1():
  154. """
  155. Feature:pipeline stage1 + opt detection + shared parameter
  156. Description:pipeline opt detection
  157. Expectation:success
  158. """
  159. context.set_auto_parallel_context(device_num=8, global_rank=4, pipeline_stages=2)
  160. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  161. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  162. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  163. strategy1 = ((4, 1), (1, 1))
  164. strategy2 = ((2, 1), (1, 1))
  165. net = PipelineCell(PipelineSplitSharedParam(strategy1, strategy2), 4)
  166. params = net.trainable_params()
  167. dataset = DatasetLenet(data, label, 7)
  168. optimizer = nn.Lamb(params, learning_rate=0.04)
  169. model = Model(net, optimizer=optimizer)
  170. model.train(2, dataset, dataset_sink_mode=False)
  171. def test_pipeline_split_stage0_opt_shard():
  172. """
  173. Feature:pipeline stage0 + opt detection + opt shard
  174. Description:pipeline opt detection
  175. Expectation:success
  176. """
  177. context.set_auto_parallel_context(device_num=8, global_rank=0, pipeline_stages=2, enable_parallel_optimizer=True)
  178. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  179. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  180. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  181. strategy1 = ((4, 1), (1, 1))
  182. strategy2 = ((2, 1), (1, 1))
  183. net = PipelineCell(PipelineSplit(strategy1, strategy2), 4)
  184. params = net.trainable_params()
  185. dataset = DatasetLenet(data, label, 6)
  186. optimizer = nn.Lamb(params, learning_rate=0.02)
  187. model = Model(net, optimizer=optimizer)
  188. model.train(2, dataset, dataset_sink_mode=False)
  189. for _, param in model._train_network.parameters_and_names():
  190. assert param.name != "cell.block.1.param"
  191. assert param.name != "cell.block.1.param1"
  192. def test_pipeline_split_stage1_opt_shard():
  193. """
  194. Feature:pipeline stage1 + opt detection + opt shard
  195. Description:pipeline opt detection
  196. Expectation:success
  197. """
  198. context.set_auto_parallel_context(device_num=8, global_rank=4, pipeline_stages=2, enable_parallel_optimizer=True)
  199. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  200. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  201. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  202. strategy1 = ((4, 1), (1, 1))
  203. strategy2 = ((2, 1), (1, 1))
  204. net = PipelineCell(PipelineSplit(strategy1, strategy2), 4)
  205. params = net.trainable_params()
  206. dataset = DatasetLenet(data, label, 8)
  207. optimizer = nn.Lamb(params, learning_rate=0.04)
  208. model = Model(net, optimizer=optimizer)
  209. model.train(2, dataset, dataset_sink_mode=False)
  210. for _, param in model._train_network.parameters_and_names():
  211. assert param.name != "cell.block.0.param"
  212. assert param.name != "cell.block.0.param1"
  213. def test_pipeline_split_shared_parameter_stage0_opt_shard():
  214. """
  215. Feature:pipeline stage0 + opt detection + opt shard + shared parameter
  216. Description:pipeline opt detection
  217. Expectation:success
  218. """
  219. context.set_auto_parallel_context(device_num=8, global_rank=0, pipeline_stages=2, enable_parallel_optimizer=True)
  220. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  221. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  222. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  223. strategy1 = ((4, 1), (1, 1))
  224. strategy2 = ((2, 1), (1, 1))
  225. net = PipelineCell(PipelineSplitSharedParam(strategy1, strategy2), 4)
  226. params = net.trainable_params()
  227. dataset = DatasetLenet(data, label, 2)
  228. optimizer = nn.Lamb(params, learning_rate=0.06)
  229. model = Model(net, optimizer=optimizer)
  230. model.train(2, dataset, dataset_sink_mode=False)
  231. def test_pipeline_split_shared_parameter_stage1_opt_shard():
  232. """
  233. Feature:pipeline stage1 + opt detection + opt shard + shared parameter
  234. Description:pipeline opt detection
  235. Expectation:success
  236. """
  237. context.set_auto_parallel_context(device_num=8, global_rank=4, pipeline_stages=2, enable_parallel_optimizer=True)
  238. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  239. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  240. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  241. strategy1 = ((4, 1), (1, 1))
  242. strategy2 = ((2, 1), (1, 1))
  243. net = PipelineCell(PipelineSplitSharedParam(strategy1, strategy2), 4)
  244. params = net.trainable_params()
  245. dataset = DatasetLenet(data, label, 9)
  246. optimizer = nn.Lamb(params, learning_rate=0.06)
  247. model = Model(net, optimizer=optimizer)
  248. model.train(2, dataset, dataset_sink_mode=False)
  249. def test_pipeline_split_with_micro_batch_interleaved_stage0():
  250. """
  251. Feature: test PipelineSplit with MicroBatchInterleaved in auto parallel.
  252. Description: net with MicroBatchInterleaved in semi auto parallel.
  253. Expectation: success.
  254. """
  255. context.set_auto_parallel_context(device_num=8, global_rank=0, pipeline_stages=2)
  256. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  257. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  258. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  259. strategy1 = ((4, 1), (1, 1))
  260. strategy2 = ((2, 1), (1, 1))
  261. micro_batch_interleaved = 2
  262. net = PipelineCell(MicroBatchInterleaved(PipelineSplit(strategy1, strategy2), micro_batch_interleaved), 4)
  263. params = net.trainable_params()
  264. dataset = DatasetLenet(data, label, 3)
  265. optimizer = nn.Lamb(params, learning_rate=0.07)
  266. model = Model(net, optimizer=optimizer)
  267. model.train(2, dataset, dataset_sink_mode=False)
  268. for _, param in model._train_network.parameters_and_names():
  269. assert param.name != "cell.block.1.param"
  270. assert param.name != "cell.block.1.param1"
  271. def test_pipeline_split_with_micro_batch_interleaved_stage1():
  272. """
  273. Feature: test PipelineSplit with MicroBatchInterleaved in auto parallel.
  274. Description: net with MicroBatchInterleaved in semi auto parallel.
  275. Expectation: success.
  276. """
  277. context.set_auto_parallel_context(device_num=8, global_rank=4, pipeline_stages=2)
  278. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  279. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  280. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  281. strategy1 = ((4, 1), (1, 1))
  282. strategy2 = ((2, 1), (1, 1))
  283. micro_batch_interleaved = 2
  284. net = PipelineCell(MicroBatchInterleaved(PipelineSplit(strategy1, strategy2), micro_batch_interleaved), 4)
  285. params = net.trainable_params()
  286. dataset = DatasetLenet(data, label, 3)
  287. optimizer = nn.Lamb(params, learning_rate=0.08)
  288. model = Model(net, optimizer=optimizer)
  289. model.train(2, dataset, dataset_sink_mode=False)
  290. for _, param in model._train_network.parameters_and_names():
  291. assert param.name != "cell.block.0.param"
  292. assert param.name != "cell.block.0.param1"
  293. def test_pipeline_split_shared_parameter_with_micro_batch_interleaved_stage0_opt_shard():
  294. """
  295. Feature: test PipelineSplitSharedParameter with MicroBatchInterleaved in auto parallel.
  296. Description: net with MicroBatchInterleaved in semi auto parallel.
  297. Expectation: success.
  298. """
  299. context.set_auto_parallel_context(device_num=8, global_rank=0, pipeline_stages=2, enable_parallel_optimizer=True)
  300. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  301. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  302. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  303. strategy1 = ((4, 1), (1, 1))
  304. strategy2 = ((2, 1), (1, 1))
  305. micro_batch_interleaved = 2
  306. net = PipelineCell(MicroBatchInterleaved(PipelineSplitSharedParam(strategy1, strategy2),
  307. micro_batch_interleaved), 4)
  308. params = net.trainable_params()
  309. dataset = DatasetLenet(data, label, 5)
  310. optimizer = nn.Lamb(params, learning_rate=0.06)
  311. model = Model(net, optimizer=optimizer)
  312. model.train(2, dataset, dataset_sink_mode=False)
  313. def test_pipeline_split_shared_parameter_with_micro_batch_interleaved_stage1_opt_shard():
  314. """
  315. Feature: test PipelineSplitSharedParameter with MicroBatchInterleaved in auto parallel.
  316. Description: net with MicroBatchInterleaved in semi auto parallel.
  317. Expectation: success.
  318. """
  319. context.set_auto_parallel_context(device_num=8, global_rank=4, pipeline_stages=2, enable_parallel_optimizer=True)
  320. context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
  321. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  322. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  323. strategy1 = ((4, 1), (1, 1))
  324. strategy2 = ((2, 1), (1, 1))
  325. micro_batch_interleaved = 2
  326. net = PipelineCell(MicroBatchInterleaved(PipelineSplitSharedParam(strategy1, strategy2),
  327. micro_batch_interleaved), 4)
  328. params = net.trainable_params()
  329. dataset = DatasetLenet(data, label, 4)
  330. optimizer = nn.Lamb(params, learning_rate=0.02)
  331. model = Model(net, optimizer=optimizer)
  332. model.train(2, dataset, dataset_sink_mode=False)
  333. def run_pipeline_split_function(pipeline_net, micro_batch_interleaved=1):
  334. """
  335. Feature: test PipelineSplitSharedParameter with MicroBatchInterleaved in auto parallel.
  336. Description: net with MicroBatchInterleaved in semi auto parallel.
  337. Expectation: success.
  338. """
  339. data = Tensor(np.ones([32, 64]), dtype=ms.float32)
  340. label = Tensor(np.ones([64, 64]), dtype=ms.float32)
  341. net = PipelineCell(MicroBatchInterleaved(pipeline_net, micro_batch_interleaved), 4)
  342. params = net.trainable_params()
  343. dataset = DatasetLenet(data, label, 3)
  344. optimizer = nn.Lamb(params, learning_rate=0.01)
  345. model = Model(net, optimizer=optimizer)
  346. model.train(2, dataset, dataset_sink_mode=False)