normalization.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""normalization"""
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import initializer
from mindspore.ops.primitive import constexpr
import mindspore.context as context
from mindspore._checkparam import Validator, check_typename
from mindspore._extends import cell_attr_register
from mindspore.communication.management import get_group_size, get_rank
from mindspore.communication import management
from mindspore.ops import _selected_ops
from ..cell import Cell

__all__ = ['BatchNorm1d', 'BatchNorm2d', 'LayerNorm', 'GroupNorm', 'GlobalBatchNorm']

class _BatchNorm(Cell):
    """Batch Normalization base class."""

    @cell_attr_register
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None,
                 device_num_each_group=1,
                 input_dims='2d'):
        super(_BatchNorm, self).__init__()
        if num_features < 1:
            raise ValueError("num_features must be at least 1")
        if momentum < 0 or momentum > 1:
            raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))
        self.use_batch_statistics = use_batch_statistics
        self.num_features = num_features
        self.eps = eps
        self.input_dims = input_dims
        self.moving_mean = Parameter(initializer(
            moving_mean_init, num_features), name="mean", requires_grad=False)
        self.moving_variance = Parameter(initializer(
            moving_var_init, num_features), name="variance", requires_grad=False)
        self.gamma = Parameter(initializer(
            gamma_init, num_features), name="gamma", requires_grad=affine)
        self.beta = Parameter(initializer(
            beta_init, num_features), name="beta", requires_grad=affine)
        self.group = Validator.check_positive_int(device_num_each_group)
        self.is_global = False
        if self.group != 1:
            self.rank_id = get_rank()
            self.rank_size = get_group_size()
            self.device_list = [i for i in range(0, self.rank_size)]
            self.rank_list = self.list_group(self.device_list, self.group)
            self.rank_list_idx = len(self.rank_list)
            for i in range(self.rank_list_idx):
                if self.rank_id in self.rank_list[i] and self.group != 1:
                    self.is_global = True
                    management.create_group('group' + str(i), self.rank_list[i])
                    self.all_reduce = P.AllReduce(P.ReduceOp.SUM, 'group' + str(i)).add_prim_attr('fusion', 1)
        self.shape = P.Shape()
        self.reduce_mean = P.ReduceMean(keep_dims=True)
        self.square = P.Square()
        self.sqrt = P.Sqrt()
        self.cast = P.Cast()
        self.dtype = P.DType()
        self.reshape = P.Reshape()
        self.is_ascend = context.get_context("device_target") == "Ascend"
        self.is_gpu = context.get_context("device_target") == "GPU"
        self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
        self.momentum = 1.0 - momentum
        if context.get_context("enable_ge"):
            self.is_ge_backend = True
        else:
            self.is_ge_backend = False

        if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
            self.bn_train = P.BatchNorm(is_training=True,
                                        epsilon=self.eps)
        elif self.is_gpu:
            self.bn_train = P.FusedBatchNormEx(mode=1,
                                               epsilon=self.eps,
                                               momentum=self.momentum)
        else:
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
        self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps)
        self.enable_global_sync = self.is_global and (self.is_ge_backend or (self.is_graph_mode and self.is_ascend))
        self.enable_default_train = self.is_graph_mode and not self.is_global and \
                                    (self.is_ge_backend or self.is_ascend)

        data_parallel_strategy = ((1,), (1,))
        data_parallel_strategy_one = ((1,), ())
        self.sub_mean = P.Sub().shard(data_parallel_strategy)
        self.sub_var = P.Sub().shard(data_parallel_strategy)
        self.mul_mean = P.Mul().shard(data_parallel_strategy_one)
        self.mul_var = P.Mul().shard(data_parallel_strategy_one)
        self.assign_sub_mean = P.AssignSub().shard(data_parallel_strategy)
        self.assign_sub_var = P.AssignSub().shard(data_parallel_strategy)

    def _check_data_dim(self, x):
        raise NotImplementedError

    def list_group(self, world_rank, group_size):
        if group_size > get_group_size():
            raise ValueError("group_size can not be greater than local rank size; group_size is {}, "
                             "local_rank_size is {}".format(group_size, get_group_size()))
        if len(world_rank) % group_size != 0:
            raise ValueError("the length of world_rank must be divisible by group_size.")
        world_rank_list = zip(*(iter(world_rank),) * group_size)
        group_list = [list(i) for i in world_rank_list]
        return group_list
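
    # For example, with four ranks split into groups of two, the ``zip`` trick in
    # ``list_group`` chunks consecutive ranks (a minimal illustration, assuming
    # four devices in total):
    #     list_group([0, 1, 2, 3], 2) -> [[0, 1], [2, 3]]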

    def _global_sync(self, x, axes, re_shape):
        """calculate global batch normalization output"""
        x_mean = self.reduce_mean(x, axes)
        x_mean_square = self.reduce_mean(self.square(x), axes)
        global_batch_mean = self.all_reduce(x_mean) / self.group
        global_batch_mean_square = self.all_reduce(x_mean_square) / self.group
        global_mean = global_batch_mean
        global_var = global_batch_mean_square - self.square(global_mean)
        var_sqrt = self.sqrt(global_var + self.eps)
        mean_first = (x - global_mean) / var_sqrt
        y = mean_first * self.reshape(self.gamma, re_shape) + self.reshape(self.beta, re_shape)

        mean_sub = self.sub_mean(self.reshape(self.moving_mean, re_shape), global_mean)
        tmp_mean = self.mul_mean(mean_sub, self.cast(self.momentum, self.dtype(mean_sub)))
        # The variance update must subtract from moving_variance, not moving_mean.
        mean_sub2 = self.sub_var(self.reshape(self.moving_variance, re_shape), global_var)
        tmp_variance = self.mul_var(mean_sub2, self.cast(self.momentum, self.dtype(mean_sub2)))
        y = F.depend(y, self.assign_sub_mean(self.moving_mean, self.reshape(tmp_mean, self.shape(self.moving_mean))))
        y = F.depend(y, self.assign_sub_var(self.moving_variance,
                                            self.reshape(tmp_variance, self.shape(self.moving_variance))))
        return y

    def construct(self, x):
        _shape_check_bn(self.shape(x), self.input_dims)
        if self.use_batch_statistics is None:
            flag = self.training
        else:
            flag = self.use_batch_statistics

        if flag:
            if self.enable_global_sync:
                axes, re_shape = _shape_infer(F.shape(x), self.num_features)
                return self._global_sync(x, axes, re_shape)

            if self.enable_default_train:
                y, batch_mean, batch_var, _, _ = self.bn_train(x,
                                                               self.gamma,
                                                               self.beta,
                                                               None,
                                                               None)

                mean_sub = self.sub_mean(self.moving_mean, batch_mean)
                temp_mean = self.mul_mean(mean_sub, self.momentum)
                mean_sub2 = self.sub_var(self.moving_variance, batch_var)
                temp_variance = self.mul_var(mean_sub2, self.momentum)
                y = F.depend(y, self.assign_sub_mean(self.moving_mean, temp_mean))
                y = F.depend(y, self.assign_sub_var(self.moving_variance, temp_variance))
                return y

            return self.bn_train(x,
                                 self.gamma,
                                 self.beta,
                                 self.moving_mean,
                                 self.moving_variance)[0]

        return self.bn_infer(x,
                             self.gamma,
                             self.beta,
                             self.moving_mean,
                             self.moving_variance)[0]

    def extend_repr(self):
        return 'num_features={}, eps={}, momentum={}, gamma={}, beta={}, moving_mean={}, moving_variance={}'.format(
            self.num_features, self.eps, self.momentum, self.gamma, self.beta, self.moving_mean, self.moving_variance)
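

# A minimal NumPy sketch of the normalization and AssignSub-style moving-statistics
# update performed in _BatchNorm.construct (illustrative only; the helper name
# below is not part of this module's API). Note that ``self.momentum`` stores
# ``1.0 - momentum``, so the in-place update here is equivalent.
def _demo_batchnorm_numpy():
    import numpy as np
    x = np.random.randn(8, 4).astype(np.float32)      # a (N, C) mini-batch
    gamma, beta = np.ones(4), np.zeros(4)
    moving_mean, moving_var = np.zeros(4), np.ones(4)
    eps, momentum = 1e-5, 0.9
    batch_mean, batch_var = x.mean(axis=0), x.var(axis=0)
    y = (x - batch_mean) / np.sqrt(batch_var + eps) * gamma + beta
    # moving <- moving - (1 - momentum) * (moving - batch)
    moving_mean -= (1.0 - momentum) * (moving_mean - batch_mean)
    moving_var -= (1.0 - momentum) * (moving_var - batch_var)
    return y, moving_mean, moving_var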


@constexpr
def _channel_check(channel, num_channel):
    if channel != num_channel:
        raise ValueError("the input channel is not equal to num_channel")


@constexpr
def _shape_check(in_shape):
    if len(in_shape) != 4:
        raise ValueError("The input must have 4 dims.")


@constexpr
def _shape_check_bn(in_shape, in_dims):
    dim = len(in_shape)
    if in_dims == '1d' and dim != 2:
        raise ValueError("The input must have 2 dims.")
    if in_dims == '2d' and dim != 4:
        raise ValueError("The input must have 4 dims.")
    if in_dims == 'both' and dim != 2 and dim != 4:
        raise ValueError("The input must have 2 or 4 dims.")


@constexpr
def _shape_infer(x_shape, num_feature):
    """global batch normalization shape and axes infer"""
    if len(x_shape) == 4:
        axes = (0, 2, 3)
        re_shape = (1, num_feature, 1, 1)
    else:
        axes = (0,)
        re_shape = (1, num_feature)
    return axes, re_shape
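
# For example, a 4D (N, C, H, W) input with num_feature = C reduces over every
# axis except the channel axis:
#     _shape_infer((8, 3, 224, 224), 3) -> ((0, 2, 3), (1, 3, 1, 1))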


class BatchNorm1d(_BatchNorm):
    r"""
    Batch normalization layer over a 2D input.

    Batch Normalization is widely used in convolutional networks. This layer
    applies Batch Normalization over a 2D input (a mini-batch of 1D inputs) to
    reduce internal covariate shift as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_. It
    rescales and recenters the feature using a mini-batch of data and
    the learned parameters, which can be described by the following formula.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Note:
        The implementation of BatchNorm differs between graph mode and pynative mode, so changing
        the mode after the network has been initialized is not recommended.

    Args:
        num_features (int): `C` from an expected input of size (N, C).
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        momentum (float): A floating hyperparameter of the momentum for the
            running_mean and running_var computation. Default: 0.9.
        affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_mean_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving mean.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        use_batch_statistics (bool): If true, use the mean and variance of the current batch data. If false,
            use the tracked moving mean and variance. If None, the training process uses the mean
            and variance of the current batch data and tracks the running mean and variance, while the
            evaluation process uses the running mean and variance. Default: None.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in})`.

    Outputs:
        Tensor, the normalized, scaled, offset tensor, of shape :math:`(N, C_{out})`.

    Examples:
        >>> net = nn.BatchNorm1d(num_features=16)
        >>> input = Tensor(np.random.randint(0, 255, [3, 16]), mindspore.float32)
        >>> net(input)
    """

    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None):
        super(BatchNorm1d, self).__init__(num_features,
                                          eps,
                                          momentum,
                                          affine,
                                          gamma_init,
                                          beta_init,
                                          moving_mean_init,
                                          moving_var_init,
                                          use_batch_statistics,
                                          input_dims='1d')

    def _check_data_dim(self, x):
        if x.dim() != 2:
            pass


class BatchNorm2d(_BatchNorm):
    r"""
    Batch normalization layer over a 4D input.

    Batch Normalization is widely used in convolutional networks. This layer
    applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with
    an additional channel dimension) to reduce internal covariate shift as described
    in the paper `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_. It
    rescales and recenters the feature using a mini-batch of data and
    the learned parameters, which can be described by the following formula.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Note:
        The implementation of BatchNorm differs between graph mode and pynative mode, so the mode cannot be
        changed after the network has been initialized.

    Args:
        num_features (int): `C` from an expected input of size (N, C, H, W).
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        momentum (float): A floating hyperparameter of the momentum for the
            running_mean and running_var computation. Default: 0.9.
        affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_mean_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving mean.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        use_batch_statistics (bool): If true, use the mean and variance of the current batch data. If false,
            use the tracked moving mean and variance. If None, the training process uses the mean
            and variance of the current batch data and tracks the running mean and variance, while the
            evaluation process uses the running mean and variance. Default: None.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor, the normalized, scaled, offset tensor, of shape :math:`(N, C_{out}, H_{out}, W_{out})`.

    Examples:
        >>> net = nn.BatchNorm2d(num_features=3)
        >>> input = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32)
        >>> net(input)
    """

    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None):
        super(BatchNorm2d, self).__init__(num_features,
                                          eps,
                                          momentum,
                                          affine,
                                          gamma_init,
                                          beta_init,
                                          moving_mean_init,
                                          moving_var_init,
                                          use_batch_statistics,
                                          input_dims='2d')

    def _check_data_dim(self, x):
        if x.dim() != 4:
            pass


class GlobalBatchNorm(_BatchNorm):
    r"""
    Global normalization layer over an N-dimensional input.

    Global Normalization is cross-device synchronized batch normalization. Standard Batch Normalization
    only normalizes the data within each device, while Global Normalization normalizes the input within a group
    of devices. It has been described in the paper `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_. It rescales and recenters the
    feature using a mini-batch of data and the learned parameters, which can be described by the following formula.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Note:
        Currently, GlobalBatchNorm only supports 2D and 4D inputs.

    Args:
        num_features (int): `C` from an expected input of size (N, C, H, W).
        device_num_each_group (int): The number of devices in each group. Default: 2.
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        momentum (float): A floating hyperparameter of the momentum for the
            running_mean and running_var computation. Default: 0.9.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_mean_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving mean.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        use_batch_statistics (bool): If true, use the mean and variance of the current batch data. If false,
            use the tracked moving mean and variance. If None, the training process uses the mean and
            variance of the current batch data and tracks the running mean and variance, while the evaluation
            process uses the running mean and variance. Default: None.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor, the normalized, scaled, offset tensor, of shape :math:`(N, C_{out}, H_{out}, W_{out})`.

    Examples:
        >>> global_bn_op = nn.GlobalBatchNorm(num_features=3, device_num_each_group=4)
        >>> input = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]), mindspore.float32)
        >>> global_bn_op(input)
    """

    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None,
                 device_num_each_group=2):
        super(GlobalBatchNorm, self).__init__(num_features,
                                              eps,
                                              momentum,
                                              affine,
                                              gamma_init,
                                              beta_init,
                                              moving_mean_init,
                                              moving_var_init,
                                              use_batch_statistics,
                                              device_num_each_group,
                                              input_dims='both')
        self.group = Validator.check_positive_int(device_num_each_group)
        if self.group <= 1:
            raise ValueError("device_num_each_group must be greater than 1.")

    def _check_data_dim(self, x):
        if x.dim() == 0:
            pass
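

# A minimal NumPy sketch of the cross-device statistics aggregation performed in
# _BatchNorm._global_sync (illustrative only; the helper name below is not part
# of this module's API). AllReduce(SUM) followed by division by the group size
# averages the per-device mean and mean-of-squares, and the global variance is
# then recovered as E[x^2] - E[x]^2.
def _demo_global_bn_numpy():
    import numpy as np
    # Each entry stands in for one device's local mini-batch of shape (N, C).
    per_device = [np.random.randn(4, 3).astype(np.float32) for _ in range(2)]
    local_means = [d.mean(axis=0) for d in per_device]
    local_mean_squares = [(d ** 2).mean(axis=0) for d in per_device]
    global_mean = sum(local_means) / len(per_device)           # AllReduce(SUM) / group
    global_mean_square = sum(local_mean_squares) / len(per_device)
    global_var = global_mean_square - global_mean ** 2         # Var[x] = E[x^2] - E[x]^2
    return global_mean, global_var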


class LayerNorm(Cell):
    r"""
    Applies Layer Normalization over a mini-batch of inputs.

    Layer normalization is widely used in recurrent neural networks. It applies
    normalization on a mini-batch of inputs for each single training case as described
    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
    normalization, layer normalization performs exactly the same computation at training and
    testing time. It is applied across all channels and pixels, but within a single sample
    rather than across the batch. It can be described using the following formula.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axes
            `begin_norm_axis ... R - 1`.
        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
        begin_params_axis (int): The first parameter (beta, gamma) dimension: the scale and centering parameters
            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        epsilon (float): A value added to the denominator for numerical stability. Default: 1e-7.

    Inputs:
        - **input_x** (Tensor) - The shape of `input_x` is :math:`(x_1, x_2, ..., x_R)`,
          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Examples:
        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
        >>> shape1 = x.shape[1:]
        >>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
        >>> m(x).shape
        (20, 5, 10, 10)
    """

    def __init__(self,
                 normalized_shape,
                 begin_norm_axis=-1,
                 begin_params_axis=-1,
                 gamma_init='ones',
                 beta_init='zeros',
                 epsilon=1e-7
                 ):
        super(LayerNorm, self).__init__()
        if not isinstance(normalized_shape, (tuple, list)):
            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
                            .format(normalized_shape, type(normalized_shape)))
        self.normalized_shape = normalized_shape
        self.begin_norm_axis = begin_norm_axis
        self.begin_params_axis = begin_params_axis
        self.epsilon = epsilon
        self.gamma = Parameter(initializer(
            gamma_init, normalized_shape), name="gamma")
        self.beta = Parameter(initializer(
            beta_init, normalized_shape), name="beta")
        self.layer_norm = _selected_ops.LayerNorm(begin_norm_axis=self.begin_norm_axis,
                                                  begin_params_axis=self.begin_params_axis)

    def construct(self, input_x):
        y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

    def extend_repr(self):
        """Display instance object as string."""
        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
        return s
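

# A minimal NumPy sketch of LayerNorm with begin_norm_axis=1 (illustrative only;
# the helper name below is not part of this module's API). Statistics are
# computed per sample over all normalized axes, which is why training and
# inference behave identically.
def _demo_layernorm_numpy():
    import numpy as np
    x = np.random.randn(2, 5, 10).astype(np.float32)
    axes = tuple(range(1, x.ndim))                    # begin_norm_axis = 1
    mean = x.mean(axis=axes, keepdims=True)
    var = x.var(axis=axes, keepdims=True)
    gamma = np.ones(x.shape[1:])                      # params over normalized axes
    beta = np.zeros(x.shape[1:])
    return (x - mean) / np.sqrt(var + 1e-7) * gamma + beta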


class GroupNorm(Cell):
    r"""
    Group Normalization over a mini-batch of inputs.

    Group normalization is widely used in convolutional networks. It applies
    normalization on a mini-batch of inputs for each single training case as described
    in the paper `Group Normalization <https://arxiv.org/pdf/1803.08494.pdf>`_. Group normalization
    divides the channels into groups and computes within each group the mean and variance for normalization,
    and it performs very stably over a wide range of batch sizes. It can be described using the following formula.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        num_groups (int): The number of groups to be divided along the channel dimension.
        num_channels (int): The number of input channels. It must be divisible by `num_groups`.
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        affine (bool): A bool value, this layer will have learnable affine parameters when set to true. Default: True.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'. If gamma_init is a Tensor, the shape must be [num_channels].
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'. If beta_init is a Tensor, the shape must be [num_channels].

    Inputs:
        - **input_x** (Tensor) - The input feature with shape [N, C, H, W].

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Examples:
        >>> group_norm_op = nn.GroupNorm(2, 2)
        >>> x = Tensor(np.ones([1, 2, 4, 4], np.float32))
        >>> group_norm_op(x)
        [[[[0. 0. 0. 0.]
           [0. 0. 0. 0.]
           [0. 0. 0. 0.]
           [0. 0. 0. 0.]]
          [[0. 0. 0. 0.]
           [0. 0. 0. 0.]
           [0. 0. 0. 0.]
           [0. 0. 0. 0.]]]]
    """

    def __init__(self, num_groups, num_channels, eps=1e-05, affine=True, gamma_init='ones', beta_init='zeros'):
        super(GroupNorm, self).__init__()
        self.num_groups = Validator.check_positive_int(num_groups)
        self.num_channels = Validator.check_positive_int(num_channels)
        if num_channels % num_groups != 0:
            raise ValueError("num_channels must be divisible by num_groups")
        self.eps = check_typename('eps', eps, (float,))
        self.affine = Validator.check_bool(affine)

        gamma = initializer(gamma_init, num_channels)
        beta = initializer(beta_init, num_channels)
        if self.affine:
            self.gamma = Parameter(gamma, name='gamma')
            self.beta = Parameter(beta, name='beta')
        else:
            self.gamma = gamma
            self.beta = beta
        self.shape = F.shape
        self.reshape = F.reshape
        self.reduce_mean = P.ReduceMean(keep_dims=True)
        self.square = F.square
        self.reduce_sum = P.ReduceSum(keep_dims=True)
        self.sqrt = P.Sqrt()

    def _cal_output(self, x):
        """calculate groupnorm output"""
        batch, channel, height, width = self.shape(x)
        _channel_check(channel, self.num_channels)
        x = self.reshape(x, (batch, self.num_groups, -1))
        mean = self.reduce_mean(x, 2)
        var = self.reduce_sum(self.square(x - mean), 2) / (channel * height * width / self.num_groups)
        std = self.sqrt(var + self.eps)
        x = (x - mean) / std
        x = self.reshape(x, (batch, channel, height, width))
        output = x * self.reshape(self.gamma, (-1, 1, 1)) + self.reshape(self.beta, (-1, 1, 1))
        return output

    def construct(self, x):
        _shape_check(self.shape(x))
        output = self._cal_output(x)
        return output

    def extend_repr(self):
        """Display instance object as string."""
        s = 'num_groups={}, num_channels={}'.format(self.num_groups, self.num_channels)
        return s
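

# A minimal NumPy sketch mirroring GroupNorm._cal_output above (illustrative
# only; the helper name below is not part of this module's API): reshape to
# (N, G, -1), normalize within each group, then apply the per-channel affine
# parameters.
def _demo_groupnorm_numpy():
    import numpy as np
    n, c, h, w, groups, eps = 1, 4, 4, 4, 2, 1e-5
    x = np.random.randn(n, c, h, w).astype(np.float32)
    xg = x.reshape(n, groups, -1)
    mean = xg.mean(axis=2, keepdims=True)
    var = xg.var(axis=2, keepdims=True)
    xg = (xg - mean) / np.sqrt(var + eps)             # normalize within each group
    x = xg.reshape(n, c, h, w)
    gamma = np.ones((c, 1, 1))                        # per-channel scale
    beta = np.zeros((c, 1, 1))                        # per-channel shift
    return x * gamma + beta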