| @@ -30,6 +30,7 @@ __all__ = ['Parameter', 'ParameterTuple'] | |||
| PARAMETER_NAME_DEFAULT = "Parameter" | |||
| PARAMETER_NAME_PREFIX_MAX_LEN = 1024 | |||
| def _is_in_parallel_mode(): | |||
| """Get parallel mode.""" | |||
| return auto_parallel_context().get_parallel_mode() in ["semi_auto_parallel", "auto_parallel"] | |||
| @@ -51,10 +52,12 @@ class Parameter(MetaTensor_): | |||
| A Parameter has to belong to a Cell. | |||
| If there is an operator in the network that requires part of the inputs to be Parameter, | |||
| then the Parameters used as that part of the inputs are not allowed to be cast. | |||
| It is recommended to use the default value of `name` when initializing a parameter as an attribute of a cell, | |||
| otherwise, the parameter name may be different from what is expected. | |||
| Args: | |||
| default_input (Union[Tensor, MetaTensor, Number]): Parameter data, used to initialize the parameter. | |||
| name (str): Name of the child parameter. | |||
| name (str): Name of the child parameter. Default: None. | |||
| requires_grad (bool): True if the parameter requires gradient. Default: True. | |||
| layerwise_parallel (bool): A kind of model parallel mode. When layerwise_parallel is true in parallel mode, | |||
| broadcast and gradients communication would not be applied to parameters. Default: False. | |||
| @@ -72,7 +75,7 @@ class Parameter(MetaTensor_): | |||
| >>> def __init__(self): | |||
| >>> super(Net, self).__init__() | |||
| >>> self.matmul = P.MatMul() | |||
| >>> self.weight = Parameter(Tensor(np.ones((1,2))), name="w", requires_grad=True) | |||
| >>> self.weight = Parameter(Tensor(np.ones((1,2))), requires_grad=True) | |||
| >>> | |||
| >>> def construct(self, x): | |||
| >>> out = self.matmul(self.weight, x) | |||
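A minimal runnable sketch of the auto-naming behavior the updated example relies on, assuming a standard MindSpore install; the printed name is what the new Cell hook later in this diff is expected to assign:

import numpy as np
import mindspore.nn as nn
import mindspore.ops.operations as P
from mindspore import Parameter, Tensor

class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.matmul = P.MatMul()
        # No `name` given: the parameter is created with PARAMETER_NAME_DEFAULT
        # ("Parameter") and renamed to the attribute name on registration.
        self.weight = Parameter(Tensor(np.ones((1, 2))), requires_grad=True)

    def construct(self, x):
        return self.matmul(self.weight, x)

net = Net()
print(net.weight.name)  # expected: "weight"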
| @@ -88,7 +91,7 @@ class Parameter(MetaTensor_): | |||
| """ | |||
| __base_type__ = {} | |||
| def __new__(cls, default_input, name, *args, **kwargs): | |||
| def __new__(cls, default_input, *args, **kwargs): | |||
| input_class, *class_init_args = Parameter._get_parameter_new_args(default_input) | |||
| new_type = Parameter._get_base_class(input_class) | |||
| obj = input_class.__new__(new_type) | |||
| @@ -112,7 +115,7 @@ class Parameter(MetaTensor_): | |||
| return ( | |||
| Parameter, (data, self.name, self.requires_grad, self.layerwise_parallel)) | |||
| def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False): | |||
| def __init__(self, default_input, name=None, requires_grad=True, layerwise_parallel=False): | |||
| self._param_info = ParamInfo() | |||
| self.name = name | |||
| self.requires_grad = requires_grad | |||
| @@ -276,24 +279,20 @@ class Parameter(MetaTensor_): | |||
| """ | |||
| self._is_init = is_init_ | |||
| def clone(self, prefix, init='same'): | |||
| def clone(self, init='same'): | |||
| """ | |||
| Clone the parameter. | |||
| Args: | |||
| prefix (str): Namespace of parameter. The cloned Parameter name is | |||
| the combination of prefix and the current name: `f"{prefix}.{self.name}"`. | |||
| init (Union[Tensor, str, MetaTensor, numbers.Number]): Initialization of the cloned parameter's data; | |||
| the shape is kept. Default: 'same'. | |||
| Returns: | |||
| Parameter, a new parameter. | |||
| """ | |||
| Validator.check_str_by_regular(prefix) | |||
| x = copy(self) | |||
| # pylint: disable=protected-access | |||
| x._param_info = self._param_info.clone() | |||
| x._param_info.name = prefix + '.' + self._param_info.name | |||
| x.is_init = False | |||
| x.is_param_ps = self.is_param_ps | |||
| x.init_in_server = self.init_in_server | |||
| @@ -464,10 +463,25 @@ class ParameterTuple(tuple): | |||
| def __new__(cls, iterable): | |||
| """Create instance object of ParameterTuple.""" | |||
| data = tuple(iterable) | |||
| ids = set() | |||
| orders = {} | |||
| for x in data: | |||
| if not isinstance(x, Parameter): | |||
| raise TypeError(f"ParameterTuple input should be `Parameter` collection." | |||
| f"But got a {type(iterable)}, {iterable}") | |||
| if id(x) not in ids: | |||
| ids.add(id(x)) | |||
| if x.name not in orders: | |||
| orders[x.name] = [0, x] | |||
| else: | |||
| if isinstance(orders[x.name], list): | |||
| name = x.name | |||
| orders[name][1].name = name + "_" + str(0) | |||
| x.name = x.name + "_" + str(1) | |||
| orders[name] = 1 | |||
| else: | |||
| orders[x.name] += 1 | |||
| x.name = x.name + "_" + str(orders[x.name]) | |||
| return tuple.__new__(ParameterTuple, tuple(data)) | |||
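A small sketch of the de-duplication loop above, assuming two parameters are left with the default name; the `_0`/`_1` suffixes come from the renaming just shown:

import numpy as np
from mindspore import Parameter, ParameterTuple, Tensor

a = Parameter(Tensor(np.ones(1)))  # default name "Parameter"
b = Parameter(Tensor(np.ones(1)))  # same default name
t = ParameterTuple([a, b])
print(a.name, b.name)  # expected: "Parameter_0 Parameter_1"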
| def clone(self, prefix, init='same'): | |||
| @@ -484,7 +498,8 @@ class ParameterTuple(tuple): | |||
| Validator.check_str_by_regular(prefix) | |||
| new = [] | |||
| for x in self: | |||
| x1 = x.clone(prefix, init) | |||
| x1 = x.clone(init) | |||
| x1.name = prefix + "." + x1.name | |||
| new.append(x1) | |||
| return ParameterTuple(new) | |||
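Since `Parameter.clone` no longer namespaces the name itself, `ParameterTuple.clone` applies the prefix explicitly, so call sites such as `self.params.clone(prefix="moments", init='zeros')` elsewhere in this diff keep working unchanged. A hedged usage sketch:

import numpy as np
from mindspore import Parameter, ParameterTuple, Tensor

w = Parameter(Tensor(np.zeros(2)), name="w")
moments = ParameterTuple([w]).clone(prefix="moments", init='zeros')
print(moments[0].name)  # expected: "moments.w"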
| @@ -20,6 +20,7 @@ import os | |||
| from collections import OrderedDict | |||
| import numpy | |||
| from mindspore import log as logger | |||
| from mindspore.common.parameter import PARAMETER_NAME_DEFAULT | |||
| from .. import context | |||
| from ..common import dtype as mstype | |||
| from ..common.api import _executor, _pynative_exec | |||
| @@ -619,6 +620,8 @@ class Cell(Cell_): | |||
| raise KeyError("Duplicated parameter name '{}'.".format(param_name)) | |||
| if not isinstance(param, Parameter) and param is not None: | |||
| raise TypeError("The type of parameter should be 'Parameter' if not None.") | |||
| if isinstance(param, Parameter) and param.name == PARAMETER_NAME_DEFAULT: | |||
| param.name = param_name | |||
| self._params[param_name] = param | |||
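The two added lines are the hook that turns the attribute name into the parameter name. A sketch of the effect, assuming this hunk sits in `Cell.insert_param_to_cell` (the enclosing method name is not visible in the hunk, so treat it as an assumption):

import numpy as np
import mindspore.nn as nn
from mindspore import Parameter, Tensor

cell = nn.Cell()
p = Parameter(Tensor(np.ones((2, 2))))  # p.name == "Parameter" (the default)
cell.insert_param_to_cell("weight", p)  # assumed registration entry point
print(p.name)                           # expected: "weight"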
| def cast_param(self, param): | |||
| @@ -55,11 +55,11 @@ class DepthWiseConv(nn.Cell): | |||
| self.bias_add = P.BiasAdd() | |||
| weight_shape = [channel_multiplier, in_planes, kernel_size, kernel_size] | |||
| self.weight = Parameter(initializer(KaimingNormal(mode='fan_out'), weight_shape), name='weight') | |||
| self.weight = Parameter(initializer(KaimingNormal(mode='fan_out'), weight_shape)) | |||
| if has_bias: | |||
| bias_shape = [channel_multiplier * in_planes] | |||
| self.bias = Parameter(initializer('zeros', bias_shape), name='bias') | |||
| self.bias = Parameter(initializer('zeros', bias_shape)) | |||
| else: | |||
| self.bias = None | |||
| @@ -469,12 +469,12 @@ class DepthWiseConv(nn.Cell): | |||
| self.depthwise_conv = P.Conv2D(out_channel=in_planes * 1, kernel_size=kernel_size, | |||
| stride=stride, pad_mode="same", group=in_planes) | |||
| self.weight = Parameter(initializer( | |||
| weight_init, [in_planes * 1, 1, kernel_size, kernel_size]), name='depthwise_weight') | |||
| weight_init, [in_planes * 1, 1, kernel_size, kernel_size])) | |||
| else: | |||
| self.depthwise_conv = P.DepthwiseConv2dNative( | |||
| channel_multiplier=1, kernel_size=kernel_size, stride=stride, pad_mode='same',) | |||
| self.weight = Parameter(initializer( | |||
| weight_init, [1, in_planes, kernel_size, kernel_size]), name='depthwise_weight') | |||
| weight_init, [1, in_planes, kernel_size, kernel_size])) | |||
| def construct(self, x): | |||
| x = self.depthwise_conv(x, self.weight) | |||
| @@ -28,9 +28,8 @@ class DenseNoTranpose(nn.Cell): | |||
| def __init__(self, input_channels, output_channels, weight_init): | |||
| super(DenseNoTranpose, self).__init__() | |||
| self.weight = Parameter(initializer(weight_init, [input_channels, output_channels], mstype.float16), | |||
| name="weight") | |||
| self.bias = Parameter(initializer("zeros", [output_channels], mstype.float16).to_tensor(), name="bias") | |||
| self.weight = Parameter(initializer(weight_init, [input_channels, output_channels], mstype.float16)) | |||
| self.bias = Parameter(initializer("zeros", [output_channels], mstype.float16).to_tensor()) | |||
| self.matmul = P.MatMul(transpose_b=False) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -26,9 +26,8 @@ class DenseNoTranpose(nn.Cell): | |||
| """Dense method""" | |||
| def __init__(self, input_channels, output_channels, weight_init): | |||
| super(DenseNoTranpose, self).__init__() | |||
| self.weight = Parameter(initializer(weight_init, [input_channels, output_channels], mstype.float16), | |||
| name="weight") | |||
| self.bias = Parameter(initializer("zeros", [output_channels], mstype.float16).to_tensor(), name="bias") | |||
| self.weight = Parameter(initializer(weight_init, [input_channels, output_channels], mstype.float16)) | |||
| self.bias = Parameter(initializer("zeros", [output_channels], mstype.float16).to_tensor()) | |||
| self.matmul = P.MatMul(transpose_b=False) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -55,7 +55,7 @@ class THOR_GPU(Optimizer): | |||
| Validator.check_value_type("momentum", momentum, [float], self.cls_name) | |||
| if isinstance(momentum, float) and momentum < 0.0: | |||
| raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32)) | |||
| self.params = self.parameters | |||
| self.use_nesterov = Validator.check_bool(use_nesterov) | |||
| self.moments = self.params.clone(prefix="moments", init='zeros') | |||
| @@ -160,7 +160,7 @@ class THOR(Optimizer): | |||
| super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) | |||
| if isinstance(momentum, float) and momentum < 0.0: | |||
| raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32)) | |||
| self.params = self.parameters | |||
| self.moments = self.params.clone(prefix="moments", init='zeros') | |||
| self.hyper_map = C.HyperMap() | |||
| @@ -109,11 +109,10 @@ class _Conv(Cell): | |||
| 'attr \'group\' of \'Conv2D\' Op.') | |||
| self.weight = Parameter(initializer( | |||
| weight_init, [out_channels, in_channels // group, *kernel_size]), name='weight') | |||
| weight_init, [out_channels, in_channels // group, *kernel_size])) | |||
| if Validator.check_bool(has_bias): | |||
| self.bias = Parameter(_initializer( | |||
| bias_init, [out_channels]), name='bias') | |||
| self.bias = Parameter(initializer(bias_init, [out_channels])) | |||
| else: | |||
| if bias_init != 'zeros': | |||
| logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.") | |||
| @@ -174,12 +173,10 @@ class Conv2d_Thor_GPU(_Conv): | |||
| split_dim = 128 | |||
| matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.matrix_A_dim, self.matrix_G_dim, split_dim) | |||
| self.matrix_A_inv = Parameter(np.zeros(matrix_A_shape).astype(np.float32), | |||
| name='matrix_A_inv', requires_grad=False) | |||
| self.matrix_G_inv = Parameter(np.zeros(matrix_G_shape).astype(np.float32), | |||
| name='matrix_A_inv', requires_grad=False) | |||
| self.matrix_A_inv = Parameter(np.zeros(matrix_A_shape).astype(np.float32), requires_grad=False) | |||
| self.matrix_G_inv = Parameter(np.zeros(matrix_G_shape).astype(np.float32), requires_grad=False) | |||
| self.broadcast_to = P.BroadcastTo(matrix_A_shape) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same") | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.shape = P.Shape() | |||
| @@ -195,7 +192,7 @@ class Conv2d_Thor_GPU(_Conv): | |||
| self.axis = 0 | |||
| self.sqrt = P.Sqrt() | |||
| self.reduce_mean = P.ReduceMean(keep_dims=False) | |||
| self.damping = Parameter(Tensor(damping), name="damping_value", requires_grad=False) | |||
| self.damping = Parameter(Tensor(damping), requires_grad=False) | |||
| self.dampingA = Tensor(np.identity(self.matrix_A_dim), mstype.float32) | |||
| self.dampingG = Tensor(np.identity(self.matrix_G_dim), mstype.float32) | |||
| self.cholesky = P.CholeskyTrsm(split_dim=split_dim) | |||
| @@ -301,14 +298,14 @@ class Dense_Thor_GPU(Cell): | |||
| weight_init.shape[1] != in_channels: | |||
| raise ValueError("weight_init shape error") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels])) | |||
| if self.has_bias: | |||
| if isinstance(bias_init, Tensor): | |||
| if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: | |||
| raise ValueError("bias_init shape error") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels])) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -317,12 +314,10 @@ class Dense_Thor_GPU(Cell): | |||
| self.activation_flag = self.activation is not None | |||
| split_dim = 128 | |||
| matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.in_channels, self.out_channels, split_dim) | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros(matrix_A_shape).astype(np.float32)), | |||
| name='matrix_A_inv', requires_grad=False) | |||
| self.matrix_G_inv = Parameter(Tensor(np.zeros(matrix_G_shape).astype(np.float32)), | |||
| name="matrix_G_inv", requires_grad=False) | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros(matrix_A_shape).astype(np.float32)), requires_grad=False) | |||
| self.matrix_G_inv = Parameter(Tensor(np.zeros(matrix_G_shape).astype(np.float32)), requires_grad=False) | |||
| self.broadcast_to = P.BroadcastTo(matrix_A_shape) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| self.shape = P.Shape() | |||
| self.reshape = P.Reshape() | |||
| self.transpose = P.Transpose() | |||
| @@ -331,7 +326,7 @@ class Dense_Thor_GPU(Cell): | |||
| self.loss_scale = Tensor(1 / loss_scale, mstype.float16) | |||
| self.batch_size = Tensor(batch_size, mstype.float16) | |||
| self.getG = P.InsertGradientOf(self.save_gradient) | |||
| self.damping = Parameter(Tensor(damping), name="damping_value", requires_grad=False) | |||
| self.damping = Parameter(Tensor(damping), requires_grad=False) | |||
| self.dampingA = Tensor(np.identity(in_channels), mstype.float32) | |||
| self.dampingG = Tensor(np.identity(out_channels), mstype.float32) | |||
| self.cast = P.Cast() | |||
| @@ -467,20 +462,20 @@ class Conv2d_Thor(_Conv): | |||
| self.matrix_G_device_shape[3]) | |||
| self.matrix_A_inv = Parameter( | |||
| Tensor(np.reshape(np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)), | |||
| name='matrix_A_inv', requires_grad=False) | |||
| self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) | |||
| requires_grad=False) | |||
| self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) | |||
| self.matrix_G_inv = Parameter( | |||
| Tensor(np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)), | |||
| name="matrix_G_inv", requires_grad=False) | |||
| requires_grad=False) | |||
| self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) | |||
| self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) | |||
| self.fake_G = Tensor( | |||
| np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)) | |||
| self.shape = P.Shape() | |||
| self.reshape = P.Reshape() | |||
| self.transpose = P.Transpose() | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| self.mul = P.Mul() | |||
| self.cast = P.Cast() | |||
| self.damping = Tensor(damping) | |||
| @@ -648,14 +643,14 @@ class Dense_Thor(Cell): | |||
| weight_init.shape[1] != in_channels: | |||
| raise ValueError("weight_init shape error") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels])) | |||
| if self.has_bias: | |||
| if isinstance(bias_init, Tensor): | |||
| if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: | |||
| raise ValueError("bias_init shape error") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels])) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -663,10 +658,8 @@ class Dense_Thor(Cell): | |||
| self.activation = get_activation(activation) | |||
| self.activation_flag = self.activation is not None | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros([128, 128, 16, 16]).astype(np.float16)), name='matrix_A_inv', | |||
| requires_grad=False) | |||
| self.matrix_G_inv = Parameter(Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)), name="matrix_G_inv", | |||
| requires_grad=False) | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros([128, 128, 16, 16]).astype(np.float16)), requires_grad=False) | |||
| self.matrix_G_inv = Parameter(Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)), requires_grad=False) | |||
| self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| @@ -676,7 +669,7 @@ class Dense_Thor(Cell): | |||
| self.shape = P.Shape() | |||
| self.reshape = P.Reshape() | |||
| self.transpose = P.Transpose() | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| self.mul = P.Mul() | |||
| self.cast = P.Cast() | |||
| self.damping = Tensor(damping) | |||
| @@ -689,8 +682,8 @@ class Dense_Thor(Cell): | |||
| self.assignadd = P.AssignAdd() | |||
| self.freq = Tensor(frequency, mstype.int32) | |||
| self.axis = 0 | |||
| self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) | |||
| self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) | |||
| self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) | |||
| self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) | |||
| self.fused_abs_max1 = P.CusFusedAbsMax1([1001, 1001]) | |||
| self.fused_abs_max2 = P.CusFusedAbsMax1() | |||
| self.log = P.Log() | |||
| @@ -33,13 +33,12 @@ class CTCLoss(_Loss): | |||
| def __init__(self, max_sequence_length, max_label_length, batch_size): | |||
| super(CTCLoss, self).__init__() | |||
| self.sequence_length = Parameter(Tensor(np.array([max_sequence_length] * batch_size), mstype.int32), | |||
| name="sequence_length") | |||
| self.sequence_length = Parameter(Tensor(np.array([max_sequence_length] * batch_size), mstype.int32)) | |||
| labels_indices = [] | |||
| for i in range(batch_size): | |||
| for j in range(max_label_length): | |||
| labels_indices.append([i, j]) | |||
| self.labels_indices = Parameter(Tensor(np.array(labels_indices), mstype.int64), name="labels_indices") | |||
| self.labels_indices = Parameter(Tensor(np.array(labels_indices), mstype.int64)) | |||
| self.reshape = P.Reshape() | |||
| self.ctc_loss = P.CTCLoss(ctc_merge_repeated=True) | |||
| @@ -45,12 +45,10 @@ class StackedRNN(nn.Cell): | |||
| self.rnn1 = P.DynamicRNN(forget_bias=0.0) | |||
| self.rnn2 = P.DynamicRNN(forget_bias=0.0) | |||
| self.w1 = Parameter(np.random.uniform(-k, k, (input_size + hidden_size, 4 * hidden_size)).astype(np.float16), | |||
| name="w1") | |||
| self.w2 = Parameter(np.random.uniform(-k, k, (hidden_size + hidden_size, 4 * hidden_size)).astype(np.float16), | |||
| name="w2") | |||
| self.b1 = Parameter(np.random.uniform(-k, k, (4 * hidden_size)).astype(np.float16), name="b1") | |||
| self.b2 = Parameter(np.random.uniform(-k, k, (4 * hidden_size)).astype(np.float16), name="b2") | |||
| self.w1 = Parameter(np.random.uniform(-k, k, (input_size + hidden_size, 4 * hidden_size)).astype(np.float16)) | |||
| self.w2 = Parameter(np.random.uniform(-k, k, (hidden_size + hidden_size, 4 * hidden_size)).astype(np.float16)) | |||
| self.b1 = Parameter(np.random.uniform(-k, k, (4 * hidden_size)).astype(np.float16)) | |||
| self.b2 = Parameter(np.random.uniform(-k, k, (4 * hidden_size)).astype(np.float16)) | |||
| self.h1 = Tensor(np.zeros(shape=(1, batch_size, hidden_size)).astype(np.float16)) | |||
| self.h2 = Tensor(np.zeros(shape=(1, batch_size, hidden_size)).astype(np.float16)) | |||
| @@ -98,7 +96,7 @@ class StackedRNNForGPU(nn.Cell): | |||
| self.cast = P.Cast() | |||
| k = (1 / hidden_size) ** 0.5 | |||
| weight_shape = 4 * hidden_size * (input_size + 3 * hidden_size + 4) | |||
| self.weight = Parameter(np.random.uniform(-k, k, (weight_shape, 1, 1)).astype(np.float32), name='weight') | |||
| self.weight = Parameter(np.random.uniform(-k, k, (weight_shape, 1, 1)).astype(np.float32)) | |||
| self.h = Tensor(np.zeros(shape=(num_layer, batch_size, hidden_size)).astype(np.float32)) | |||
| self.c = Tensor(np.zeros(shape=(num_layer, batch_size, hidden_size)).astype(np.float32)) | |||
| @@ -39,7 +39,6 @@ class MeanConv(nn.Cell): | |||
| """ | |||
| def __init__(self, | |||
| name, | |||
| feature_in_dim, | |||
| feature_out_dim, | |||
| activation, | |||
| @@ -47,8 +46,7 @@ class MeanConv(nn.Cell): | |||
| super(MeanConv, self).__init__() | |||
| self.out_weight = Parameter( | |||
| initializer("XavierUniform", [feature_in_dim * 2, feature_out_dim], dtype=mstype.float32), | |||
| name=name + 'out_weight') | |||
| initializer("XavierUniform", [feature_in_dim * 2, feature_out_dim], dtype=mstype.float32)) | |||
| if activation == "tanh": | |||
| self.act = P.Tanh() | |||
| @@ -90,15 +88,13 @@ class AttenConv(nn.Cell): | |||
| """ | |||
| def __init__(self, | |||
| name, | |||
| feature_in_dim, | |||
| feature_out_dim, | |||
| dropout=0.2): | |||
| super(AttenConv, self).__init__() | |||
| self.out_weight = Parameter( | |||
| initializer("XavierUniform", [feature_in_dim * 2, feature_out_dim], dtype=mstype.float32), | |||
| name=name + 'out_weight') | |||
| initializer("XavierUniform", [feature_in_dim * 2, feature_out_dim], dtype=mstype.float32)) | |||
| self.cast = P.Cast() | |||
| self.squeeze = P.Squeeze(1) | |||
| self.concat = P.Concat(axis=1) | |||
| @@ -147,10 +143,8 @@ class BGCF(nn.Cell): | |||
| input_dim): | |||
| super(BGCF, self).__init__() | |||
| self.user_embeddings = Parameter(initializer("XavierUniform", [num_user, input_dim], dtype=mstype.float32), | |||
| name='user_embed') | |||
| self.item_embeddings = Parameter(initializer("XavierUniform", [num_item, input_dim], dtype=mstype.float32), | |||
| name='item_embed') | |||
| self.user_embed = Parameter(initializer("XavierUniform", [num_user, input_dim], dtype=mstype.float32)) | |||
| self.item_embed = Parameter(initializer("XavierUniform", [num_item, input_dim], dtype=mstype.float32)) | |||
| self.cast = P.Cast() | |||
| self.tanh = P.Tanh() | |||
| self.shape = P.Shape() | |||
| @@ -163,30 +157,27 @@ class BGCF(nn.Cell): | |||
| (self.input_dim, self.num_user, self.num_item) = dataset_argv | |||
| self.layer_dim = architect_argv | |||
| self.gnew_agg_mean = MeanConv('gnew_agg_mean', self.input_dim, self.layer_dim, | |||
| self.gnew_agg_mean = MeanConv(self.input_dim, self.layer_dim, | |||
| activation=activation, dropout=neigh_drop_rate[1]) | |||
| self.gnew_agg_mean.to_float(mstype.float16) | |||
| self.gnew_agg_user = AttenConv('gnew_agg_att_user', self.input_dim, | |||
| self.layer_dim, dropout=neigh_drop_rate[2]) | |||
| self.gnew_agg_user = AttenConv(self.input_dim, self.layer_dim, dropout=neigh_drop_rate[2]) | |||
| self.gnew_agg_user.to_float(mstype.float16) | |||
| self.gnew_agg_item = AttenConv('gnew_agg_att_item', self.input_dim, | |||
| self.layer_dim, dropout=neigh_drop_rate[2]) | |||
| self.gnew_agg_item = AttenConv(self.input_dim, self.layer_dim, dropout=neigh_drop_rate[2]) | |||
| self.gnew_agg_item.to_float(mstype.float16) | |||
| self.user_feature_dim = self.input_dim | |||
| self.item_feature_dim = self.input_dim | |||
| self.final_weight = Parameter( | |||
| initializer("XavierUniform", [self.input_dim * 3, self.input_dim * 3], dtype=mstype.float32), | |||
| name='final_weight') | |||
| initializer("XavierUniform", [self.input_dim * 3, self.input_dim * 3], dtype=mstype.float32)) | |||
| self.raw_agg_funcs_user = MeanConv('raw_agg_user', self.input_dim, self.layer_dim, | |||
| self.raw_agg_funcs_user = MeanConv(self.input_dim, self.layer_dim, | |||
| activation=activation, dropout=neigh_drop_rate[0]) | |||
| self.raw_agg_funcs_user.to_float(mstype.float16) | |||
| self.raw_agg_funcs_item = MeanConv('raw_agg_item', self.input_dim, self.layer_dim, | |||
| self.raw_agg_funcs_item = MeanConv(self.input_dim, self.layer_dim, | |||
| activation=activation, dropout=neigh_drop_rate[0]) | |||
| self.raw_agg_funcs_item.to_float(mstype.float16) | |||
| @@ -207,14 +198,14 @@ class BGCF(nn.Cell): | |||
| neg_gnew_neighs, | |||
| neg_item_num): | |||
| """Aggregate user and item embeddings""" | |||
| all_user_embed = self.gather(self.user_embeddings, self.concat_0((u_id, pos_users)), 0) | |||
| all_user_embed = self.gather(self.user_embed, self.concat_0((u_id, pos_users)), 0) | |||
| u_self_matrix_at_layers = self.gather(self.user_embeddings, u_group_nodes, 0) | |||
| u_neigh_matrix_at_layers = self.gather(self.item_embeddings, u_neighs, 0) | |||
| u_self_matrix_at_layers = self.gather(self.user_embed, u_group_nodes, 0) | |||
| u_neigh_matrix_at_layers = self.gather(self.item_embed, u_neighs, 0) | |||
| u_output_mean = self.raw_agg_funcs_user(u_self_matrix_at_layers, u_neigh_matrix_at_layers) | |||
| u_gnew_neighs_matrix = self.gather(self.item_embeddings, u_gnew_neighs, 0) | |||
| u_gnew_neighs_matrix = self.gather(self.item_embed, u_gnew_neighs, 0) | |||
| u_output_from_gnew_mean = self.gnew_agg_mean(u_self_matrix_at_layers, u_gnew_neighs_matrix) | |||
| u_output_from_gnew_att = self.gnew_agg_user(u_self_matrix_at_layers, | |||
| @@ -223,14 +214,14 @@ class BGCF(nn.Cell): | |||
| u_output = self.concat_1((u_output_mean, u_output_from_gnew_mean, u_output_from_gnew_att)) | |||
| all_user_rep = self.tanh(u_output) | |||
| all_pos_item_embed = self.gather(self.item_embeddings, self.concat_0((pos_item_id, pos_items)), 0) | |||
| all_pos_item_embed = self.gather(self.item_embed, self.concat_0((pos_item_id, pos_items)), 0) | |||
| i_self_matrix_at_layers = self.gather(self.item_embeddings, i_group_nodes, 0) | |||
| i_neigh_matrix_at_layers = self.gather(self.user_embeddings, i_neighs, 0) | |||
| i_self_matrix_at_layers = self.gather(self.item_embed, i_group_nodes, 0) | |||
| i_neigh_matrix_at_layers = self.gather(self.user_embed, i_neighs, 0) | |||
| i_output_mean = self.raw_agg_funcs_item(i_self_matrix_at_layers, i_neigh_matrix_at_layers) | |||
| i_gnew_neighs_matrix = self.gather(self.user_embeddings, i_gnew_neighs, 0) | |||
| i_gnew_neighs_matrix = self.gather(self.user_embed, i_gnew_neighs, 0) | |||
| i_output_from_gnew_mean = self.gnew_agg_mean(i_self_matrix_at_layers, i_gnew_neighs_matrix) | |||
| i_output_from_gnew_att = self.gnew_agg_item(i_self_matrix_at_layers, | |||
| @@ -239,14 +230,14 @@ class BGCF(nn.Cell): | |||
| i_output = self.concat_1((i_output_mean, i_output_from_gnew_mean, i_output_from_gnew_att)) | |||
| all_pos_item_rep = self.tanh(i_output) | |||
| neg_item_embed = self.gather(self.item_embeddings, neg_item_id, 0) | |||
| neg_item_embed = self.gather(self.item_embed, neg_item_id, 0) | |||
| neg_self_matrix_at_layers = self.gather(self.item_embeddings, neg_group_nodes, 0) | |||
| neg_neigh_matrix_at_layers = self.gather(self.user_embeddings, neg_neighs, 0) | |||
| neg_self_matrix_at_layers = self.gather(self.item_embed, neg_group_nodes, 0) | |||
| neg_neigh_matrix_at_layers = self.gather(self.user_embed, neg_neighs, 0) | |||
| neg_output_mean = self.raw_agg_funcs_item(neg_self_matrix_at_layers, neg_neigh_matrix_at_layers) | |||
| neg_gnew_neighs_matrix = self.gather(self.user_embeddings, neg_gnew_neighs, 0) | |||
| neg_gnew_neighs_matrix = self.gather(self.user_embed, neg_gnew_neighs, 0) | |||
| neg_output_from_gnew_mean = self.gnew_agg_mean(neg_self_matrix_at_layers, neg_gnew_neighs_matrix) | |||
| neg_output_from_gnew_att = self.gnew_agg_item(neg_self_matrix_at_layers, | |||
| @@ -80,14 +80,14 @@ class GNNFeatureTransform(nn.Cell): | |||
| weight_init.shape[1] != in_channels: | |||
| raise ValueError("weight_init shape error") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels])) | |||
| if self.has_bias: | |||
| if isinstance(bias_init, Tensor): | |||
| if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: | |||
| raise ValueError("bias_init shape error") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels])) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -280,7 +280,7 @@ class AttentionHead(nn.Cell): | |||
| self.coef_drop = nn.Dropout(keep_prob=1 - coef_drop_ratio) | |||
| self.matmul = P.MatMul() | |||
| self.bias_add = P.BiasAdd() | |||
| self.bias = Parameter(initializer('zeros', self.out_channel), name='bias') | |||
| self.bias = Parameter(initializer('zeros', self.out_channel)) | |||
| self.residual = residual | |||
| if self.residual: | |||
| if in_channel != out_channel: | |||
| @@ -80,8 +80,8 @@ class BertPretrainEva(nn.Cell): | |||
| self.equal = P.Equal() | |||
| self.mean = P.ReduceMean() | |||
| self.sum = P.ReduceSum() | |||
| self.total = Parameter(Tensor([0], mstype.float32), name='total') | |||
| self.acc = Parameter(Tensor([0], mstype.float32), name='acc') | |||
| self.total = Parameter(Tensor([0], mstype.float32)) | |||
| self.acc = Parameter(Tensor([0], mstype.float32)) | |||
| self.reshape = P.Reshape() | |||
| self.shape = P.Shape() | |||
| self.cast = P.Cast() | |||
| @@ -52,7 +52,7 @@ class CRF(nn.Cell): | |||
| transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32) | |||
| transitions[tag_to_index[self.START_TAG], :] = -10000 | |||
| transitions[:, tag_to_index[self.STOP_TAG]] = -10000 | |||
| self.transitions = Parameter(Tensor(transitions), name="transition_matrix") | |||
| self.transitions = Parameter(Tensor(transitions)) | |||
| self.cat = P.Concat(axis=-1) | |||
| self.argmax = P.ArgMaxWithValue(axis=-1) | |||
| self.log = P.Log() | |||
| @@ -90,8 +90,7 @@ class BertFinetuneCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| def construct(self, | |||
| input_ids, | |||
| @@ -185,8 +184,8 @@ class BertSquadCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| def construct(self, | |||
| input_ids, | |||
| input_mask, | |||
| @@ -306,9 +305,9 @@ class BertSquad(nn.Cell): | |||
| self.num_labels = num_labels | |||
| self.seq_length = config.seq_length | |||
| self.is_training = is_training | |||
| self.total_num = Parameter(Tensor([0], mstype.float32), name='total_num') | |||
| self.start_num = Parameter(Tensor([0], mstype.float32), name='start_num') | |||
| self.end_num = Parameter(Tensor([0], mstype.float32), name='end_num') | |||
| self.total_num = Parameter(Tensor([0], mstype.float32)) | |||
| self.start_num = Parameter(Tensor([0], mstype.float32)) | |||
| self.end_num = Parameter(Tensor([0], mstype.float32)) | |||
| self.sum = P.ReduceSum() | |||
| self.equal = P.Equal() | |||
| self.argmax = P.ArgMaxWithValue(axis=1) | |||
| @@ -84,8 +84,7 @@ class GetMaskedLMOutput(nn.Cell): | |||
| self.output_bias = Parameter( | |||
| initializer( | |||
| 'zero', | |||
| config.vocab_size), | |||
| name='output_bias') | |||
| config.vocab_size)) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.log_softmax = nn.LogSoftmax(axis=-1) | |||
| self.shape_flat_offsets = (-1, 1) | |||
| @@ -359,8 +358,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -465,10 +463,10 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell): | |||
| self.enable_global_norm = enable_global_norm | |||
| self.one = Tensor(np.array([1]).astype(np.int32)) | |||
| self.zero = Tensor(np.array([0]).astype(np.int32)) | |||
| self.local_step = Parameter(initializer(0, [1], mstype.int32), name="local_step") | |||
| self.local_step = Parameter(initializer(0, [1], mstype.int32)) | |||
| self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros') | |||
| self.accu_overflow = Parameter(initializer(0, [1], mstype.int32), name="accu_overflow") | |||
| self.loss = Parameter(initializer(0, [1], mstype.float32), name="accu_loss") | |||
| self.accu_overflow = Parameter(initializer(0, [1], mstype.int32)) | |||
| self.accu_loss = Parameter(initializer(0, [1], mstype.float32)) | |||
| self.grad = C.GradOperation(get_by_list=True, sens_param=True) | |||
| self.reducer_flag = False | |||
| @@ -499,8 +497,7 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -529,8 +526,8 @@ class BertTrainAccumulateStepsWithLossScaleCell(nn.Cell): | |||
| # update accumulation parameters | |||
| is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) | |||
| self.local_step = self.select(is_accu_step, self.local_step + self.one, self.one) | |||
| self.loss = self.select(is_accu_step, self.loss + loss, loss) | |||
| mean_loss = self.loss / self.local_step | |||
| self.accu_loss = self.select(is_accu_step, self.accu_loss + loss, loss) | |||
| mean_loss = self.accu_loss / self.local_step | |||
| is_accu_step = self.not_equal(self.local_step, self.accumulation_steps) | |||
| # alloc status and clear should be right before gradoperation | |||
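The attribute rename from `self.loss` to `self.accu_loss` keeps the auto-assigned parameter name equal to the old explicit name "accu_loss", so checkpoint keys stay compatible. The bookkeeping itself is unchanged; a plain-Python sketch with illustrative losses:

accumulation_steps = 2
local_step, accu_loss = 0, 0.0
for loss in [4.0, 2.0, 6.0]:                    # illustrative per-step losses
    is_accu_step = local_step != accumulation_steps
    local_step = local_step + 1 if is_accu_step else 1
    accu_loss = accu_loss + loss if is_accu_step else loss
    mean_loss = accu_loss / local_step          # running mean inside the window
    print(local_step, mean_loss)                # 1 4.0, then 2 3.0, then 1 6.0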
| @@ -110,8 +110,7 @@ class EmbeddingLookup(nn.Cell): | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| self.embedding_table = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [vocab_size, embedding_size]), | |||
| name='embedding_table') | |||
| [vocab_size, embedding_size])) | |||
| self.expand = P.ExpandDims() | |||
| self.shape_flat = (-1,) | |||
| self.gather = P.GatherV2() | |||
| @@ -170,8 +169,7 @@ class EmbeddingPostprocessor(nn.Cell): | |||
| self.embedding_table = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [token_type_vocab_size, | |||
| embedding_size]), | |||
| name='embedding_table') | |||
| embedding_size])) | |||
| self.shape_flat = (-1,) | |||
| self.one_hot = P.OneHot() | |||
| @@ -188,8 +186,7 @@ class EmbeddingPostprocessor(nn.Cell): | |||
| self.full_position_embeddings = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [max_position_embeddings, | |||
| embedding_size]), | |||
| name='full_position_embeddings') | |||
| embedding_size])) | |||
| def construct(self, token_type_ids, word_embeddings): | |||
| """Postprocessors apply positional and token type embeddings to word embeddings.""" | |||
| @@ -314,8 +311,7 @@ class RelaPosEmbeddingsGenerator(nn.Cell): | |||
| self.embeddings_table = Parameter( | |||
| initializer(TruncatedNormal(initializer_range), | |||
| [self.vocab_size, self.depth]), | |||
| name='embeddings_for_position') | |||
| [self.vocab_size, self.depth])) | |||
| self.relative_positions_matrix = RelaPosMatrixGenerator(length=length, | |||
| max_relative_position=max_relative_position) | |||
| @@ -86,8 +86,8 @@ class BertPretrainEva(nn.Cell): | |||
| self.equal = P.Equal() | |||
| self.mean = P.ReduceMean() | |||
| self.sum = P.ReduceSum() | |||
| self.total = Parameter(Tensor([0], mstype.float32), name='total') | |||
| self.acc = Parameter(Tensor([0], mstype.float32), name='acc') | |||
| self.total = Parameter(Tensor([0], mstype.float32)) | |||
| self.acc = Parameter(Tensor([0], mstype.float32)) | |||
| self.reshape = P.Reshape() | |||
| self.shape = P.Shape() | |||
| self.cast = P.Cast() | |||
| @@ -98,8 +98,7 @@ class GetMaskedLMOutput(nn.Cell): | |||
| self.output_bias = Parameter( | |||
| initializer( | |||
| 'zero', | |||
| config.vocab_size), | |||
| name='output_bias') | |||
| config.vocab_size)) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.log_softmax = nn.LogSoftmax(axis=-1) | |||
| self.shape_flat_offsets = (-1, 1) | |||
| @@ -379,8 +378,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -136,8 +136,7 @@ class EmbeddingLookup(nn.Cell): | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| self.embedding_table = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [vocab_size, embedding_size]), | |||
| name='embedding_table') | |||
| [vocab_size, embedding_size])) | |||
| self.expand = P.ExpandDims() | |||
| self.shape_flat = (-1,) | |||
| self.gather = P.GatherV2() | |||
| @@ -200,7 +199,6 @@ class EmbeddingPostprocessor(nn.Cell): | |||
| embedding_shape=embedding_shape, | |||
| use_one_hot_embeddings=use_one_hot_embeddings, | |||
| initializer_range=initializer_range, | |||
| name='embedding_table', | |||
| batch_size=batch_size, | |||
| damping=damping, | |||
| loss_scale=loss_scale, | |||
| @@ -224,7 +222,6 @@ class EmbeddingPostprocessor(nn.Cell): | |||
| embedding_shape=position_embedding_shape, | |||
| use_one_hot_embeddings=use_one_hot_embeddings, | |||
| initializer_range=initializer_range, | |||
| name='full_position_embeddings', | |||
| batch_size=batch_size, | |||
| damping=damping, | |||
| loss_scale=loss_scale, | |||
| @@ -363,8 +360,7 @@ class RelaPosEmbeddingsGenerator(nn.Cell): | |||
| self.embeddings_table = Parameter( | |||
| initializer(TruncatedNormal(initializer_range), | |||
| [self.vocab_size, self.depth]), | |||
| name='embeddings_for_position') | |||
| [self.vocab_size, self.depth])) | |||
| self.relative_positions_matrix = RelaPosMatrixGenerator(length=length, | |||
| max_relative_position=max_relative_position) | |||
| @@ -944,7 +940,6 @@ class BertModel(nn.Cell): | |||
| embedding_shape=output_embedding_shape, | |||
| use_one_hot_embeddings=use_one_hot_embeddings, | |||
| initializer_range=config.initializer_range, | |||
| name='embedding_table', | |||
| batch_size=batch_size, | |||
| damping=damping, | |||
| loss_scale=loss_scale, | |||
| @@ -94,9 +94,9 @@ class FusedLayerNorm(Cell): | |||
| self.begin_norm_axis = begin_norm_axis | |||
| self.begin_params_axis = begin_params_axis | |||
| self.gamma = Parameter(initializer( | |||
| gamma_init, normalized_shape), name="gamma") | |||
| gamma_init, normalized_shape)) | |||
| self.beta = Parameter(initializer( | |||
| beta_init, normalized_shape), name="beta") | |||
| beta_init, normalized_shape)) | |||
| self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis) | |||
| self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5) | |||
| @@ -52,7 +52,7 @@ class THOR(Optimizer): | |||
| super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) | |||
| if isinstance(momentum, float) and momentum < 0.0: | |||
| raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32)) | |||
| self.params = self.parameters | |||
| self.moments = self.params.clone(prefix="moments", init='zeros') | |||
| self.hyper_map = C.HyperMap() | |||
| @@ -80,7 +80,7 @@ class THOR(Optimizer): | |||
| self.batch_size = batch_size | |||
| self.damping = damping | |||
| self.one = Tensor(1, mstype.int32) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| def construct(self, gradients): | |||
| """construct of THOR""" | |||
| @@ -54,7 +54,7 @@ class THOR(Optimizer): | |||
| super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) | |||
| if isinstance(momentum, float) and momentum < 0.0: | |||
| raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum)) | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") | |||
| self.momentum = Parameter(Tensor(momentum, mstype.float32)) | |||
| self.params = self.parameters | |||
| self.moments = self.params.clone(prefix="moments", init='zeros') | |||
| self.hyper_map = C.HyperMap() | |||
| @@ -82,7 +82,7 @@ class THOR(Optimizer): | |||
| self.batch_size = batch_size | |||
| self.damping = damping | |||
| self.one = Tensor(1, mstype.int32) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| mean = _get_gradients_mean() | |||
| degree = _get_device_num() | |||
| self.grad_reducer_g = DistributedGradReducerThor(self.parameters, 3, mean, degree) | |||
| @@ -41,7 +41,6 @@ class Embedding_Thor(Cell): | |||
| embedding_shape, | |||
| use_one_hot_embeddings=False, | |||
| initializer_range=0.02, | |||
| name='embedding_table', | |||
| batch_size=12, | |||
| damping=0.03, | |||
| loss_scale=1, | |||
| @@ -52,8 +51,7 @@ class Embedding_Thor(Cell): | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| self.embedding_table = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [vocab_size, embedding_size]), | |||
| name=name) | |||
| [vocab_size, embedding_size])) | |||
| self.thor = True | |||
| self.expand = P.ExpandDims() | |||
| self.shape_flat = (-1,) | |||
| @@ -67,14 +65,13 @@ class Embedding_Thor(Cell): | |||
| self.shape = P.Shape() | |||
| self.loss_scale = Tensor(1 / loss_scale, mstype.float16) | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float16)), | |||
| name='matrix_A_inv', requires_grad=False) | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float16)), requires_grad=False) | |||
| self.matrix_G_inv = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)), | |||
| name="matrix_G_inv", requires_grad=False) | |||
| requires_grad=False) | |||
| self.fake_G = Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)) | |||
| self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32)) | |||
| self.dampingG = Tensor(np.identity(embedding_size), mstype.float32) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| self.freq = Tensor(frequency, mstype.int32) | |||
| self.axis = 0 | |||
| self.damping = damping | |||
| @@ -169,14 +166,14 @@ class Dense_Thor(Cell): | |||
| weight_init.shape()[1] != in_channels: | |||
| raise ValueError("weight_init shape error") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels])) | |||
| if self.has_bias: | |||
| if isinstance(bias_init, Tensor): | |||
| if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: | |||
| raise ValueError("bias_init shape error") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels])) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -184,9 +181,9 @@ class Dense_Thor(Cell): | |||
| self.activation = get_activation(activation) | |||
| self.activation_flag = self.activation is not None | |||
| self.matrix_A_inv = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float16)), | |||
| name='matrix_A_inv', requires_grad=False) | |||
| requires_grad=False) | |||
| self.matrix_G_inv = Parameter(Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)), | |||
| name="matrix_G_inv", requires_grad=False) | |||
| requires_grad=False) | |||
| self.fake_G = Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| @@ -196,7 +193,7 @@ class Dense_Thor(Cell): | |||
| self.shape = P.Shape() | |||
| self.reshape = P.Reshape() | |||
| self.transpose = P.Transpose() | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) | |||
| self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) | |||
| self.mul = P.Mul() | |||
| self.cast = P.Cast() | |||
| self.damping = damping | |||
| @@ -57,11 +57,10 @@ class BahdanauAttention(nn.Cell): | |||
| self.normalize = normalize | |||
| self.num_units = num_units | |||
| self.linear_att = Parameter(Tensor(np.random.uniform(-initializer_range, initializer_range, size=[num_units]), | |||
| dtype=mstype.float32), name='linear_att') | |||
| dtype=mstype.float32)) | |||
| if self.normalize: | |||
| self.normalize_scalar = Parameter(Tensor(np.array([1.0 / num_units]), dtype=mstype.float32), | |||
| name='normalize_scalar') | |||
| self.normalize_bias = Parameter(Tensor(np.zeros(num_units), dtype=mstype.float32), name='normalize_bias') | |||
| self.normalize_scalar = Parameter(Tensor(np.array([1.0 / num_units]), dtype=mstype.float32)) | |||
| self.normalize_bias = Parameter(Tensor(np.zeros(num_units), dtype=mstype.float32)) | |||
| self.transpose = P.Transpose() | |||
| self.transpose_orders = (1, 0, 2) | |||
| self.shape_op = P.Shape() | |||
| @@ -49,10 +49,10 @@ class DynamicRNNCell(nn.Cell): | |||
| # w | |||
| dynamicRNN_w = np.random.uniform(-initializer_range, initializer_range, | |||
| size=[self.input_size + self.hidden_size, 4 * self.hidden_size]) | |||
| self.dynamicRNN_w = Parameter(Tensor(dynamicRNN_w, mstype.float32), name='weight') | |||
| self.dynamicRNN_w = Parameter(Tensor(dynamicRNN_w, mstype.float32)) | |||
| # b | |||
| dynamicRNN_b = np.random.uniform(-initializer_range, initializer_range, size=[4 * self.hidden_size]) | |||
| self.dynamicRNN_b = Parameter(Tensor(dynamicRNN_b, mstype.float32), name='bias') | |||
| self.dynamicRNN_b = Parameter(Tensor(dynamicRNN_b, mstype.float32)) | |||
| self.dynamicRNN_h = Tensor(np.zeros((1, self.batch_size, self.hidden_size)), mstype.float32) | |||
| self.dynamicRNN_c = Tensor(np.zeros((1, self.batch_size, self.hidden_size)), mstype.float32) | |||
| @@ -48,8 +48,7 @@ class EmbeddingLookup(nn.Cell): | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| init_weight = np.random.normal(-initializer_range, initializer_range, size=[vocab_size, embed_dim]) | |||
| self.embedding_table = Parameter(Tensor(init_weight, mstype.float32), | |||
| name='embedding_table') | |||
| self.embedding_table = Parameter(Tensor(init_weight, mstype.float32)) | |||
| self.expand = P.ExpandDims() | |||
| self.gather = P.GatherV2() | |||
| self.one_hot = P.OneHot() | |||
| @@ -253,8 +253,7 @@ class GNMTTrainOneStepWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| self.add_flags(has_effect=True) | |||
| self.loss_scalar = P.ScalarSummary() | |||
| @@ -217,8 +217,8 @@ class Adam(Optimizer): | |||
| self.beta1 = Tensor(beta1, mstype.float32) | |||
| self.beta2 = Tensor(beta2, mstype.float32) | |||
| self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power") | |||
| self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power") | |||
| self.beta1_power = Parameter(initializer(1, [1], mstype.float32)) | |||
| self.beta2_power = Parameter(initializer(1, [1], mstype.float32)) | |||
| self.eps = eps | |||
| self.moment1 = self.parameters.clone(prefix="moment1", init='zeros') | |||
| @@ -377,7 +377,7 @@ class AdamWeightDecayDynamicLR(Optimizer): | |||
| _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) | |||
| _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, power, self.cls_name) | |||
| # turn them to scalar when me support scalar/tensor mix operations | |||
| self.global_step = Parameter(initializer(0, [1]), name="global_step") | |||
| self.global_step = Parameter(initializer(0, [1])) | |||
| self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32)) | |||
| self.warmup_flag = False | |||
| if warmup_steps > 0: | |||
| @@ -41,8 +41,8 @@ class LayerNorm(nn.Cell): | |||
| """ | |||
| def __init__(self, normalized_shape, eps=1e-5): | |||
| super(LayerNorm, self).__init__() | |||
| self.gamma = Parameter(initializer('ones', normalized_shape), name="gamma") | |||
| self.beta = Parameter(initializer('zeros', normalized_shape), name="beta") | |||
| self.gamma = Parameter(initializer('ones', normalized_shape)) | |||
| self.beta = Parameter(initializer('zeros', normalized_shape)) | |||
| self.mean = P.ReduceMean(keep_dims=True) | |||
| self.eps = eps | |||
| @@ -100,8 +100,8 @@ class Mapping(nn.Cell): | |||
| super(Mapping, self).__init__() | |||
| self.output_size = output_size | |||
| self.input_size = input_size | |||
| self.weight = Parameter(initializer(Normal(sigma=0.02*scale), [input_size, output_size]), name="mapping_weight") | |||
| self.bias = Parameter(initializer("zeros", [output_size,]), name="mapping_bias") | |||
| self.weight = Parameter(initializer(Normal(sigma=0.02*scale), [input_size, output_size])) | |||
| self.bias = Parameter(initializer("zeros", [output_size,])) | |||
| self.dtype = dtype | |||
| self.cast = P.Cast() | |||
| @@ -194,8 +194,7 @@ class EmbeddingLookup(nn.Cell): | |||
| super(EmbeddingLookup, self).__init__() | |||
| self.vocab_size = config.vocab_size | |||
| self.embedding_size = config.embedding_size | |||
| self.embedding_table = Parameter(initializer(TruncatedNormal(0.02), [self.vocab_size, self.embedding_size]), | |||
| name="embedding_table") | |||
| self.embedding_table = Parameter(initializer(TruncatedNormal(0.02), [self.vocab_size, self.embedding_size])) | |||
| self.gather = P.GatherV2() | |||
| self.shape = (-1, config.seq_length, config.embedding_size) | |||
| def construct(self, input_ids): | |||
| @@ -106,8 +106,7 @@ class GPTTrainOneStepWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -44,8 +44,7 @@ class EmbeddingLookup(nn.Cell): | |||
| init_weight = np.random.normal(0, embed_dim ** -0.5, size=[vocab_size, embed_dim]).astype(np.float32) | |||
| # 0 is Padding index, thus init it as 0. | |||
| init_weight[0, :] = 0 | |||
| self.embedding_table = Parameter(Tensor(init_weight), | |||
| name='embedding_table') | |||
| self.embedding_table = Parameter(Tensor(init_weight)) | |||
| self.expand = P.ExpandDims() | |||
| self.gather = P.GatherV2() | |||
| self.one_hot = P.OneHot() | |||
| @@ -277,8 +277,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| self.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -44,8 +44,7 @@ class EmbeddingLookup(nn.Cell): | |||
| init_weight = np.random.normal(0, embed_dim ** -0.5, size=[vocab_size, embed_dim]).astype(np.float32) | |||
| # 0 is Padding index, thus init it as 0. | |||
| init_weight[0, :] = 0 | |||
| self.embedding_table = Parameter(Tensor(init_weight), | |||
| name='embedding_table') | |||
| self.embedding_table = Parameter(Tensor(init_weight)) | |||
| self.expand = P.ExpandDims() | |||
| self.gather = P.GatherV2() | |||
| self.one_hot = P.OneHot() | |||
| @@ -243,8 +243,7 @@ class BertTrainWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -497,8 +496,7 @@ class BertEvaluationWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -110,8 +110,7 @@ class EmbeddingLookup(nn.Cell): | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| self.embedding_table = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [vocab_size, embedding_size]), | |||
| name='embedding_table') | |||
| [vocab_size, embedding_size])) | |||
| self.expand = P.ExpandDims() | |||
| self.shape_flat = (-1,) | |||
| self.gather = P.GatherV2() | |||
| @@ -170,8 +169,7 @@ class EmbeddingPostprocessor(nn.Cell): | |||
| self.embedding_table = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [token_type_vocab_size, | |||
| embedding_size]), | |||
| name='embedding_table') | |||
| embedding_size])) | |||
| self.shape_flat = (-1,) | |||
| self.one_hot = P.OneHot() | |||
| self.on_value = Tensor(1.0, mstype.float32) | |||
| @@ -187,8 +185,7 @@ class EmbeddingPostprocessor(nn.Cell): | |||
| self.full_position_embeddings = Parameter(initializer | |||
| (TruncatedNormal(initializer_range), | |||
| [max_position_embeddings, | |||
| embedding_size]), | |||
| name='full_position_embeddings') | |||
| embedding_size])) | |||
| def construct(self, token_type_ids, word_embeddings): | |||
| """embedding postprocessor""" | |||
| @@ -317,8 +314,7 @@ class RelaPosEmbeddingsGenerator(nn.Cell): | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| self.embeddings_table = Parameter( | |||
| initializer(TruncatedNormal(initializer_range), | |||
| [self.vocab_size, self.depth]), | |||
| name='embeddings_for_position') | |||
| [self.vocab_size, self.depth])) | |||
| self.relative_positions_matrix = RelaPosMatrixGenerator(length=length, | |||
| max_relative_position=max_relative_position) | |||
| self.reshape = P.Reshape() | |||
| @@ -291,8 +291,7 @@ class TransformerTrainOneStepWithLossScaleCell(nn.Cell): | |||
| self.loss_scale = None | |||
| self.loss_scaling_manager = scale_update_cell | |||
| if scale_update_cell: | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32), | |||
| name="loss_scale") | |||
| self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32)) | |||
| @C.add_flags(has_effect=True) | |||
| def construct(self, | |||
| @@ -115,8 +115,7 @@ class EmbeddingLookup(nn.Cell): | |||
| self.vocab_size = vocab_size | |||
| self.embedding_size = embedding_size | |||
| self.use_one_hot_embeddings = use_one_hot_embeddings | |||
| self.embedding_table = Parameter(normal_weight([vocab_size, embedding_size], embedding_size), | |||
| name='embedding_table') | |||
| self.embedding_table = Parameter(normal_weight([vocab_size, embedding_size], embedding_size)) | |||
| self.expand = P.ExpandDims() | |||
| self.shape_flat = (-1,) | |||
| self.gather = P.GatherV2() | |||
| @@ -47,14 +47,14 @@ class DenseLayer(nn.Cell): | |||
| weight_init.shape()[1] != in_channels: | |||
| raise ValueError("weight_init shape error") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") | |||
| self.weight = Parameter(initializer(weight_init, [out_channels, in_channels])) | |||
| if self.has_bias: | |||
| if isinstance(bias_init, Tensor): | |||
| if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: | |||
| raise ValueError("bias_init shape error") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") | |||
| self.bias = Parameter(initializer(bias_init, [out_channels])) | |||
| self.matmul = P.MatMul(transpose_b=True) | |||
| self.bias_add = P.BiasAdd() | |||
| @@ -35,10 +35,10 @@ class QuanConv(nn.Conv2d): | |||
| self.x_upper_bound = Tensor(2 ** 8 - 1, ms.float32) | |||
| self.w_lower_bound = Tensor(-2 ** 7 - 1, ms.float32) | |||
| self.w_upper_bound = Tensor(2 ** 7, ms.float32) | |||
| self.scale_a = Parameter(initializer('ones', [1]), name='scale_a') | |||
| self.scale_a = Parameter(initializer('ones', [1])) | |||
| self.scale_w = Parameter(initializer( | |||
| 'ones', [out_channels]), name='scale_w') | |||
| self.zp_a = Parameter(initializer('ones', [1]), name='zp_a') | |||
| 'ones', [out_channels])) | |||
| self.zp_a = Parameter(initializer('ones', [1])) | |||
| def construct(self, in_data): | |||
| r"""construct of QuantConv""" | |||
| @@ -119,12 +119,12 @@ class DepthwiseConv(nn.Cell): | |||
| self.bias_add = P.BiasAdd() | |||
| weight_shape = [channel_multiplier, in_planes, *self.kernel_size] | |||
| self.weight = Parameter(initializer( | |||
| 'ones', weight_shape), name='weight') | |||
| 'ones', weight_shape)) | |||
| if has_bias: | |||
| bias_shape = [channel_multiplier * in_planes] | |||
| self.bias = Parameter(initializer( | |||
| 'zeros', bias_shape), name='bias') | |||
| 'zeros', bias_shape)) | |||
| else: | |||
| self.bias = None | |||
| @@ -499,7 +499,7 @@ class DepthWiseConv(nn.Cell): | |||
| group=in_planes) | |||
| self.weight = Parameter(initializer(weight_init, | |||
| [in_planes*1, 1, kernel_size, kernel_size]), name='depthwise_weight') | |||
| [in_planes*1, 1, kernel_size, kernel_size])) | |||
| else: | |||
| self.depthwise_conv = P.DepthwiseConv2dNative(channel_multiplier=1, | |||
| @@ -508,7 +508,7 @@ class DepthWiseConv(nn.Cell): | |||
| pad=int(kernel_size/2)) | |||
| self.weight = Parameter(initializer(weight_init, | |||
| [1, in_planes, kernel_size, kernel_size]), name='depthwise_weight') | |||
| [1, in_planes, kernel_size, kernel_size])) | |||
| def construct(self, x): | |||
| x = self.depthwise_conv(x, self.weight) | |||
| @@ -31,11 +31,11 @@ class DepthWiseConv(nn.Cell): | |||
| self.bias_add = P.BiasAdd() | |||
| weight_shape = [channel_multiplier, in_planes, kernel_size[0], kernel_size[1]] | |||
| self.weight = Parameter(initializer('ones', weight_shape), name='weight') | |||
| self.weight = Parameter(initializer('ones', weight_shape)) | |||
| if has_bias: | |||
| bias_shape = [channel_multiplier * in_planes] | |||
| self.bias = Parameter(initializer('zeros', bias_shape), name='bias') | |||
| self.bias = Parameter(initializer('zeros', bias_shape)) | |||
| else: | |||
| self.bias = None | |||