# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""base process"""
import copy
import mindspore.nn as nn
from mindspore.nn.optim import LARS
from mindspore import log as logger
from mindspore.common import Parameter
from .less_batch_normalization import CommonHeadLastFN


__all__ = ["OptimizerProcess", "ParameterProcess"]


class OptimizerProcess:
    r"""
    Process optimizer for Boost. Currently, this class supports adding GC(grad centralization) tags
    and creating new optimizers.

    Args:
       opt (Cell): Optimizer used.

    Examples:
        >>> import numpy as np
        >>> from mindspore import Tensor, Parameter, nn
        >>> import mindspore.ops as ops
        >>> from mindspore.boost import OptimizerProcess
        >>>
        >>> class Net(nn.Cell):
        ...     def __init__(self, in_features, out_features):
        ...         super(Net, self).__init__()
        ...         self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)),
        ...                                 name='weight')
        ...         self.matmul = ops.MatMul()
        ...
        ...     def construct(self, x):
        ...         output = self.matmul(x, self.weight)
        ...         return output
        ...
        >>> size, in_features, out_features = 16, 16, 10
        >>> network = Net(in_features, out_features)
        >>> optimizer = nn.Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
        >>> optimizer_process = OptimizerProcess(optimizer)
        >>> optimizer_process.add_grad_centralization(network)
        >>> optimizer = optimizer_process.generate_new_optimizer()
    """
    def __init__(self, opt):
        if isinstance(opt, LARS):
            # LARS wraps an inner optimizer: remember the inner class/arguments and the
            # LARS arguments separately so both layers can be rebuilt later.
            self.is_lars = True
            self.opt_class = type(opt.opt)
            self.opt_init_args = opt.opt.init_args
            self.lars_init_args = opt.init_args
        else:
            self.is_lars = False
            self.opt_class = type(opt)
            self.opt_init_args = opt.init_args
        # Read for both kinds of optimizer; `generate_new_optimizer` needs it on either path.
        self.origin_params = opt.init_params["params"]

    def build_params_dict(self, network):
        r"""
        Build the parameter's dict of the network.

        Args:
            network (Cell): The training network.

        Returns:
            dict, maps ``id(parameter)`` to the cell that yielded the parameter from
            ``cell.get_parameters(expand=False)``. If several cells yield the same parameter,
            the last one visited wins.
        """
        cells = network.cells_and_names()
        params_dict = {}
        for _, cell in cells:
            for par in cell.get_parameters(expand=False):
                params_dict[id(par)] = cell
        return params_dict

    def build_gc_params_group(self, params_dict, parameters):
        r"""
        Build the parameter's group with grad centralization.

        Groups that carry an 'order_params' key are passed through untouched. Every other group is
        split into at most two deep copies of itself: one holding the parameters that receive
        gradient centralization (tagged with ``'grad_centralization': True``) and one holding the
        rest. An empty half is not emitted.

        Args:
            params_dict (dict): The network's parameter dict.
            parameters (list): The network's parameter list.

        Returns:
            list, the new group parameters.

        Raises:
            KeyError: If a candidate parameter is not present in `params_dict`.
        """
        group_params = []
        for group_param in parameters:
            if 'order_params' in group_param:
                group_params.append(group_param)
                continue
            params_gc_value = []
            params_value = []
            for param in group_param['params']:
                if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
                    param_cell = params_dict[id(param)]
                    # Grouped convolutions and the CommonHeadLastFN cell are excluded from
                    # gradient centralization.
                    if (isinstance(param_cell, nn.Conv2d) and param_cell.group > 1) or \
                            isinstance(param_cell, CommonHeadLastFN):
                        params_value.append(param)
                    else:
                        params_gc_value.append(param)
                else:
                    # Parameters whose name contains 'beta', 'gamma' or 'bias' are never centralized.
                    params_value.append(param)
            if params_gc_value:
                new_group_param = copy.deepcopy(group_param)
                new_group_param['params'] = params_gc_value
                new_group_param['grad_centralization'] = True
                group_params.append(new_group_param)
            if params_value:
                new_group_param = copy.deepcopy(group_param)
                new_group_param['params'] = params_value
                group_params.append(new_group_param)
        return group_params

    def add_grad_centralization(self, network):
        r"""
        Add gradient centralization.

        Replaces the stored parameter groups with the result of `build_gc_params_group`. When the
        optimizer was created from a plain list of Parameter (no groups), a warning is logged and
        nothing is changed.

        Args:
            network (Cell): The training network.

        Raises:
            ValueError: If the optimizer's parameter list is empty or None.
            TypeError: If the first element of the parameter list is neither a dict nor a Parameter.
        """
        params_dict = self.build_params_dict(network)

        parameters = self.origin_params
        if parameters is not None and not isinstance(parameters, list):
            parameters = list(parameters)

        if not parameters:
            raise ValueError("Optimizer got an empty parameter list.")

        if not isinstance(parameters[0], (dict, Parameter)):
            raise TypeError("Only a list of Parameter or dict can be supported.")

        if isinstance(parameters[0], Parameter):
            logger.warning("Only group parameters support gradient centralization.")
            return

        self.origin_params = self.build_gc_params_group(params_dict, parameters)

    def generate_new_optimizer(self):
        """
        Generate new optimizer.

        Returns:
            Cell, an optimizer of the recorded class built from the current parameters and the
            recorded init arguments, wrapped in LARS again if the original optimizer was LARS.
        """
        opt = self.opt_class(params=self.origin_params, **self.opt_init_args)
        if self.is_lars:
            opt = LARS(opt, **self.lars_init_args)
        return opt


class ParameterProcess:
    r"""
    Process parameter for Boost. Currently, this class supports creating group parameters
    and automatically setting gradient segmentation point.

    Examples:
        >>> import numpy as np
        >>> from mindspore import Tensor, Parameter, nn
        >>> import mindspore.ops as ops
        >>> from mindspore.boost import ParameterProcess
        >>>
        >>> class Net(nn.Cell):
        ...     def __init__(self, in_features, out_features):
        ...         super(Net, self).__init__()
        ...         self.weight = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)),
        ...                                 name='weight')
        ...         self.weight2 = Parameter(Tensor(np.ones([in_features, out_features]).astype(np.float32)),
        ...                                  name='weight2')
        ...         self.matmul = ops.MatMul()
        ...         self.matmul2 = ops.MatMul()
        ...
        ...     def construct(self, x):
        ...         output = self.matmul(x, self.weight)
        ...         output2 = self.matmul2(x, self.weight2)
        ...         return output + output2
        ...
        >>> size, in_features, out_features = 16, 16, 10
        >>> network = Net(in_features, out_features)
        >>> new_parameter = network.trainable_params()[:1]
        >>> parameter_process = ParameterProcess()
        >>> group_params = parameter_process.generate_group_params(new_parameter, network.trainable_params())
    """
    def __init__(self):
        # Running fusion index; it keeps growing across calls to `assign_parameter_group`.
        self._parameter_indices = 1

    def assign_parameter_group(self, parameters, split_point=None):
        r"""
        Assign parameter group.

        Sets `comm_fusion` on every parameter to the current fusion index; the index is
        incremented each time a position listed in the split points is reached.

        Args:
            parameters (list): The network's parameter list.
            split_point (list): The gradient split point of this network. default: None.
                When empty or None, the single split point ``len(parameters) // 2`` is used.

        Returns:
            The `parameters` object that was passed in. It is returned untouched when it is not a
            non-empty list or tuple.
        """
        if not isinstance(parameters, (list, tuple)) or not parameters:
            return parameters

        split_parameter_index = split_point if split_point else [len(parameters) // 2]
        for index, parameter in enumerate(parameters):
            if index in split_parameter_index:
                self._parameter_indices += 1
            parameter.comm_fusion = self._parameter_indices
        return parameters

    def generate_group_params(self, parameters, origin_params):
        r"""
        Generate group parameters.

        Rebuilds the grouping of `origin_params` using the objects from `parameters`, matched by
        parameter name. Parameters in `parameters` that no group claims are collected into one
        extra trailing group.

        Args:
            parameters (list): The network's parameter list.
            origin_params (list): The network's origin parameter list.

        Returns:
            list, the new group parameters.

        Raises:
            ValueError: If `origin_params` is empty or None.
            TypeError: If the first element of `origin_params` is neither a dict nor a Parameter.
        """
        origin_params_copy = origin_params
        if origin_params_copy is not None:
            if not isinstance(origin_params_copy, list):
                origin_params_copy = list(origin_params_copy)

        if not origin_params_copy:
            raise ValueError("Optimizer got an empty parameter list.")

        if not isinstance(origin_params_copy[0], (dict, Parameter)):
            raise TypeError("Only a list of Parameter or dict can be supported.")

        if isinstance(origin_params_copy[0], Parameter):
            # The original optimizer used no groups: a single group with the new parameters suffices.
            group_params = [{"params": parameters}]
            return group_params

        group_params = []
        params_name = [param.name for param in parameters]
        # Names not yet claimed by any group; whatever is left over goes into an extra group.
        new_params_count = list(params_name)
        new_params_clone = {}
        max_key_number = 0
        for group_param in origin_params_copy:
            if 'order_params' in group_param:
                new_group_param = copy.deepcopy(group_param)
                new_group_param['order_params'] = parameters
                group_params.append(new_group_param)
                continue
            params_value = []
            for param in group_param['params']:
                if param.name in params_name:
                    index = params_name.index(param.name)
                    params_value.append(parameters[index])
                    # NOTE: list.remove raises ValueError if the same name is matched a second
                    # time, e.g. when one parameter is listed in two groups.
                    new_params_count.remove(param.name)
            new_group_param = copy.deepcopy(group_param)
            new_group_param['params'] = params_value
            group_params.append(new_group_param)
            # Remember the group with the most keys; it is the template for the leftover group.
            if len(group_param) > max_key_number:
                max_key_number = len(group_param)
                new_params_clone = copy.deepcopy(group_param)
        if new_params_count:
            params_value = []
            for param in new_params_count:
                index = params_name.index(param)
                params_value.append(parameters[index])
            if new_params_clone:
                new_params_clone['params'] = params_value
                group_params.append(new_params_clone)
            else:
                group_params.append({"params": params_value})
        return group_params