From: @yepei6 Reviewed-by: @kingxian,@zh_qh Signed-off-by: @kingxian tags/v1.2.0-rc1
| @@ -46,21 +46,21 @@ def create_quant_config(quant_observer=(nn.FakeQuantWithMinMaxObserver, nn.FakeQ | |||
| Configures the observer type of weights and data flow with quant params. | |||
| Args: | |||
| quant_observer (Observer, list or tuple): The observer type to do quantization. The first element represent | |||
| quant_observer (Union[Observer, list, tuple]): The observer type to do quantization. The first element represents | |||
| weights and the second element represents data flow. | |||
| Default: (nn.FakeQuantWithMinMaxObserver, nn.FakeQuantWithMinMaxObserver) | |||
| quant_delay (int, list or tuple): Number of steps after which weights and activations are quantized during | |||
| quant_delay (Union[int, list, tuple]): Number of steps after which weights and activations are quantized during | |||
| eval. The first element represents weights and the second element represents data flow. Default: (0, 0) | |||
| quant_dtype (QuantDtype, list or tuple): Datatype to use for quantize weights and activations. The first | |||
| quant_dtype (Union[QuantDtype, list, tuple]): Datatype to use for quantizing weights and activations. The first | |||
| element represents weights and the second element represents data flow. | |||
| Default: (QuantDtype.INT8, QuantDtype.INT8) | |||
| per_channel (bool, list or tuple): Quantization granularity based on layer or on channel. If `True` | |||
| per_channel (Union[bool, list, tuple]): Quantization granularity based on layer or on channel. If `True` | |||
| then based on per channel, otherwise based on per layer. The first element represents weights | |||
| and the second element represents data flow. Default: (False, False) | |||
| symmetric (bool, list or tuple): Whether the quantization algorithm is symmetric or not. If `True` then base on | |||
| symmetric otherwise base on asymmetric. The first element represent weights and second | |||
| symmetric (Union[bool, list, tuple]): Whether the quantization algorithm is symmetric or not. If `True` then | |||
| based on symmetric, otherwise based on asymmetric. The first element represents weights and the second | |||
| element represents data flow. Default: (False, False) | |||
| narrow_range (bool, list or tuple): Whether the quantization algorithm uses narrow range or not. | |||
| narrow_range (Union[bool, list, tuple]): Whether the quantization algorithm uses narrow range or not. | |||
| The first element represents weights and the second element represents data flow. Default: (False, False) | |||
| Returns: | |||
| @@ -123,20 +123,20 @@ class QuantizationAwareTraining(Quantizer): | |||
| Args: | |||
| bn_fold (bool): Flag to use bn fold ops for simulation inference operation. Default: True. | |||
| freeze_bn (int): Number of steps after which BatchNorm OP parameters use the total mean and variance. Default: 1e7. | |||
| quant_delay (int, list or tuple): Number of steps after which weights and activations are quantized during | |||
| quant_delay (Union[int, list, tuple]): Number of steps after which weights and activations are quantized during | |||
| eval. The first element represents weights and the second element represents data flow. Default: (0, 0) | |||
| quant_dtype (QuantDtype, list or tuple): Datatype to use for quantize weights and activations. The first | |||
| quant_dtype (Union[QuantDtype, list, tuple]): Datatype to use for quantizing weights and activations. The first | |||
| element represents weights and the second element represents data flow. | |||
| Default: (QuantDtype.INT8, QuantDtype.INT8) | |||
| per_channel (bool, list or tuple): Quantization granularity based on layer or on channel. If `True` | |||
| per_channel (Union[bool, list, tuple]): Quantization granularity based on layer or on channel. If `True` | |||
| then based on per channel, otherwise based on per layer. The first element represents weights | |||
| and the second element represents data flow. Default: (False, False) | |||
| symmetric (bool, list or tuple): Whether the quantization algorithm is symmetric or not. If `True` then base on | |||
| symmetric otherwise base on asymmetric. The first element represent weights and second | |||
| symmetric (Union[bool, list, tuple]): Whether the quantization algorithm is symmetric or not. If `True` then | |||
| based on symmetric, otherwise based on asymmetric. The first element represents weights and the second | |||
| element represents data flow. Default: (False, False) | |||
| narrow_range (bool, list or tuple): Whether the quantization algorithm uses narrow range or not. | |||
| narrow_range (Union[bool, list, tuple]): Whether the quantization algorithm uses narrow range or not. | |||
| The first element represents weights and the second element represents data flow. Default: (False, False) | |||
| optimize_option (OptimizeOption, list or tuple): Specifies the quant algorithm and options, currently only | |||
| optimize_option (Union[OptimizeOption, list, tuple]): Specifies the quant algorithm and options, currently only | |||
| support QAT. Default: OptimizeOption.QAT | |||
| one_conv_fold (bool): Flag to use one conv bn fold ops for simulation inference operation. Default: True. | |||
| @@ -187,6 +187,7 @@ class QuantizationAwareTraining(Quantizer): | |||
| one_conv_fold=True): | |||
| """Init for QuantizationAwareTraining quantizer""" | |||
| super(QuantizationAwareTraining, self).__init__(optimize_option=optimize_option) | |||
| def convert2list(name, value): | |||
| if not isinstance(value, list) and not isinstance(value, tuple): | |||
| value = [value] | |||
| @@ -133,6 +133,7 @@ def weight2int(data, scale, zero_point, data_type, num_bits=8, narrow_range=Fals | |||
| weight_int[weight_int < quant_min] = quant_min | |||
| return weight_int | |||
| def scale_zp_max_min_from_fake_quant_cell(cell, data_type): | |||
| """Get calculated quantization params for scale, zero point, max and min from `FakeQuantWithMinMax`.""" | |||
| minq = cell.minq.data.asnumpy() | |||
| @@ -271,31 +272,31 @@ def load_nonquant_param_into_quant_net(quant_model, params_dict, quant_new_param | |||
| Load fp32 model parameters into quantization model. | |||
| Args: | |||
| quant_model: quantization model. | |||
| params_dict: parameter dict that stores fp32 parameters. | |||
| quant_new_params: parameters that exist in quantitative network but not in unquantitative network. | |||
| quant_model(Cell): quantization model. | |||
| params_dict(dict): parameter dict that stores fp32 parameters. | |||
| quant_new_params(list): parameters that exist in quantitative network but not in unquantitative network. | |||
| Returns: | |||
| None | |||
| """ | |||
| iterable_dict = { | |||
| 'weight': iter([item for item in params_dict.items() if item[0].endswith('weight')]), | |||
| 'bias': iter([item for item in params_dict.items() if item[0].endswith('bias')]), | |||
| 'gamma': iter([item for item in params_dict.items() if item[0].endswith('gamma')]), | |||
| 'beta': iter([item for item in params_dict.items() if item[0].endswith('beta')]), | |||
| 'moving_mean': iter([item for item in params_dict.items() if item[0].endswith('moving_mean')]), | |||
| 'moving_variance': iter( | |||
| [item for item in params_dict.items() if item[0].endswith('moving_variance')]), | |||
| 'minq': iter([item for item in params_dict.items() if item[0].endswith('minq')]), | |||
| 'maxq': iter([item for item in params_dict.items() if item[0].endswith('maxq')]) | |||
| 'weight': iter(list(filter(lambda item: item[0].endswith('weight'), params_dict.items()))), | |||
| 'bias': iter(list(filter(lambda item: item[0].endswith('bias'), params_dict.items()))), | |||
| 'gamma': iter(list(filter(lambda item: item[0].endswith('gamma'), params_dict.items()))), | |||
| 'beta': iter(list(filter(lambda item: item[0].endswith('beta'), params_dict.items()))), | |||
| 'moving_mean': iter(list(filter(lambda item: item[0].endswith('moving_mean'), params_dict.items()))), | |||
| 'moving_variance': iter(list(filter(lambda item: item[0].endswith('moving_variance'), params_dict.items()))), | |||
| 'minq': iter(list(filter(lambda item: item[0].endswith('minq'), params_dict.items()))), | |||
| 'maxq': iter(list(filter(lambda item: item[0].endswith('maxq'), params_dict.items()))) | |||
| } | |||
| for name, param in quant_model.parameters_and_names(): | |||
| key_name = name.split(".")[-1] | |||
| if key_name not in iterable_dict.keys(): | |||
| if quant_new_params is not None and key_name in quant_new_params: | |||
| continue | |||
| raise ValueError(f"Can't find match parameter in ckpt,param name = {name}") | |||
| if key_name not in quant_new_params: | |||
| raise ValueError(f"Can't find match parameter in ckpt,param name = {name}") | |||
| continue | |||
| value_param = next(iterable_dict[key_name], None) | |||
| if value_param is not None: | |||
| if value_param: | |||
| param.set_data(value_param[1].data) | |||
| print(f'init model param {name} with checkpoint param {value_param[0]}') | |||
| @@ -645,8 +645,8 @@ def set_context(**kwargs): | |||
| >>> context.set_context(mode=context.GRAPH_MODE, | |||
| ... device_target="Ascend",device_id=0, save_graphs=True, | |||
| ... save_graphs_path="/mindspore") | |||
| >>> context.set_context(enable_profiling=True, \ | |||
| profiling_options='{"output":"/home/data/output","training_trace":"on"}') | |||
| >>> context.set_context(enable_profiling=True, | |||
| ... profiling_options='{"output":"/home/data/output","training_trace":"on"}') | |||
| >>> context.set_context(max_device_memory="3.5GB") | |||
| >>> context.set_context(print_file_path="print.pb") | |||
| >>> context.set_context(max_call_depth=80) | |||
| @@ -734,16 +734,14 @@ def set_ps_context(**kwargs): | |||
| Some other environment variables should also be set for parameter server training mode. | |||
| These environment variables are listed below: | |||
| .. code-block:: | |||
| MS_SERVER_NUM # Server number | |||
| MS_WORKER_NUM # Worker number | |||
| MS_SCHED_HOST # Scheduler IP address | |||
| MS_SCHED_PORT # Scheduler port | |||
| MS_ROLE # The role of this process: | |||
| # MS_SCHED represents the scheduler, | |||
| # MS_WORKER represents the worker, | |||
| # MS_PSERVER represents the Server | |||
| MS_SERVER_NUM # Server number | |||
| MS_WORKER_NUM # Worker number | |||
| MS_SCHED_HOST # Scheduler IP address | |||
| MS_SCHED_PORT # Scheduler port | |||
| MS_ROLE # The role of this process: | |||
| MS_SCHED #represents the scheduler, | |||
| MS_WORKER #represents the worker, | |||
| MS_PSERVER #represents the Server | |||
| Args: | |||
| @@ -81,14 +81,14 @@ class CheckpointConfig: | |||
| Args: | |||
| save_checkpoint_steps (int): Steps to save checkpoint. Default: 1. | |||
| save_checkpoint_seconds (int): Seconds to save checkpoint. Default: 0. | |||
| Can't be used with save_checkpoint_steps at the same time. | |||
| save_checkpoint_seconds (int): Seconds to save checkpoint. | |||
| Can't be used with save_checkpoint_steps at the same time. Default: 0. | |||
| keep_checkpoint_max (int): Maximum number of checkpoint files that can be saved. Default: 5. | |||
| keep_checkpoint_per_n_minutes (int): Keep one checkpoint every n minutes. Default: 0. | |||
| Can't be used with keep_checkpoint_max at the same time. | |||
| keep_checkpoint_per_n_minutes (int): Keep one checkpoint every n minutes. | |||
| Can't be used with keep_checkpoint_max at the same time. Default: 0. | |||
| integrated_save (bool): Whether to perform integrated save function in automatic model parallel scene. | |||
| Default: True. Integrated save function is only supported in automatic parallel scene, not supported | |||
| in manual parallel. | |||
| Integrated save function is only supported in automatic parallel scene, not supported | |||
| in manual parallel. Default: True. | |||
| async_save (bool): Whether asynchronous execution saves the checkpoint to a file. Default: False. | |||
| saved_network (Cell): Network to be saved in checkpoint file. If the saved_network has no relation | |||
| with the network in training, the initial value of saved_network will be saved. Default: None. | |||
| @@ -128,6 +128,7 @@ class CheckpointConfig: | |||
| >>> ckpoint_cb = ModelCheckpoint(prefix='LeNet5', directory='./checkpoint', config=config) | |||
| >>> model.train(10, dataset, callbacks=ckpoint_cb) | |||
| """ | |||
| def __init__(self, | |||
| save_checkpoint_steps=1, | |||
| save_checkpoint_seconds=0, | |||
| @@ -231,6 +232,7 @@ class ModelCheckpoint(Callback): | |||
| ValueError: If the prefix is invalid. | |||
| TypeError: If the config is not CheckpointConfig type. | |||
| """ | |||
| def __init__(self, prefix='CKP', directory=None, config=None): | |||
| super(ModelCheckpoint, self).__init__() | |||
| self._latest_ckpt_file_name = "" | |||
| @@ -311,7 +313,7 @@ class ModelCheckpoint(Callback): | |||
| """Check whether to save checkpoint files or not.""" | |||
| if self._config.save_checkpoint_steps and self._config.save_checkpoint_steps > 0: | |||
| if cb_params.cur_step_num >= self._last_triggered_step + self._config.save_checkpoint_steps \ | |||
| or force_to_save is True: | |||
| or force_to_save is True: | |||
| return True | |||
| elif self._config.save_checkpoint_seconds and self._config.save_checkpoint_seconds > 0: | |||
| self._cur_time = time.time() | |||
| @@ -335,7 +337,7 @@ class ModelCheckpoint(Callback): | |||
| if save_ckpt: | |||
| cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \ | |||
| + str(step_num_in_epoch) + ".ckpt" | |||
| + str(step_num_in_epoch) + ".ckpt" | |||
| # update checkpoint file list. | |||
| self._manager.update_ckpoint_filelist(self._directory, self._prefix) | |||
| # keep checkpoint files number equal max number. | |||
| @@ -384,6 +386,7 @@ class ModelCheckpoint(Callback): | |||
| class CheckpointManager: | |||
| """Manage checkpoint files according to train_config of checkpoint.""" | |||
| def __init__(self): | |||
| self._ckpoint_filelist = [] | |||
| @@ -79,15 +79,15 @@ class SummaryCollector(Callback): | |||
| summary_dir (str): The collected data will be persisted to this directory. | |||
| If the directory does not exist, it will be created automatically. | |||
| collect_freq (int): Set the frequency of data collection, it should be greater than zero, | |||
| and the unit is `step`. Default: 10. If a frequency is set, we will collect data | |||
| and the unit is `step`. If a frequency is set, we will collect data | |||
| when (current steps % freq) equals to 0, and the first step will be collected at any time. | |||
| It is important to note that if the data sink mode is used, the unit will become the `epoch`. | |||
| It is not recommended to collect data too frequently, which can affect performance. | |||
| collect_specified_data (Union[None, dict]): Perform custom operations on the collected data. Default: None. | |||
| It is not recommended to collect data too frequently, which can affect performance. Default: 10. | |||
| collect_specified_data (Union[None, dict]): Perform custom operations on the collected data. | |||
| By default, if set to None, all data is collected as the default behavior. | |||
| You can customize the collected data with a dictionary. | |||
| For example, you can set {'collect_metric': False} to control not collecting metrics. | |||
| The data that supports control is shown below. | |||
| The data that supports control is shown below. Default: None. | |||
| - collect_metric (bool): Whether to collect training metrics, currently only the loss is collected. | |||
| The first output will be treated as the loss and it will be averaged. | |||
| @@ -106,14 +106,13 @@ class SummaryCollector(Callback): | |||
| Optional: True/False. Default: True. | |||
| - histogram_regular (Union[str, None]): Collect weight and bias for parameter distribution page | |||
| and displayed in MindInsight. This field allows regular strings to control which parameters to collect. | |||
| Default: None, it means only the first five parameters are collected. | |||
| It is not recommended to collect too many parameters at once, as it can affect performance. | |||
| Note that if you collect too many parameters and run out of memory, the training will fail. | |||
| Default: None, it means only the first five parameters are collected. | |||
| keep_default_action (bool): This field affects the collection behavior of the 'collect_specified_data' field. | |||
| Optional: True/False, Default: True. | |||
| True: it means that after specified data is set, non-specified data is collected as the default behavior. | |||
| False: it means that after specified data is set, only the specified data is collected, | |||
| and the others are not collected. | |||
| and the others are not collected. Optional: True/False, Default: True. | |||
| custom_lineage_data (Union[dict, None]): Allows you to customize the data and present it on the MindInsight | |||
| lineage page. In the custom data, the type of the key supports str, and the type of value supports str, int | |||
| and float. Default: None, it means there is no custom data. | |||
| @@ -121,19 +120,20 @@ class SummaryCollector(Callback): | |||
| Because TensorSummary data is too large to be compared with other summary data, this parameter is used to | |||
| reduce its collection. By default, the maximum number of steps for collecting TensorSummary data is 20, | |||
| but it will not exceed the number of steps for collecting other summary data. | |||
| Default: None, which means to follow the behavior as described above. For example, given `collect_freq=10`, | |||
| when the total steps is 600, TensorSummary will be collected 20 steps, while other summary data 61 steps, | |||
| For example, given `collect_freq=10`, when the total steps is 600, TensorSummary will be collected 20 steps, | |||
| while other summary data 61 steps, | |||
| but when the total steps is 20, both TensorSummary and other summary will be collected 3 steps. | |||
| Also note that when in parallel mode, the total steps will be split evenly, which will | |||
| affect the number of steps TensorSummary will be collected. | |||
| Default: None, which means to follow the behavior as described above. | |||
| max_file_size (Optional[int]): The maximum size in bytes of each file that can be written to the disk. | |||
| Default: None, which means no limit. For example, to write not larger than 4GB, | |||
| specify `max_file_size=4 * 1024**3`. | |||
| For example, to write not larger than 4GB, specify `max_file_size=4*1024**3`. | |||
| Default: None, which means no limit. | |||
| export_options (Union[None, dict]): Perform custom operations on the export data. | |||
| Default: None, it means that the data is not exported. | |||
| Note that the size of export files is not limited by the max_file_size. | |||
| You can customize the export data with a dictionary. For example, you can set {'tensor_format': 'npy'} | |||
| to export tensor as npy file. The data that supports control is shown below. | |||
| Default: None, it means that the data is not exported. | |||
| - tensor_format (Union[str, None]): Customize the export tensor format. Supports ["npy", None]. | |||
| Default: None, it means that the tensor is not exported. | |||
| @@ -110,15 +110,15 @@ class SummaryRecord: | |||
| file_prefix (str): The prefix of file. Default: "events". | |||
| file_suffix (str): The suffix of file. Default: "_MS". | |||
| network (Cell): Obtain a pipeline through network for saving graph summary. Default: None. | |||
| max_file_size (int, optional): The maximum size of each file that can be written to disk (in bytes). \ | |||
| Unlimited by default. For example, to write not larger than 4GB, specify `max_file_size=4 * 1024**3`. | |||
| max_file_size (int, optional): The maximum size of each file that can be written to disk (in bytes). | |||
| Unlimited by default. For example, to write not larger than 4GB, specify `max_file_size=4 * 1024 ** 3`. | |||
| raise_exception (bool, optional): Sets whether to throw an exception when a RuntimeError or OSError exception | |||
| occurs in recording data. Default: False, this means that error logs are printed and no exception is thrown. | |||
| export_options (Union[None, dict]): Perform custom operations on the export data. | |||
| Default: None, it means that the data is not exported. | |||
| Note that the size of export files is not limited by the max_file_size. | |||
| You can customize the export data with a dictionary. For example, you can set {'tensor_format': 'npy'} | |||
| to export tensor as npy file. The data that supports control is shown below. | |||
| to export tensor as npy file. The data that supports control is shown below. Default: None, it means that | |||
| the data is not exported. | |||
| - tensor_format (Union[str, None]): Customize the export tensor format. Supports ["npy", None]. | |||
| Default: None, it means that the tensor is not exported. | |||