@@ -35,7 +35,7 @@ from .layers import _LayerNorm, _Linear, _check_input_shape, \
     _args_type_validator_check, _valid_type_checks, _valid_value_checks, \
     _check_shape_equal, _check_past_none_input_none, _check_input_dtype, _check_input_shape_value
 from .op_parallel_config import default_dpmp_config, _PipeLineConfig, OpParallelConfig, _Config, _check_config
-from .moe import default_moe_config, MoE
+from .moe import default_moe_config, MoE, _check_moe_config
 
 __all__ = [
     "AttentionMask",
@@ -1304,6 +1304,7 @@ class TransformerEncoderLayer(Cell):
                                             param_init_type=param_init_type,
                                             use_past=use_past,
                                             parallel_config=parallel_config)
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -1625,6 +1626,7 @@ class TransformerDecoderLayer(Cell):
         self.cross_attention_layernorm = _LayerNorm((hidden_size,)).to_float(
             layernorm_compute_type)
         self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),))
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         if self.use_moe is True:
             self.output = MoE(hidden_size=hidden_size,
@@ -2004,7 +2006,7 @@ class TransformerEncoder(Cell):
                  parallel_config=default_transformer_config):
         super(TransformerEncoder, self).__init__()
         _check_config(parallel_config)
-
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
@@ -2205,6 +2207,7 @@ class TransformerDecoder(Cell):
             raise RuntimeError(f"The {self.cls_name} does not support auto parallel mode now.")
         self.num_layers = num_layers
         self.blocks = nn.CellList()
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         for i in range(num_layers):
             block = TransformerDecoderLayer(hidden_size=hidden_size,
@@ -2433,7 +2436,7 @@ class Transformer(Cell):
         # The shard setting of Transformer is set within the TransformerEncoderLayer
         if not lambda_func:
             lambda_func = _get_lambda_func(total_layer=encoder_layers + decoder_layers)
-
+        _check_moe_config(moe_config, parallel_config)
         self.use_moe = (moe_config.expert_num > 1)
         self.add = P.Add().shard(((), ()))
         self.aux_loss = Tensor(0.0, mstype.float32)
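Every hunk above applies the same pattern: validate the MoE settings against the parallel settings with _check_moe_config, derive a use_moe flag from moe_config.expert_num, and build the feed-forward output block as an MoE cell when more than one expert is requested. A minimal sketch of that pattern follows, written against the same in-module imports as the diff; the ffn_hidden_size and moe_config keyword names and the FeedForward fallback branch are assumptions, since the hunks only show the first MoE argument.

# Sketch only: condensed from the pattern the hunks introduce, not the real
# __init__ of TransformerEncoderLayer. Symbol locations follow the diff's
# imports; keyword names other than hidden_size are assumptions.
from .layers import FeedForward                        # assumed dense fallback
from .moe import default_moe_config, MoE, _check_moe_config

def _build_output_block(hidden_size, ffn_hidden_size,
                        moe_config=default_moe_config, parallel_config=None):
    # Reject expert counts that do not fit the parallel configuration.
    _check_moe_config(moe_config, parallel_config)
    # MoE is only switched on when more than one expert is configured.
    use_moe = (moe_config.expert_num > 1)
    if use_moe:
        return MoE(hidden_size=hidden_size,
                   ffn_hidden_size=ffn_hidden_size,     # assumed keyword
                   moe_config=moe_config,               # assumed keyword
                   parallel_config=parallel_config)
    return FeedForward(hidden_size=hidden_size,         # assumed fallback path
                       ffn_hidden_size=ffn_hidden_size,
                       parallel_config=parallel_config)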