# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
- """ test bert of graph compile """
import functools
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.nn as nn
import mindspore.ops.composite as C
from mindspore.common.initializer import TruncatedNormal
from mindspore.common.parameter import ParameterTuple
from mindspore.common.tensor import Tensor
from mindspore.model_zoo.Bert_NEZHA import BertPretrainingLoss, GetNextSentenceOutput
from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import ClipGradients
from mindspore.model_zoo.Bert_NEZHA.bert_model import BertConfig, \
    EmbeddingLookup, EmbeddingPostprocessor, BertOutput, RelaPosMatrixGenerator, \
    RelaPosEmbeddingsGenerator, SaturateCast, BertAttention, BertSelfAttention, \
    BertEncoderCell, BertTransformer, CreateAttentionMaskFromInputMask, BertModel
from mindspore.nn.layer.basic import Norm
from mindspore.nn.optim import AdamWeightDecay, AdamWeightDecayDynamicLR
from ....ops_common import convert
from ....mindspore_test_framework.mindspore_test import mindspore_test
from ....mindspore_test_framework.pipeline.forward.compile_forward import \
    pipeline_for_compile_forward_ge_graph_for_case_by_case_config
from ....mindspore_test_framework.pipeline.gradient.compile_gradient import \
    pipeline_for_compile_grad_ge_graph_for_case_by_case_config

def bert_trans():
    """Build a one-layer BertTransformer switched to training mode."""
    net = BertTransformer(batch_size=1,
                          hidden_size=768,
                          seq_length=128,
                          num_hidden_layers=1,
                          num_attention_heads=12,
                          intermediate_size=768,
                          attention_probs_dropout_prob=0.1,
                          use_one_hot_embeddings=False,
                          initializer_range=0.02,
                          use_relative_positions=False,
                          hidden_act="gelu",
                          compute_type=mstype.float32,
                          return_all_encoders=True)
    net.set_train()
    return net
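
# Hedged usage sketch (underscore-prefixed so pytest does not collect it):
# run the one-layer transformer built above on zero inputs shaped like the
# 'BertTransformer_2' case below. The two-argument call mirrors that case's
# desc_inputs; with return_all_encoders=True the result holds one output per
# encoder layer. This is an illustration, not part of the test pipelines.
def _example_run_bert_trans():
    net = bert_trans()
    hidden = Tensor(np.zeros((1, 128, 768), np.float32))
    attention_mask = Tensor(np.zeros((1, 128, 128), np.float32))
    return net(hidden, attention_mask)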

def set_train(net):
    """Switch the given cell to training mode and return it."""
    net.set_train()
    return net

class NetForAdam(nn.Cell):
    """A single dense layer used to exercise the Adam-style optimizers."""
    def __init__(self):
        super(NetForAdam, self).__init__()
        self.dense = nn.Dense(64, 10)

    def construct(self, x):
        x = self.dense(x)
        return x
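
# Hedged sketch (not collected by pytest): the dense net maps a [1, 64] batch
# to [1, 10] logits, matching the desc_inputs of the 'Adam_1' case below.
def _example_run_net_for_adam():
    net = set_train(NetForAdam())
    return net(Tensor(np.zeros((1, 64), np.float32)))  # shape (1, 10)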

class TrainStepWrapForAdam(nn.Cell):
    """TrainStepWrapForAdam definition"""
    def __init__(self, network):
        super(TrainStepWrapForAdam, self).__init__()
        self.network = network
        self.weights = ParameterTuple(network.get_parameters())
        self.optimizer = AdamWeightDecay(self.weights)
        self.clip_gradients = ClipGradients()

    def construct(self, x, sens):
        weights = self.weights
        # Differentiate the wrapped network w.r.t. its weights, seeding the
        # backward pass with the externally supplied sensitivity `sens`.
        grads = C.grad_by_list_with_sens(self.network, weights)(x, sens)
        grads = self.clip_gradients(grads, 1, 1.0)
        return self.optimizer(grads)
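
# Hedged sketch of one optimizer step through the wrapper above; an all-ones
# sensitivity tensor plays the role of the second desc_inputs entry in the
# 'Adam_1' case below.
def _example_adam_step():
    wrap = set_train(TrainStepWrapForAdam(NetForAdam()))
    x = Tensor(np.zeros((1, 64), np.float32))
    sens = Tensor(np.ones((1, 10), np.float32))
    return wrap(x, sens)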

class TrainStepWrapForAdamDynamicLr(nn.Cell):
    """TrainStepWrapForAdamDynamicLr definition"""
    def __init__(self, network):
        super(TrainStepWrapForAdamDynamicLr, self).__init__()
        self.network = network
        self.weights = ParameterTuple(network.get_parameters())
        self.optimizer = AdamWeightDecayDynamicLR(self.weights, 10)
        # A fixed all-ones sensitivity, so construct only needs the input x.
        self.sens = Tensor(np.ones(shape=(1, 10)).astype(np.float32))

    def construct(self, x):
        weights = self.weights
        grads = C.grad_by_list_with_sens(self.network, weights)(x, self.sens)
        return self.optimizer(grads)
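
# Hedged sketch: the dynamic-LR wrapper carries its own sensitivity tensor, so
# a single [1, 64] input drives the whole step (see the
# 'AdamWeightDecayDynamicLR' case below).
def _example_adam_dynamic_lr_step():
    wrap = set_train(TrainStepWrapForAdamDynamicLr(NetForAdam()))
    return wrap(Tensor(np.zeros((1, 64), np.float32)))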

class TempC2Wrap(nn.Cell):
    """Bind two constant arguments to a three-argument op for graph tests."""
    def __init__(self, op, c1=None, c2=None):
        super(TempC2Wrap, self).__init__()
        self.op = op
        self.c1 = c1
        self.c2 = c2

    def construct(self, x1):
        x = self.op(x1, self.c1, self.c2)
        return x
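
# Hedged sketch mirroring the 'ClipGradients' case at the end of the list
# below: clip a tuple of single-element gradients with the same constants
# (clip_type=1, clip_value=1.0) that the test case binds via TempC2Wrap.
def _example_clip_gradients():
    clip = TempC2Wrap(ClipGradients(), 1, 1.0)
    grads = tuple(convert(shp) for shp in [[1], [1], [1]])
    return clip(grads)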

# Each entry pairs a case name with a config dict: 'block' is the cell under
# test, 'desc_inputs' lists input shapes (or ready-made Tensors) for the
# forward pass, 'desc_bprop' lists the sensitivity shapes for the backward
# pass, 'num_output' records how many outputs the block returns, and 'skip'
# names the pipeline phases to leave out for that case.
test_case_cell_ops = [
    ('Norm_keepdims', {
        'block': Norm(keep_dims=True),
        'desc_inputs': [[1, 3, 4, 4]],
        'desc_bprop': [[1]]}),
    ('SaturateCast', {
        'block': SaturateCast(),
        'desc_inputs': [[1, 3, 4, 4]],
        'desc_bprop': [[1, 3, 4, 4]]}),
    ('RelaPosMatrixGenerator_0', {
        'block': RelaPosMatrixGenerator(length=128, max_relative_position=16),
        'desc_inputs': [],
        'desc_bprop': [[128, 128]],
        'skip': ['backward']}),
    ('RelaPosEmbeddingsGenerator_0', {
        'block': RelaPosEmbeddingsGenerator(length=128, depth=512,
                                            max_relative_position=16,
                                            initializer_range=0.2),
        'desc_inputs': [],
        'desc_bprop': [[16384, 512]],
        'skip': ['backward']}),
    ('RelaPosEmbeddingsGenerator_1', {
        'block': RelaPosEmbeddingsGenerator(length=128, depth=512,
                                            max_relative_position=16,
                                            initializer_range=0.2,
                                            use_one_hot_embeddings=False),
        'desc_inputs': [],
        'desc_bprop': [[128, 128, 512]],
        'skip': ['backward']}),
    ('RelaPosEmbeddingsGenerator_2', {
        'block': RelaPosEmbeddingsGenerator(length=128, depth=64,
                                            max_relative_position=16,
                                            initializer_range=0.2,
                                            use_one_hot_embeddings=False),
        'desc_inputs': [],
        'desc_bprop': [[128, 128, 64]],
        'skip': ['backward']}),
    ('BertAttention_0', {
        'block': BertAttention(batch_size=64,
                               from_tensor_width=768,
                               to_tensor_width=768,
                               from_seq_length=128,
                               to_seq_length=128,
                               num_attention_heads=12,
                               size_per_head=64,
                               query_act=None,
                               key_act=None,
                               value_act=None,
                               has_attention_mask=True,
                               attention_probs_dropout_prob=0.1,
                               use_one_hot_embeddings=False,
                               initializer_range=0.02,
                               do_return_2d_tensor=True,
                               use_relative_positions=False,
                               compute_type=mstype.float32),
        'desc_inputs': [[64, 128, 768], [64, 128, 768], [64, 128, 128]],
        'desc_bprop': [[8192, 768]]}),
    ('BertAttention_1', {
        'block': BertAttention(batch_size=64,
                               from_tensor_width=768,
                               to_tensor_width=768,
                               from_seq_length=128,
                               to_seq_length=128,
                               num_attention_heads=12,
                               size_per_head=64,
                               query_act=None,
                               key_act=None,
                               value_act=None,
                               has_attention_mask=True,
                               attention_probs_dropout_prob=0.1,
                               use_one_hot_embeddings=False,
                               initializer_range=0.02,
                               do_return_2d_tensor=True,
                               use_relative_positions=True,
                               compute_type=mstype.float32),
        'desc_inputs': [[64, 128, 768], [64, 128, 768], [64, 128, 128]],
        'desc_bprop': [[8192, 768]]}),
    ('BertAttention_2', {
        'block': BertAttention(batch_size=64,
                               from_tensor_width=768,
                               to_tensor_width=768,
                               from_seq_length=128,
                               to_seq_length=128,
                               num_attention_heads=12,
                               size_per_head=64,
                               query_act=None,
                               key_act=None,
                               value_act=None,
                               has_attention_mask=False,
                               attention_probs_dropout_prob=0.1,
                               use_one_hot_embeddings=False,
                               initializer_range=0.02,
                               do_return_2d_tensor=True,
                               use_relative_positions=True,
                               compute_type=mstype.float32),
        'desc_inputs': [[64, 128, 768], [64, 128, 768], [64, 128, 128]],
        'desc_bprop': [[8192, 768]]}),
    ('BertAttention_3', {
        'block': BertAttention(batch_size=64,
                               from_tensor_width=768,
                               to_tensor_width=768,
                               from_seq_length=128,
                               to_seq_length=128,
                               num_attention_heads=12,
                               size_per_head=64,
                               query_act=None,
                               key_act=None,
                               value_act=None,
                               has_attention_mask=True,
                               attention_probs_dropout_prob=0.1,
                               use_one_hot_embeddings=False,
                               initializer_range=0.02,
                               do_return_2d_tensor=False,
                               use_relative_positions=True,
                               compute_type=mstype.float32),
        'desc_inputs': [[64, 128, 768], [64, 128, 768], [64, 128, 128]],
        'desc_bprop': [[8192, 768]]}),
    ('BertOutput', {
        'block': BertOutput(in_channels=768,
                            out_channels=768,
                            initializer_range=0.02,
                            dropout_prob=0.1),
        'desc_inputs': [[8192, 768], [8192, 768]],
        'desc_bprop': [[8192, 768]]}),
    ('BertSelfAttention_0', {
        'block': BertSelfAttention(batch_size=64,
                                   seq_length=128,
                                   hidden_size=768,
                                   num_attention_heads=12,
                                   attention_probs_dropout_prob=0.1,
                                   use_one_hot_embeddings=False,
                                   initializer_range=0.02,
                                   hidden_dropout_prob=0.1,
                                   use_relative_positions=False,
                                   compute_type=mstype.float32),
        'desc_inputs': [[64, 128, 768], [64, 128, 128]],
        'desc_bprop': [[8192, 768]]}),
    ('BertEncoderCell', {
        'block': BertEncoderCell(batch_size=64,
                                 hidden_size=768,
                                 seq_length=128,
                                 num_attention_heads=12,
                                 intermediate_size=768,
                                 attention_probs_dropout_prob=0.02,
                                 use_one_hot_embeddings=False,
                                 initializer_range=0.02,
                                 hidden_dropout_prob=0.1,
                                 use_relative_positions=False,
                                 hidden_act="gelu",
                                 compute_type=mstype.float32),
        'desc_inputs': [[64, 128, 768], [64, 128, 128]],
        'desc_bprop': [[8192, 768]]}),
    ('BertTransformer_0', {
        'block': BertTransformer(batch_size=1,
                                 hidden_size=768,
                                 seq_length=128,
                                 num_hidden_layers=1,
                                 num_attention_heads=12,
                                 intermediate_size=768,
                                 attention_probs_dropout_prob=0.1,
                                 use_one_hot_embeddings=False,
                                 initializer_range=0.02,
                                 use_relative_positions=False,
                                 hidden_act="gelu",
                                 compute_type=mstype.float32,
                                 return_all_encoders=True),
        'desc_inputs': [[1, 128, 768], [1, 128, 128]]}),
    ('BertTransformer_1', {
        'block': BertTransformer(batch_size=64,
                                 hidden_size=768,
                                 seq_length=128,
                                 num_hidden_layers=2,
                                 num_attention_heads=12,
                                 intermediate_size=768,
                                 attention_probs_dropout_prob=0.1,
                                 use_one_hot_embeddings=False,
                                 initializer_range=0.02,
                                 use_relative_positions=True,
                                 hidden_act="gelu",
                                 compute_type=mstype.float32,
                                 return_all_encoders=False),
        'desc_inputs': [[64, 128, 768], [64, 128, 128]]}),
    ('EmbeddingLookup', {
        'block': EmbeddingLookup(vocab_size=32000,
                                 embedding_size=768,
                                 embedding_shape=[1, 128, 768],
                                 use_one_hot_embeddings=False,
                                 initializer_range=0.02),
        'desc_inputs': [Tensor(np.random.rand(128).astype(np.int32))],
        'desc_bprop': [[1, 128, 768], [1, 128, 768]],
        'num_output': 2}),
    ('EmbeddingPostprocessor', {
        'block': EmbeddingPostprocessor(embedding_size=768,
                                        embedding_shape=[1, 128, 768],
                                        use_token_type=True,
                                        token_type_vocab_size=16,
                                        use_one_hot_embeddings=False,
                                        initializer_range=0.02,
                                        max_position_embeddings=512,
                                        dropout_prob=0.1),
        'desc_inputs': [Tensor(np.random.rand(128).astype(np.int32)), [1, 128, 768]],
        'desc_bprop': [[1, 128, 768]]}),
    ('CreateAttentionMaskFromInputMask', {
        'block': CreateAttentionMaskFromInputMask(config=BertConfig(batch_size=1)),
        'desc_inputs': [[128]],
        'desc_bprop': [[1, 128, 128]]}),
    ('BertOutput_0', {
        'block': BertOutput(in_channels=768,
                            out_channels=768,
                            initializer_range=0.02,
                            dropout_prob=0.1),
        'desc_inputs': [[1, 768], [1, 768]],
        'desc_bprop': [[1, 128, 768]]}),  # maybe not right
    ('BertTransformer_2', {
        'block': bert_trans(),
        'desc_inputs': [[1, 128, 768], [1, 128, 128]]}),

    ('BertModel', {
        'block': BertModel(config=BertConfig(batch_size=1,
                                             num_hidden_layers=1,
                                             intermediate_size=768,
                                             token_type_ids_from_dataset=False),
                           is_training=True),
        'desc_inputs': [Tensor(np.random.rand(128).astype(np.int32)),
                        Tensor(np.random.rand(128).astype(np.int32)), [128]],
        'desc_bprop': [[1, 128, 768], [1, 128, 768], [1, 128, 768]],
        'num_output': 3}),  # maybe not right

    ('BertModel_1', {
        'block': BertModel(config=BertConfig(batch_size=1,
                                             num_hidden_layers=1,
                                             intermediate_size=768,
                                             token_type_ids_from_dataset=False),
                           is_training=False),
        'desc_inputs': [Tensor(np.random.rand(128).astype(np.int32)),
                        Tensor(np.random.rand(128).astype(np.int32)), [128]],
        'desc_bprop': [[1, 128, 768], [1, 128, 768], [1, 128, 768]],
        'num_output': 3}),  # maybe not right

    ('BertModel_2', {
        'block': BertModel(config=BertConfig(batch_size=1,
                                             num_hidden_layers=1,
                                             intermediate_size=768,
                                             token_type_ids_from_dataset=False,
                                             input_mask_from_dataset=False),
                           is_training=True),
        'desc_inputs': [Tensor(np.random.rand(128).astype(np.int32)),
                        Tensor(np.random.rand(128).astype(np.int32)), [128]],
        'desc_bprop': [[1, 128, 768], [1, 128, 768], [1, 128, 768]],
        'num_output': 3}),  # maybe not right

    ('BertPretrainingLoss', {
        'block': BertPretrainingLoss(config=BertConfig(batch_size=1)),
        'desc_inputs': [[32000], [20, 2], Tensor(np.array([1]).astype(np.int32)),
                        [20], Tensor(np.array([20]).astype(np.int32))],
        'desc_bprop': [[1]],
        'num_output': 1}),
    ('Dense_1', {
        'block': nn.Dense(in_channels=768,
                          out_channels=3072,
                          activation='gelu',
                          weight_init=TruncatedNormal(0.02)),
        'desc_inputs': [[3, 768]],
        'desc_bprop': [[3, 3072]]}),
    ('Dense_2', {
        'block': set_train(nn.Dense(in_channels=768,
                                    out_channels=3072,
                                    activation='gelu',
                                    weight_init=TruncatedNormal(0.02))),
        'desc_inputs': [[3, 768]],
        'desc_bprop': [[3, 3072]]}),
    ('GetNextSentenceOutput', {
        'block': GetNextSentenceOutput(BertConfig(batch_size=1)),
        'desc_inputs': [[128, 768]],
        'desc_bprop': [[128, 2]]}),
    ('Adam_1', {
        'block': set_train(TrainStepWrapForAdam(NetForAdam())),
        'desc_inputs': [[1, 64], [1, 10]],
        'skip': ['backward']}),
    ('Adam_2', {
        'block': set_train(TrainStepWrapForAdam(GetNextSentenceOutput(BertConfig(batch_size=1)))),
        'desc_inputs': [[128, 768], [128, 2]],
        'skip': ['backward']}),
    ('AdamWeightDecayDynamicLR', {
        'block': set_train(TrainStepWrapForAdamDynamicLr(NetForAdam())),
        'desc_inputs': [[1, 64]],
        'skip': ['backward']}),
    ('ClipGradients', {
        'block': TempC2Wrap(ClipGradients(), 1, 1.0),
        'desc_inputs': [tuple(convert(shp) for shp in [[1], [1], [1]])],
        'skip': ['backward', 'exec']}),
]

test_case = functools.reduce(lambda x, y: x + y, [test_case_cell_ops])
# Use -k to select a specific test case, e.g.:
# pytest tests/python/ops/test_ops.py::test_backward -k LayerNorm


test_exec_case = filter(lambda x: 'skip' not in x[1] or
                        'exec' not in x[1]['skip'], test_case)
test_backward_exec_case = filter(lambda x: 'skip' not in x[1] or
                                 ('backward' not in x[1]['skip'] and
                                  'backward_exec' not in x[1]['skip']), test_case)
test_check_gradient_case = filter(lambda x: 'skip' not in x[1] or
                                  ('backward' not in x[1]['skip'] and
                                   'backward_exec' not in x[1]['skip']), test_case)
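
# Hedged illustration of the 'skip' filtering above: a predicate equivalent to
# the test_exec_case filter. An entry such as 'ClipGradients', whose config
# lists both 'backward' and 'exec' under 'skip', is excluded from every
# collection; entries without a 'skip' key run in all pipelines.
def _example_is_selected_for_exec(case):
    _, config = case
    return 'skip' not in config or 'exec' not in config['skip']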

@mindspore_test(pipeline_for_compile_forward_ge_graph_for_case_by_case_config)
def test_exec():
    return test_exec_case

@mindspore_test(pipeline_for_compile_grad_ge_graph_for_case_by_case_config)
def test_backward_exec():
    return test_backward_exec_case