# Copyright 2022 OFA-Sys Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OFA."""
import collections
import os
from typing import List, Optional, Tuple

from transformers import PreTrainedTokenizer
from transformers.models.bart.tokenization_bart import BartTokenizer
from transformers.models.bert.tokenization_bert import (BasicTokenizer,
                                                         WordpieceTokenizer)
from transformers.utils import logging

from modelscope.utils.constant import ModelFile

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file': {
        'ofa-base': 'https://huggingface.co/ofa-base/resolve/main/vocab.json',
    },
    'merges_file': {
        'ofa-base': 'https://huggingface.co/ofa-base/resolve/main/merges.txt',
    },
    # OFA models are implemented to be compatible with both huggingface
    # and modelscope frameworks. For all OFA models available on huggingface,
    # please refer to https://huggingface.co/models?filter=ofa
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'ofa-base': 1024,
}

VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP_ZH = {
    'vocab_file': {
        'bert-base-chinese':
        'https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt',
    }
    # OFA models are implemented to be compatible with both huggingface
    # and modelscope frameworks. For all OFA models available on huggingface,
    # please refer to https://huggingface.co/models?filter=ofa
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES_ZH = {
    'ofa-base': 1024,
}

PRETRAINED_INIT_CONFIGURATION_ZH = {
    'bert-base-chinese': {
        'do_lower_case': True
    },
}


class OFATokenizer(BartTokenizer):
    """
    Construct an OFA tokenizer.

    [`~OFATokenizer`] is identical to [`BartTokenizer`] and runs end-to-end tokenization using byte-level
    Byte-Pair Encoding.

    Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES


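# A minimal usage sketch for OFATokenizer, kept as a comment so the module has
# no import-time side effects. It assumes a local checkpoint directory that
# already contains `vocab.json` and `merges.txt`; the path below is a
# placeholder, not a published model id.
#
#     tokenizer = OFATokenizer.from_pretrained('/path/to/ofa-base')
#     inputs = tokenizer([' what does the image describe?'])
#     text = tokenizer.batch_decode(inputs['input_ids'], skip_special_tokens=True)

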
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, 'r', encoding='utf-8') as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip('\n')
        vocab[token] = index
    return vocab
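
# For reference, `load_vocab` expects a plain-text vocabulary with one token
# per line; the 0-based line number becomes the token id. An illustrative
# three-line file (hypothetical content, not the shipped vocabulary):
#
#     <pad>
#     <unk>
#     你好
#
# would load as {'<pad>': 0, '<unk>': 1, '你好': 2}.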


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class OFATokenizerZH(PreTrainedTokenizer):
    r"""
    Construct an OFA tokenizer for Chinese text, based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """

    vocab_files_names = VOCAB_FILES_NAMES_ZH
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_ZH
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION_ZH
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES_ZH

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 never_split=None,
                 bos_token='<s>',
                 eos_token='</s>',
                 sep_token='</s>',
                 cls_token='<s>',
                 unk_token='<unk>',
                 pad_token='<pad>',
                 mask_token='<mask>',
                 tokenize_chinese_chars=True,
                 strip_accents=None,
                 **kwargs):
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a pretrained "
                'model, use `tokenizer = OFATokenizerZH.from_pretrained(PRETRAINED_MODEL_NAME)`.'
            )
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=self.unk_token)

    @property
    def do_lower_case(self):
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(
                    text, never_split=self.all_special_tokens):

                # If the token is part of the never_split set
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

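    # Illustrative `_tokenize` flow (the exact WordPiece split depends on the
    # loaded vocabulary, so the pieces below are hypothetical):
    #   input : '今天weather不错'
    #   basic : ['今', '天', 'weather', '不', '错']  (CJK chars split out)
    #   piece : ['今', '天', 'wea', '##ther', '不', '错']
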
    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
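
    # Worked example with hypothetical ids, writing C for `cls_token_id` and
    # S for `sep_token_id`:
    #   token_ids_0=[8, 9]                  -> [C, 8, 9, S]
    #   token_ids_0=[8, 9], token_ids_1=[5] -> [C, 8, 9, S, 5, S]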

    def get_special_tokens_mask(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None,
            already_has_special_tokens: bool = False) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True)

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + (
                [0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
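
    # Worked example: for len(token_ids_0) == 2 the mask is [1, 0, 0, 1];
    # adding a token_ids_1 of length 3 gives [1, 0, 0, 1, 0, 0, 0, 1],
    # where 1 marks the positions of the added cls/sep tokens.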

    def create_token_type_ids_from_sequences(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
                                                        + sep) * [1]
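
    # Worked example: for len(token_ids_0) == 2 and len(token_ids_1) == 3 the
    # result is [0, 0, 0, 0] + [1, 1, 1, 1] = [0, 0, 0, 0, 1, 1, 1, 1]
    # (cls + first sequence + sep -> 0s, second sequence + sep -> 1s).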

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory,
                (filename_prefix + '-' if filename_prefix else '')
                + VOCAB_FILES_NAMES_ZH['vocab_file'])
        else:
            vocab_file = (filename_prefix
                          + '-' if filename_prefix else '') + save_directory
        with open(vocab_file, 'w', encoding='utf-8') as writer:
            for token, token_index in sorted(
                    self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f'Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive.'
                        ' Please check that the vocabulary is not corrupted!')
                    index = token_index
                writer.write(token + '\n')
                index += 1
        return (vocab_file, )
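

# A minimal usage sketch for OFATokenizerZH, kept as a comment so the module
# has no import-time side effects. It assumes a local directory that already
# contains the WordPiece vocabulary file named by ModelFile.VOCAB_FILE; the
# path below is a placeholder, not a published model id.
#
#     tokenizer = OFATokenizerZH.from_pretrained('/path/to/ofa-zh-checkpoint')
#     ids = tokenizer('图片里有什么？')['input_ids']
#     tokens = tokenizer.convert_ids_to_tokens(ids)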