modelscope
/
ModelScope

 
			
							# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MPLUG model configuration """
import os
from typing import Any, Dict, Union

import yaml
from transformers import PretrainedConfig
from transformers.utils import logging

from modelscope.utils.constant import Tasks

logger = logging.get_logger(__name__)


class MPlugConfig(PretrainedConfig):

    model_type = 'mplug'

    def __init__(
            self,
            task=Tasks.visual_question_answering,
            bert_config='config_bert.json',
            image_res=504,
            batch_size_train=128,
            vision_width=1024,
            distill=True,
            clip_name='ViT-L-14',  # ViT-B-16 | ViT-L-14
            batch_size_test=64,
            k_test=128,
            alpha=0.4,
            warm_up=True,
            eos='[SEP]',
            optimizer=None,
            schedular=None,
            min_length=1,
            max_length=10,
            beam_size=5,
            add_ocr=False,
            add_object=False,
            text_encoder='bert-base-uncased',
            text_decoder='bert-base-uncased',
            # clip
            clip_embed_dim=768,
            clip_image_resolution=224,
            clip_vision_layers=24,
            clip_vision_width=1024,
            clip_vision_patch_size=14,
            clip_context_length=77,
            clip_vocab_size=49408,
            clip_transformer_width=768,
            clip_transformer_heads=12,
            clip_transformer_layers=12,
            **kwargs):

        super().__init__(**kwargs)
        self.task = task
        self.bert_config = bert_config
        self.image_res = image_res
        self.batch_size_train = batch_size_train
        self.vision_width = vision_width
        self.distill = distill
        self.clip_name = clip_name
        self.batch_size_test = batch_size_test
        self.k_test = k_test
        self.alpha = alpha
        self.warm_up = warm_up
        self.eos = eos
        self.optimizer = optimizer
        self.schedular = schedular
        self.min_length = min_length
        self.max_length = max_length
        self.beam_size = beam_size
        self.add_ocr = add_ocr
        self.add_object = add_object
        self.text_encoder = text_encoder
        self.text_decoder = text_decoder
        # clip
        self.clip_embed_dim = clip_embed_dim
        self.clip_image_resolution = clip_image_resolution
        self.clip_vision_layers = clip_vision_layers
        self.clip_vision_width = clip_vision_width
        self.clip_vision_patch_size = clip_vision_patch_size
        self.clip_context_length = clip_context_length
        self.clip_vocab_size = clip_vocab_size
        self.clip_transformer_width = clip_transformer_width
        self.clip_transformer_heads = clip_transformer_heads
        self.clip_transformer_layers = clip_transformer_layers

    @classmethod
    def from_yaml_file(cls, yaml_file: Union[str,
                                             os.PathLike]) -> Dict[str, Any]:
        with open(yaml_file, 'r') as reader:
            config_dict = yaml.load(reader, Loader=yaml.Loader)
        return cls(**config_dict)