```diff
@@ -1,6 +1,6 @@
 # Modified by Zhipu.AI
 # Original Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union

 from modelscope.utils.import_utils import LazyImportModule
```
```diff
@@ -1,8 +1,8 @@
+# Copyright (c) 2022 Zhipu.AI
 import math

 import torch
 import torch.nn.functional as F
-from torch.nn.parameter import Parameter


 def fast_gelu(x):
```
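Only `fast_gelu`'s signature survives into this hunk. For reference, a widely used fast-GELU form is the tanh approximation below; whether this file implements exactly this variant is not visible in the diff, so treat it as a sketch:

```python
import math

import torch


def fast_gelu(x: torch.Tensor) -> torch.Tensor:
    # tanh approximation of GELU: cheaper than the exact erf-based form
    # and accurate to roughly 1e-3 over typical activation ranges.
    return 0.5 * x * (1.0 + torch.tanh(
        math.sqrt(2.0 / math.pi) * x * (1.0 + 0.044715 * x * x)))
```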
```diff
@@ -1,20 +1,15 @@
 # Copyright (c) 2022 Zhipu.AI
 import copy
-import os
-import random
-import time
-from typing import Dict
+from typing import Any, Dict

-import numpy as np
 import torch
-from IPython import embed

 from modelscope.metainfo import Models
-from modelscope.models.base import Tensor, TorchModel
+from modelscope.models.base import TorchModel
 from modelscope.models.builder import MODELS
 from modelscope.outputs import OutputKeys
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
 from .codegeex import CodeGeeXModel
 from .inference import get_token_stream
 from .tokenizer import CodeGeeXTokenizer
```
```diff
@@ -45,18 +40,18 @@ class CodeGeeXForCodeTranslation(TorchModel):
             model_dir (str): the model path.
         """
         super().__init__(model_dir, *args, **kwargs)
+        logger = get_logger()
         # loading tokenizer
-        print('Loading tokenizer ...')
+        logger.info('Loading tokenizer ...')
         self.tokenizer = CodeGeeXTokenizer(
             tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b')
         # loading model
         state_dict_path = model_dir + '/ckpt_ms_translation_0817.pt'
-        print('Loading state dict ...')
+        logger.info('Loading state dict ...')
         state_dict = torch.load(state_dict_path, map_location='cpu')
         state_dict = state_dict['module']
-        print('Building CodeGeeX model ...')
+        logger.info('Building CodeGeeX model ...')
         self.model = model_provider()
         self.model.load_state_dict(state_dict)
         self.model.eval()
```
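The constructor now routes progress messages through `get_logger()`, modelscope's shared `logging.Logger`, so they honor configured handlers and log levels instead of writing straight to stdout. Note also that `map_location='cpu'` deserializes the 13B checkpoint into host memory (the pipeline moves the model to GPU afterwards), and that the weights sit under a `'module'` key, as checkpoints saved from a wrapped training module typically do. A condensed sketch of the same pattern, with a hypothetical checkpoint path:

```python
import torch

from modelscope.utils.logger import get_logger

logger = get_logger()
logger.info('Loading state dict ...')
# Keep the checkpoint off the GPU while deserializing; move the model later.
state_dict = torch.load('/path/to/ckpt.pt', map_location='cpu')  # hypothetical path
weights = state_dict['module']  # weights nested under 'module', as in this checkpoint
```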
```diff
@@ -68,21 +63,16 @@
         seq_length = 2048
         out_seq_length = 256
         bad_ids = None
-        print('Generating ...')
         src_lang = input['source language']
         dst_lang = input['target language']
         prompt = input['prompt']
         prompt = f'code translation\n{src_lang}:\n{prompt}\n{dst_lang}:\n'
-        t0 = time.perf_counter()
+        logger = get_logger()
         tokenizer = self.tokenizer
         model = self.model
         for prompt in [prompt]:
             tokens = tokenizer.encode_code(prompt)
-            print(tokens)
-            print('Current prompt:')
-            print(prompt)
             n_token_prompt = len(tokens)
-            print('N_token_prompt:', n_token_prompt)
             token_stream = get_token_stream(
                 model,
                 tokenizer,
```
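The f-string above is the prompt format the translation checkpoint expects: a task tag, the source-language label and code, then the target-language label the model is asked to continue. Rendered with hypothetical languages and the prompt used in the unit test further down:

```python
src_lang, dst_lang = 'Python', 'C++'  # hypothetical choice of languages
code = 'for i in range(10):\n\tprint(i)\n'
prompt = f'code translation\n{src_lang}:\n{code}\n{dst_lang}:\n'
# The rendered prompt:
# code translation
# Python:
# for i in range(10):
# 	print(i)
#
# C++:
```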
```diff
@@ -108,19 +98,10 @@ class CodeGeeXForCodeTranslation(TorchModel):
             generated_code = tokenizer.decode_code(
                 generated_tokens_[n_token_prompt:])
             generated_code = ''.join(generated_code)
-            t1 = time.perf_counter()
-            print('Total generation time:', t1 - t0, '# Tokens:',
-                  len(generated_tokens_) - n_token_prompt)
-            print(
-                f'{(t1 - t0) / (len(generated_tokens_) - n_token_prompt)}s/token'
-            )
-            print(
-                '================================= Generated code:'
-            )
-            print(generated_code)
-            t0 = time.perf_counter()
+            logger.info('================================= Generated code:')
+            logger.info(generated_code)
             if all(is_finished):
                 break
-        print('Generation finished.')
+        logger.info('Generation finished.')
         return {OutputKeys.TEXT: generated_code}
```
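With the `time.perf_counter()` instrumentation removed from the model, throughput can still be measured at the call site when needed. A minimal sketch, assuming a loaded `model` and `tokenizer` as above:

```python
import time

t0 = time.perf_counter()
result = model(inputs)  # CodeGeeXForCodeTranslation forward, as in the pipeline
elapsed = time.perf_counter() - t0
# Approximate the new-token count by re-encoding the generated code.
n_new = len(tokenizer.encode_code(result[OutputKeys.TEXT]))
print(f'{elapsed:.2f}s total, ~{elapsed / max(n_new, 1):.3f}s/token')
```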
```diff
@@ -1,12 +1,8 @@
-import copy
-import os
-import time
-import typing
-from dataclasses import dataclass
-import json
+# Copyright (c) 2022 Zhipu.AI
 import torch
 import torch.nn.functional as F
-from typing import List


 def get_ltor_masks_and_position_ids(
```
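`get_ltor_masks_and_position_ids` builds the left-to-right (causal) attention mask and position ids for a batch of token ids; its body is outside this diff. A self-contained sketch of the standard Megatron-style construction (the signature and options here are illustrative, not the file's actual ones):

```python
import torch


def ltor_masks_and_position_ids(tokens: torch.Tensor):
    """Illustrative only: causal mask + positions for a [batch, seq] id tensor."""
    batch, seq = tokens.shape
    # Lower-triangular matrix: position i may attend to positions j <= i.
    mask = torch.tril(torch.ones(seq, seq, device=tokens.device)).view(1, 1, seq, seq)
    attention_mask = mask < 0.5  # Megatron convention: True means "masked out"
    position_ids = torch.arange(seq, device=tokens.device).unsqueeze(0).expand(batch, -1)
    return attention_mask, position_ids
```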
```diff
@@ -128,38 +124,7 @@ def pad_batch(batch, pad_id, seq_length):
             tokens.extend([pad_id] * (seq_length - context_length))
         context_lengths.append(context_length)
     return batch, context_lengths


-def forward_step(
-    model,
-    tokens,
-    seq_length,
-    position_ids,
-    attention_mask,
-    layer_past=None,
-    get_key_value=None,
-    prompt_length=None,
-    context_length=None,
-):
-    # Forward pass through the model.
-    output_tensor = model(
-        tokens,
-        position_ids,
-        attention_mask,
-        layer_past=layer_past,
-        get_key_value=get_key_value,
-        prompt_length=prompt_length,
-        context_length=context_length,
-    )
-
-    if get_key_value:
-        output_tensor, layer_past = output_tensor
-
-    if get_key_value:
-        return output_tensor, layer_past
-    return output_tensor


 def get_token_stream(
     model,
```
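The removed `forward_step` was a thin wrapper over a direct model call, returning `layer_past` alongside the logits when `get_key_value` is set. `pad_batch`, whose tail appears in the context above, right-pads each token list in place to `seq_length` and records the original prompt lengths so generation knows where each prompt ends. A quick usage example with illustrative values:

```python
batch = [[11, 12, 13], [21, 22]]
padded, lengths = pad_batch(batch, pad_id=0, seq_length=5)
# padded  -> [[11, 12, 13, 0, 0], [21, 22, 0, 0, 0]]  (lists are extended in place)
# lengths -> [3, 2]
```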
```diff
@@ -1,8 +1,8 @@
-import typing
+# Copyright (c) 2022 Zhipu.AI
 import torch
 from transformers import AutoTokenizer
 from transformers.models.gpt2 import GPT2TokenizerFast
+from typing import List, Union


 def encode_whitespaces(text, start_extra_id: int, max_len: int):
```
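Only the signature of `encode_whitespaces` is visible here. In the upstream CodeGeeX tokenizer, this helper collapses runs of spaces into the extra tokens that GPT-2/GPT-J vocabularies reserve, which is what makes deeply indented code cheap to encode. A sketch consistent with this signature (the exact token string is an assumption):

```python
def encode_whitespaces(text: str, start_extra_id: int, max_len: int) -> str:
    # Replace runs of 2..max_len spaces, longest first, so a run of four
    # spaces becomes one token rather than two 2-space tokens.
    for n in range(max_len, 1, -1):
        token = f'<|extratoken_{start_extra_id + n - 2}|>'  # assumed token format
        text = text.replace(' ' * n, token)
    return text


# encode_whitespaces('if x:\n    pass', start_extra_id=10, max_len=10)
# -> 'if x:\n<|extratoken_12|>pass'
```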
```diff
@@ -1,13 +1,12 @@
 # Copyright (c) 2022 Zhipu.AI
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Union

 from modelscope.metainfo import Pipelines
-from modelscope.models.base import Model
 from modelscope.models.nlp import CodeGeeXForCodeTranslation
-from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import CodeGeeXPreprocessor, Preprocessor
+from modelscope.preprocessors import Preprocessor
 from modelscope.utils.constant import Tasks
```
```diff
@@ -27,16 +26,18 @@ class CodeGeeXCodeTranslationPipeline(Pipeline):
         self.model.eval()
         self.model.half()
         self.model.cuda()
-        if preprocessor is None:
-            preprocessor = CodeGeeXPreprocessor()
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        super().__init__(model=model, **kwargs)
+
+    def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]:
+        return inputs

     # define the forward pass
     def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]:
         # check input format
         for para in ['prompt', 'source language', 'target language']:
             if para not in inputs:
-                return ('please check your input format.')
+                raise Exception('please check your input format.')
         return self.model(inputs)

     # format the outputs from pipeline
```
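End to end, the pipeline is now built without a preprocessor: inputs pass straight through `preprocess` to `forward`, which validates the three required keys. Usage matching the unit test below (the language values are illustrative, since only the prompt appears in this extract):

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipe = pipeline(
    task=Tasks.code_translation,
    model='ZhipuAI/CodeGeeX-Code-Translation-13B')
result = pipe({
    'prompt': 'for i in range(10):\n\tprint(i)\n',
    'source language': 'Python',  # illustrative
    'target language': 'C++',  # illustrative
})
print(result[OutputKeys.TEXT])
```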
```diff
@@ -30,7 +30,6 @@ if TYPE_CHECKING:
     from .space_T_en import ConversationalTextToSqlPreprocessor
     from .space_T_cn import TableQuestionAnsweringPreprocessor
     from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor
-    from .codegeex_preprocessor import CodeGeeXPreprocessor
 else:
     _import_structure = {
         'nlp_base': [
@@ -65,7 +64,6 @@ else:
             'TextErrorCorrectionPreprocessor',
         ],
         'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'],
-        'codegeex_preprocessor': ['CodeGeeXPreprocessor'],
         'token_classification_thai_preprocessor': [
             'NERPreprocessorThai',
             'WordSegmentationPreprocessorThai',
```
```diff
@@ -1,25 +0,0 @@
-# Copyright (c) 2022 Zhipu.AI
-import re
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
-
-from modelscope.metainfo import Models, Preprocessors
-from modelscope.preprocessors.base import Preprocessor
-from modelscope.preprocessors.builder import PREPROCESSORS
-from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile
-from modelscope.utils.type_assert import type_assert
-
-
-@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.codegeex)
-class CodeGeeXPreprocessor(Preprocessor):
-
-    def __init__(self, *args, **kwargs):
-        """preprocess the data
-
-        Args:
-            model_dir (str): model path
-        """
-        super().__init__(*args, **kwargs)
-
-    @type_assert(object, (str, tuple, Dict))
-    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
-        return data
```
```diff
@@ -2,9 +2,7 @@
 import os
 import unittest

-from modelscope.models import Model
 from modelscope.pipelines import pipeline
-from modelscope.preprocessors import CodeGeeXPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
 from modelscope.utils.test_utils import test_level
```
```diff
@@ -19,11 +17,9 @@ class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck):
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_CodeGeeX_with_name(self):
         model = 'ZhipuAI/CodeGeeX-Code-Translation-13B'
-        preprocessor = CodeGeeXPreprocessor()
         pipe = pipeline(
             task=Tasks.code_translation,
-            model=model,
-            preprocessor=preprocessor,
+            model=model
         )
         inputs = {
             'prompt': 'for i in range(10):\n\tprint(i)\n',
```