Merge master internal 1118master
@@ -1,6 +1,3 @@
-echo "Testing envs"
-printenv
-echo "ENV END"
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
   pip install -r requirements/tests.txt
   git config --global --add safe.directory /Maas-lib
@@ -28,7 +25,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 pip install -r requirements/tests.txt
 # test with install
 python setup.py install
 else
@@ -3,30 +3,32 @@ MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
 CODE_DIR=$PWD
 CODE_DIR_IN_CONTAINER=/Maas-lib
 echo "$USER"
-gpus='7 6 5 4 3 2 1 0'
-cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
+gpus='0,1 2,3 4,5 6,7'
+cpu_sets='45-58 31-44 16-30 0-15'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
+# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
+CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml'
 echo "ci command: $CI_COMMAND"
+idx=0
 for gpu in $gpus
 do
   exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
-  flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; continue; }
+  flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
   echo "get gpu lock $gpu"
-  CONTAINER_NAME="modelscope-ci-$gpu"
+  CONTAINER_NAME="modelscope-ci-$idx"
   let is_get_file_lock=true
   # pull image if there are updates
   docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
   if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     echo 'debugging'
     docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
-      --cpuset-cpus=${cpu_sets_arr[$gpu]} \
-      --gpus="device=$gpu" \
+      --cpuset-cpus=${cpu_sets_arr[$idx]} \
+      --gpus='"'"device=$gpu"'"' \
       -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
       -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
-      -v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+      -v $MODELSCOPE_HOME_CACHE/$idx:/root \
       -v /home/admin/pre-commit:/home/admin/pre-commit \
       -e CI_TEST=True \
       -e TEST_LEVEL=$TEST_LEVEL \
@@ -41,16 +43,15 @@ do
       -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
       -e MODEL_TAG_URL=$MODEL_TAG_URL \
       --workdir=$CODE_DIR_IN_CONTAINER \
-      --net host \
       ${IMAGE_NAME}:${IMAGE_VERSION} \
       $CI_COMMAND
   else
     docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
-      --cpuset-cpus=${cpu_sets_arr[$gpu]} \
-      --gpus="device=$gpu" \
+      --cpuset-cpus=${cpu_sets_arr[$idx]} \
+      --gpus='"'"device=$gpu"'"' \
       -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
       -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
-      -v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+      -v $MODELSCOPE_HOME_CACHE/$idx:/root \
       -v /home/admin/pre-commit:/home/admin/pre-commit \
       -e CI_TEST=True \
       -e TEST_LEVEL=$TEST_LEVEL \
@@ -64,7 +65,6 @@ do
       -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
       -e MODEL_TAG_URL=$MODEL_TAG_URL \
       --workdir=$CODE_DIR_IN_CONTAINER \
-      --net host \
       ${IMAGE_NAME}:${IMAGE_VERSION} \
       $CI_COMMAND
   fi
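The scheduler change above trades eight single-GPU workers for four two-GPU groups: `$gpu` now names a device pair (and its `/tmp/gpu$gpu` lock file), while the new `idx` counter selects the container name, cpuset slice and per-worker cache directory, and is advanced even when a lock is busy so the slots stay aligned. The `--gpus='"'"device=$gpu"'"'` quoting wraps the value in literal double quotes so Docker receives `--gpus "device=0,1"` as one device list despite the comma. A minimal Python sketch of the same non-blocking lock idea (the lock-file names mirror the script's; everything else is illustrative):

    import fcntl

    def try_lock_gpu_group(group: str):
        """Non-blocking exclusive lock on /tmp/gpu<group>, like `flock -n`."""
        f = open(f'/tmp/gpu{group}', 'w')
        try:
            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return f  # keep the handle open to hold the lock
        except BlockingIOError:
            f.close()
            return None  # another CI run owns this group

    for group in ['0,1', '2,3', '4,5', '6,7']:
        if try_lock_gpu_group(group):
            print(f'acquired GPU group {group}')
            break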
@@ -1,5 +1,5 @@
 repos:
-  - repo: https://github.com/PyCQA/flake8
+  - repo: https://github.com/pycqa/flake8.git
    rev: 4.0.0
    hooks:
      - id: flake8
@@ -23,9 +23,10 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_MESSAGE,
                                       API_RESPONSE_FIELD_USERNAME,
                                       DEFAULT_CREDENTIALS_PATH,
-                                      MODELSCOPE_ENVIRONMENT,
-                                      MODELSCOPE_USERNAME, ONE_YEAR_SECONDS,
-                                      Licenses, ModelVisibility)
+                                      MODELSCOPE_CLOUD_ENVIRONMENT,
+                                      MODELSCOPE_CLOUD_USERNAME,
+                                      ONE_YEAR_SECONDS, Licenses,
+                                      ModelVisibility)
 from modelscope.hub.errors import (InvalidParameter, NotExistError,
                                    NotLoginException, NoValidRevisionError,
                                    RequestError, datahub_raise_on_error,
@@ -653,10 +654,10 @@ class HubApi:
         # get channel and user_name
         channel = DownloadChannel.LOCAL.value
         user_name = ''
-        if MODELSCOPE_ENVIRONMENT in os.environ:
-            channel = os.environ[MODELSCOPE_ENVIRONMENT]
-        if MODELSCOPE_USERNAME in os.environ:
-            user_name = os.environ[MODELSCOPE_USERNAME]
+        if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ:
+            channel = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT]
+        if MODELSCOPE_CLOUD_USERNAME in os.environ:
+            user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
         url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}'
         cookies = ModelScopeConfig.get_cookies()
@@ -777,12 +778,15 @@ class ModelScopeConfig:
         Returns:
             The formatted user-agent string.
         """
+        # include some more telemetry when executing in dedicated
+        # cloud containers
         env = 'custom'
-        if MODELSCOPE_ENVIRONMENT in os.environ:
-            env = os.environ[MODELSCOPE_ENVIRONMENT]
+        if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ:
+            env = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT]
         user_name = 'unknown'
-        if MODELSCOPE_USERNAME in os.environ:
-            user_name = os.environ[MODELSCOPE_USERNAME]
+        if MODELSCOPE_CLOUD_USERNAME in os.environ:
+            user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
         ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
             __version__,
@@ -16,9 +16,9 @@ API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken'
 API_RESPONSE_FIELD_USERNAME = 'Username'
 API_RESPONSE_FIELD_EMAIL = 'Email'
 API_RESPONSE_FIELD_MESSAGE = 'Message'
-MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT'
+MODELSCOPE_CLOUD_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT'
+MODELSCOPE_CLOUD_USERNAME = 'MODELSCOPE_USERNAME'
 MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG'
-MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME'
 ONE_YEAR_SECONDS = 24 * 365 * 60 * 60
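Only the Python-side constant names change in this hunk: `MODELSCOPE_CLOUD_ENVIRONMENT` and `MODELSCOPE_CLOUD_USERNAME` still map to the same `MODELSCOPE_ENVIRONMENT`/`MODELSCOPE_USERNAME` environment variables, so existing cloud containers need no reconfiguration. A tiny sketch of the lookup the hub code performs ('local' stands in for `DownloadChannel.LOCAL.value`):

    import os

    MODELSCOPE_CLOUD_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT'  # env var name unchanged

    # mirrors HubApi: fall back to the local channel when the variable is absent
    channel = os.environ.get(MODELSCOPE_CLOUD_ENVIRONMENT, 'local')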
@@ -87,16 +87,3 @@ def file_integrity_validation(file_path, expected_sha256):
         msg = 'File %s integrity check failed, the download may be incomplete, please try again.' % file_path
         logger.error(msg)
         raise FileIntegrityError(msg)
-
-
-def create_library_statistics(method: str, name: str, cn_name: Optional[str]):
-    try:
-        from modelscope.hub.api import ModelScopeConfig
-        path = f'{get_endpoint()}/api/v1/statistics/library'
-        headers = {'user-agent': ModelScopeConfig.get_user_agent()}
-        params = {'Method': method, 'Name': name, 'CnName': cn_name}
-        r = requests.post(path, params=params, headers=headers)
-        r.raise_for_status()
-    except Exception:
-        pass
-    return
@@ -54,7 +54,8 @@ class FSMNSeleNetV2Decorator(TorchModel):
         )

     def __del__(self):
-        self.tmp_dir.cleanup()
+        if hasattr(self, 'tmp_dir'):
+            self.tmp_dir.cleanup()

     def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
         return self.model.forward(input)
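The `hasattr` guard matters because `__del__` runs even when `__init__` raised before `tmp_dir` was assigned; without it, a failed construction is followed by a noisy secondary `AttributeError` during garbage collection. A self-contained sketch of the pattern:

    import tempfile

    class Holder:
        def __init__(self, fail=False):
            if fail:
                raise RuntimeError('failed before tmp_dir was created')
            self.tmp_dir = tempfile.TemporaryDirectory()

        def __del__(self):
            # __del__ may run on a half-constructed object, so guard the attribute
            if hasattr(self, 'tmp_dir'):
                self.tmp_dir.cleanup()

    try:
        Holder(fail=True)
    except RuntimeError:
        pass  # no secondary AttributeError from __del__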
@@ -20,7 +20,6 @@ class MogFaceDetector(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
         torch.set_grad_enabled(False)
-        cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
@@ -21,7 +21,6 @@ class MtcnnFaceDetector(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
         torch.set_grad_enabled(False)
-        cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
@@ -18,7 +18,6 @@ class RetinaFaceDetection(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
         torch.set_grad_enabled(False)
-        cudnn.benchmark = True
         self.model_path = model_path
         self.cfg = Config.from_file(
@@ -24,7 +24,6 @@ class UlfdFaceDetector(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
         torch.set_grad_enabled(False)
-        cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
@@ -24,7 +24,6 @@ class FacialExpressionRecognition(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
         torch.set_grad_enabled(False)
-        cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
@@ -31,7 +31,6 @@ cfg_re50 = {
 class RetinaFaceDetection(object):

     def __init__(self, model_path, device='cuda'):
         torch.set_grad_enabled(False)
-        cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
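`cudnn.benchmark` is process-global state: flipping it on inside one detector's `__init__` switches every model in the same process to cuDNN autotuning, which re-benchmarks convolution algorithms on each new input shape and can slow variable-shape pipelines. Dropping the per-model assignments, as the six hunks above do, leaves the decision to the application:

    import torch

    # opt in once, globally, and only when input shapes stay fixed for the run
    torch.backends.cudnn.benchmark = True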
@@ -7,6 +7,7 @@ import time
 import cv2
 import json
 import numpy as np
+import torch
 from tqdm import tqdm
@@ -87,13 +88,17 @@ class RealtimeVideoDetector(TorchModel):
             self.nmsthre,
             class_agnostic=True)
-        if len(outputs) == 1:
+        if len(outputs) == 1 and (outputs[0] is not None):
             bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
             scores = outputs[0][:, 5].cpu().numpy()
             labels = outputs[0][:, 6].cpu().int().numpy()
             pred_label_names = []
             for lab in labels:
                 pred_label_names.append(self.label_mapping[lab])
+        else:
+            bboxes = np.asarray([])
+            scores = np.asarray([])
+            pred_label_names = np.asarray([])

         return bboxes, scores, pred_label_names
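The added `outputs[0] is not None` guard covers frames where NMS discards every box (the postprocess step hands back `[None]` in that case), and the new `else` branch keeps the return arity and types stable so callers can iterate empty results. A hedged sketch of the resulting contract, with `ratio` and `label_mapping` assumed from the surrounding class:

    import numpy as np

    def unpack_detections(outputs, ratio, label_mapping):
        # outputs is the NMS result list; [None] means no box survived
        if len(outputs) == 1 and outputs[0] is not None:
            det = outputs[0]
            bboxes = det[:, 0:4].cpu().numpy() / ratio
            scores = det[:, 5].cpu().numpy()
            names = [label_mapping[int(i)] for i in det[:, 6].cpu().int().numpy()]
        else:
            bboxes, scores, names = np.asarray([]), np.asarray([]), []
        return bboxes, scores, names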
@@ -31,7 +31,10 @@ class ReferringVideoObjectSegmentation(TorchModel):
         config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
         self.cfg = Config.from_file(config_path)
-        self.model = MTTR(**self.cfg.model)
+        transformer_cfg_dir = osp.join(model_dir, 'transformer_cfg_dir')
+        self.model = MTTR(
+            transformer_cfg_dir=transformer_cfg_dir, **self.cfg.model)

         model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
         params_dict = torch.load(model_path, map_location='cpu')
@@ -19,6 +19,7 @@ class MTTR(nn.Module):
                  num_queries,
                  mask_kernels_dim=8,
                  aux_loss=False,
+                 transformer_cfg_dir=None,
                  **kwargs):
         """
         Parameters:
@@ -29,7 +30,9 @@ class MTTR(nn.Module):
         """
         super().__init__()
         self.backbone = init_backbone(**kwargs)
-        self.transformer = MultimodalTransformer(**kwargs)
+        assert transformer_cfg_dir is not None
+        self.transformer = MultimodalTransformer(
+            transformer_cfg_dir=transformer_cfg_dir, **kwargs)
         d_model = self.transformer.d_model
         self.is_referred_head = nn.Linear(
             d_model,
@@ -26,6 +26,7 @@ class MultimodalTransformer(nn.Module):
                  num_decoder_layers=3,
                  text_encoder_type='roberta-base',
                  freeze_text_encoder=True,
+                 transformer_cfg_dir=None,
                  **kwargs):
         super().__init__()
         self.d_model = kwargs['d_model']
@@ -40,10 +41,12 @@ class MultimodalTransformer(nn.Module):
         self.pos_encoder_2d = PositionEmbeddingSine2D()
         self._reset_parameters()

-        self.text_encoder = RobertaModel.from_pretrained(text_encoder_type)
+        if text_encoder_type != 'roberta-base':
+            transformer_cfg_dir = text_encoder_type
+        self.text_encoder = RobertaModel.from_pretrained(transformer_cfg_dir)
         self.text_encoder.pooler = None  # this pooler is never used, this is a hack to avoid DDP problems...
         self.tokenizer = RobertaTokenizerFast.from_pretrained(
-            text_encoder_type)
+            transformer_cfg_dir)
         self.freeze_text_encoder = freeze_text_encoder
         if freeze_text_encoder:
             for p in self.text_encoder.parameters():
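Threading `transformer_cfg_dir` from `ReferringVideoObjectSegmentation` through `MTTR` into `MultimodalTransformer` lets the text encoder load from files shipped inside the ModelScope model directory instead of pulling `roberta-base` from the Hugging Face hub at construction time (the `text_encoder_type != 'roberta-base'` branch preserves the old override). A sketch of the offline load, assuming `transformer_cfg_dir` holds a full RoBERTa snapshot (config, tokenizer files, weights); the path is hypothetical:

    from transformers import RobertaModel, RobertaTokenizerFast

    transformer_cfg_dir = '/path/to/model_dir/transformer_cfg_dir'  # hypothetical
    text_encoder = RobertaModel.from_pretrained(transformer_cfg_dir)
    tokenizer = RobertaTokenizerFast.from_pretrained(transformer_cfg_dir)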
@@ -188,11 +188,13 @@ class Worker(threading.Thread):
 class KWSDataLoader:
-    """
-    dataset: the dataset reference
-    batchsize: data batch size
-    numworkers: no. of workers
-    prefetch: prefetch factor
+    """ Load and organize audio data with multiple threads
+
+    Args:
+        dataset: the dataset reference
+        batchsize: data batch size
+        numworkers: no. of workers
+        prefetch: prefetch factor
     """

     def __init__(self, dataset, batchsize, numworkers, prefetch=2):
@@ -202,7 +204,7 @@ class KWSDataLoader:
         self.isrun = True

         # data queue
-        self.pool = queue.Queue(batchsize * prefetch)
+        self.pool = queue.Queue(numworkers * prefetch)

         # initialize workers
         self.workerlist = []
@@ -270,11 +272,11 @@ class KWSDataLoader:
             w.stopWorker()

         while not self.pool.empty():
-            self.pool.get(block=True, timeout=0.001)
+            self.pool.get(block=True, timeout=0.01)

         # wait workers terminated
         for w in self.workerlist:
             while not self.pool.empty():
-                self.pool.get(block=True, timeout=0.001)
+                self.pool.get(block=True, timeout=0.01)
             w.join()

         logger.info('KWSDataLoader: All worker stopped.')
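Two small capacity and shutdown tweaks: the shared pool is now bounded by `numworkers * prefetch`, the number of batches that can actually be in flight (one prefetch window per worker) rather than by `batchsize * prefetch`, and the drain timeout grows from 1 ms to 10 ms so slow workers get a realistic window to hand over their last items before `join()`. A sketch of the sizing rule:

    import queue

    numworkers, prefetch, batchsize = 4, 2, 256
    # bound the queue by producer capacity, not by sample count: with
    # batchsize * prefetch the pool could hold 512 batches instead of 8
    pool = queue.Queue(maxsize=numworkers * prefetch)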
@@ -10,7 +10,6 @@ from typing import Any, Dict, Generator, List, Mapping, Union
 import numpy as np

-from modelscope.hub.utils.utils import create_library_statistics
 from modelscope.models.base import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.outputs import TASK_OUTPUTS
@@ -152,9 +151,6 @@ class Pipeline(ABC):
                  **kwargs) -> Union[Dict[str, Any], Generator]:
         # model provider should leave it as it is
         # modelscope library developer will handle this function
-        for single_model in self.models:
-            if hasattr(single_model, 'name'):
-                create_library_statistics('pipeline', single_model.name, None)

         # place model to cpu or gpu
         if (self.model or (self.has_multiple_models and self.models[0])):
             if not self._model_prepare:
@@ -92,6 +92,8 @@ class NamedEntityRecognitionPipeline(Pipeline):
         offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
         labels = [self.id2label[x] for x in predictions]
+        if len(labels) > len(offset_mapping):
+            labels = labels[1:-1]
         chunks = []
         chunk = {}
         for label, offsets in zip(labels, offset_mapping):
@@ -104,6 +106,20 @@ class NamedEntityRecognitionPipeline(Pipeline):
                     'start': offsets[0],
                     'end': offsets[1]
                 }
+            if label[0] in 'I':
+                if not chunk:
+                    chunk = {
+                        'type': label[2:],
+                        'start': offsets[0],
+                        'end': offsets[1]
+                    }
+            if label[0] in 'E':
+                if not chunk:
+                    chunk = {
+                        'type': label[2:],
+                        'start': offsets[0],
+                        'end': offsets[1]
+                    }
             if label[0] in 'IES':
                 if chunk:
                     chunk['end'] = offsets[1]
@@ -118,15 +134,15 @@ class NamedEntityRecognitionPipeline(Pipeline):
                 chunk['span'] = text[chunk['start']:chunk['end']]
                 chunks.append(chunk)

-        # for cws output
+        # for cws outputs
         if len(chunks) > 0 and chunks[0]['type'] == 'cws':
             spans = [
                 chunk['span'] for chunk in chunks if chunk['span'].strip()
             ]
             seg_result = ' '.join(spans)
-            outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
+            outputs = {OutputKeys.OUTPUT: seg_result}

-        # for ner outpus
+        # for ner outputs
         else:
             outputs = {OutputKeys.OUTPUT: chunks}
         return outputs
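The added `I`/`E` branches, together with trimming `labels` when it outruns `offset_mapping` (dropping the special-token positions), make the chunk decoder tolerant of sequences that begin mid-entity, for example after truncation: an `I-` or `E-` label with no open chunk now starts one instead of being dropped. A condensed sketch of the resulting BIOES decoding rule (hypothetical helper; the pipelines keep the expanded form shown above):

    def decode_bioes(labels, offsets):
        chunks, chunk = [], {}
        for label, (start, end) in zip(labels, offsets):
            if label[0] in 'BS' or (label[0] in 'IE' and not chunk):
                chunk = {'type': label[2:], 'start': start, 'end': end}
            if label[0] in 'IES' and chunk:
                chunk['end'] = end
            if label[0] in 'ES' and chunk:
                chunks.append(chunk)
                chunk = {}
        return chunks

    # 'E-LOC' at position 0 now opens (and closes) a chunk instead of vanishing
    print(decode_bioes(['E-LOC', 'O', 'B-PER', 'E-PER'],
                       [(0, 2), (2, 3), (3, 5), (5, 7)]))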
@@ -95,6 +95,20 @@ class TokenClassificationPipeline(Pipeline):
                     'start': offsets[0],
                     'end': offsets[1]
                 }
+            if label[0] in 'I':
+                if not chunk:
+                    chunk = {
+                        'type': label[2:],
+                        'start': offsets[0],
+                        'end': offsets[1]
+                    }
+            if label[0] in 'E':
+                if not chunk:
+                    chunk = {
+                        'type': label[2:],
+                        'start': offsets[0],
+                        'end': offsets[1]
+                    }
             if label[0] in 'IES':
                 if chunk:
                     chunk['end'] = offsets[1]
@@ -80,9 +80,12 @@ class WordSegmentationPipeline(Pipeline):
             Dict[str, str]: the prediction results
         """
         text = inputs['text']
-        logits = inputs[OutputKeys.LOGITS]
-        predictions = torch.argmax(logits[0], dim=-1)
-        logits = torch_nested_numpify(torch_nested_detach(logits))
+        if not hasattr(inputs, 'predictions'):
+            logits = inputs[OutputKeys.LOGITS]
+            predictions = torch.argmax(logits[0], dim=-1)
+        else:
+            predictions = inputs[OutputKeys.PREDICTIONS].squeeze(
+                0).cpu().numpy()
+        predictions = torch_nested_numpify(torch_nested_detach(predictions))
         offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
@@ -101,6 +104,20 @@ class WordSegmentationPipeline(Pipeline):
                     'start': offsets[0],
                     'end': offsets[1]
                 }
+            if label[0] in 'I':
+                if not chunk:
+                    chunk = {
+                        'type': label[2:],
+                        'start': offsets[0],
+                        'end': offsets[1]
+                    }
+            if label[0] in 'E':
+                if not chunk:
+                    chunk = {
+                        'type': label[2:],
+                        'start': offsets[0],
+                        'end': offsets[1]
+                    }
             if label[0] in 'IES':
                 if chunk:
                     chunk['end'] = offsets[1]
@@ -123,7 +140,7 @@ class WordSegmentationPipeline(Pipeline):
             seg_result = ' '.join(spans)
             outputs = {OutputKeys.OUTPUT: seg_result}

-        # for ner output
+        # for ner outputs
         else:
             outputs = {OutputKeys.OUTPUT: chunks}
         return outputs
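The word-segmentation postprocess now prefers decoded `predictions` when the model supplies them (for instance from a CRF-style decoder; that use case is my assumption) and only falls back to `argmax` over the logits. A sketch of the fallback, assuming `inputs` behaves like the pipeline's attribute-style dict and the keys mirror `OutputKeys`:

    import torch

    def pick_predictions(inputs):
        # prefer ready-made predictions; otherwise reduce the logits ourselves
        if hasattr(inputs, 'predictions'):
            return inputs['predictions'].squeeze(0).cpu().numpy()
        return torch.argmax(inputs['logits'][0], dim=-1)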
@@ -117,8 +117,7 @@ class KWSFarfieldTrainer(BaseTrainer):
         self._batch_size = dataloader_config.batch_size_per_gpu
         if 'model_bin' in kwargs:
             model_bin_file = os.path.join(self.model_dir, kwargs['model_bin'])
-            checkpoint = torch.load(model_bin_file)
-            self.model.load_state_dict(checkpoint)
+            self.model = torch.load(model_bin_file)
         # build corresponding optimizer and loss function
         lr = self.cfg.train.optimizer.lr
         self.optimizer = optim.Adam(self.model.parameters(), lr)
@@ -219,7 +218,9 @@ class KWSFarfieldTrainer(BaseTrainer):
             # check point
             ckpt_name = 'checkpoint_{:04d}_loss_train_{:.4f}_loss_val_{:.4f}.pth'.format(
                 self._current_epoch, loss_train_epoch, loss_val_epoch)
-            torch.save(self.model, os.path.join(self.work_dir, ckpt_name))
+            save_path = os.path.join(self.work_dir, ckpt_name)
+            logger.info(f'Save model to {save_path}')
+            torch.save(self.model, save_path)
             # time spent per epoch
             epochtime = datetime.datetime.now() - epochtime
             logger.info('Epoch {:04d} time spent: {:.2f} hours'.format(
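With these two hunks the far-field KWS trainer both saves and restores entire module objects rather than `state_dict`s, so `model_bin` checkpoints produced by `torch.save(self.model, ...)` round-trip without re-instantiating the network (at the cost of pickling the class, which ties checkpoints to the code layout). A minimal illustration:

    import torch
    import torch.nn as nn

    model = nn.Linear(4, 2)
    torch.save(model, 'checkpoint.pth')      # pickles the whole module
    restored = torch.load('checkpoint.pth')  # returns a ready-to-use nn.Linear
    print(restored.weight.shape)             # torch.Size([2, 4])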
@@ -15,7 +15,6 @@ from torch.utils.data.dataloader import default_collate
 from torch.utils.data.distributed import DistributedSampler

 from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.hub.utils.utils import create_library_statistics
 from modelscope.metainfo import Trainers
 from modelscope.metrics import build_metric, task_default_metrics
 from modelscope.models.base import Model, TorchModel
@@ -437,8 +436,6 @@ class EpochBasedTrainer(BaseTrainer):
     def train(self, checkpoint_path=None, *args, **kwargs):
         self._mode = ModeKeys.TRAIN
-        if hasattr(self.model, 'name'):
-            create_library_statistics('train', self.model.name, None)

         if self.train_dataset is None:
             self.train_dataloader = self.get_train_dataloader()
@@ -459,8 +456,6 @@ class EpochBasedTrainer(BaseTrainer):
         self.train_loop(self.train_dataloader)

     def evaluate(self, checkpoint_path=None):
-        if hasattr(self.model, 'name'):
-            create_library_statistics('evaluate', self.model.name, None)
         if checkpoint_path is not None and os.path.isfile(checkpoint_path):
             from modelscope.trainers.hooks import CheckpointHook
             CheckpointHook.load_checkpoint(checkpoint_path, self)
@@ -43,7 +43,10 @@ def update_conf(origin_config_file, new_config_file, conf_item: [str, str]):
     def repl(matched):
         key = matched.group(1)
         if key in conf_item:
-            return conf_item[key]
+            value = conf_item[key]
+            if not isinstance(value, str):
+                value = str(value)
+            return value
         else:
             return None
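The cast is needed because `update_conf` passes `repl` to `re.sub`, whose replacement callable must return `str`; an int or float pulled from the config dict raises `TypeError: expected str instance`. A small sketch with a hypothetical `${key}` placeholder pattern (the real pattern lives in `update_conf`); note that the surrounding `else: return None` branch would trip the same requirement if an unmatched key ever reached it:

    import re

    conf_item = {'sample_rate': 16000}

    def repl(matched):
        value = conf_item[matched.group(1)]
        if not isinstance(value, str):
            value = str(value)  # re.sub rejects non-str returns from repl
        return value

    print(re.sub(r'\$\{(\w+)\}', repl, 'rate=${sample_rate}'))  # rate=16000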
@@ -3,11 +3,13 @@
 import argparse
 import datetime
 import math
 import multiprocessing
 import os
 import subprocess
 import sys
 import tempfile
 import time
 import unittest
 from fnmatch import fnmatch
 from multiprocessing.managers import BaseManager
@@ -158,6 +160,21 @@ def run_command_with_popen(cmd):
             sys.stdout.write(line)


+def async_run_command_with_popen(cmd, device_id):
+    logger.info('Worker id: %s args: %s' % (device_id, cmd))
+    env = os.environ.copy()
+    env['CUDA_VISIBLE_DEVICES'] = '%s' % device_id
+    sub_process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        bufsize=1,
+        universal_newlines=True,
+        env=env,
+        encoding='utf8')
+    return sub_process
+
+
 def save_test_result(df, args):
     if args.result_dir is not None:
         file_name = str(int(datetime.datetime.now().timestamp() * 1000))
@@ -199,6 +216,108 @@ def install_requirements(requirements):
         run_command(cmd)


+def wait_for_free_worker(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                logger.info('return free worker: %s' % (idx))
+                return idx
+            if worker.poll() is None:  # running, get output
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:  # worker process completed.
+                logger.info('Process end: %s' % (idx))
+                workers[idx] = None
+                return idx
+        time.sleep(0.001)
+
+
+def wait_for_workers(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                continue
+            # check whether the worker has completed.
+            if worker.poll() is None:
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:
+                logger.info('Process idx: %s end!' % (idx))
+                workers[idx] = None
+
+        is_all_completed = True
+        for idx, worker in enumerate(workers):
+            if worker is not None:
+                is_all_completed = False
+                break
+
+        if is_all_completed:
+            logger.info('All sub processes completed!')
+            break
+        time.sleep(0.001)
+
+
+def parallel_run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
+                             result_dir, parallel):
+    logger.info('Running case in env: %s' % env_name)
+    # install requirements and deps  # run_config['envs'][env]
+    if 'requirements' in env:
+        install_requirements(env['requirements'])
+    if 'dependencies' in env:
+        install_packages(env['dependencies'])
+
+    # case worker processes
+    worker_processes = [None] * parallel
+    for test_suite_file in isolated_cases:  # run case in subprocess
+        if test_suite_file in test_suite_env_map and test_suite_env_map[
+                test_suite_file] == env_name:
+            cmd = [
+                'python',
+                'tests/run.py',
+                '--pattern',
+                test_suite_file,
+                '--result_dir',
+                result_dir,
+            ]
+            worker_idx = wait_for_free_worker(worker_processes)
+            worker_process = async_run_command_with_popen(cmd, worker_idx)
+            os.set_blocking(worker_process.stdout.fileno(), False)
+            worker_processes[worker_idx] = worker_process
+        else:
+            pass  # case not in run list.
+
+    # run the remaining cases in batches.
+    remain_suite_files = []
+    for k, v in test_suite_env_map.items():
+        if k not in isolated_cases and v == env_name:
+            remain_suite_files.append(k)
+    if len(remain_suite_files) == 0:
+        return
+    # roughly split the cases across the parallel workers
+    part_count = math.ceil(len(remain_suite_files) / parallel)
+    suites_chunks = [
+        remain_suite_files[x:x + part_count]
+        for x in range(0, len(remain_suite_files), part_count)
+    ]
+    for suites_chunk in suites_chunks:
+        worker_idx = wait_for_free_worker(worker_processes)
+        cmd = [
+            'python', 'tests/run.py', '--result_dir', result_dir, '--suites'
+        ]
+        for suite in suites_chunk:
+            cmd.append(suite)
+        worker_process = async_run_command_with_popen(cmd, worker_idx)
+        os.set_blocking(worker_process.stdout.fileno(), False)
+        worker_processes[worker_idx] = worker_process
+
+    wait_for_workers(worker_processes)
+
+
 def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
                     result_dir):
     # install requirements and deps  # run_config['envs'][env]
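The scheduler added above keeps a fixed-size list of worker slots holding `Popen` handles, pins each child to one GPU through `CUDA_VISIBLE_DEVICES`, and drains worker logs without blocking: after `os.set_blocking(fd, False)`, `readline()` returns an empty string as soon as no output is buffered, which is what lets `iter(worker.stdout.readline, '')` terminate and a single scheduling thread multiplex all workers. For the non-isolated suites, `math.ceil(len(files) / parallel)` splits the remaining files into at most `parallel` chunks (five suites at `parallel=2` become chunks of three and two), each dispatched as `tests/run.py --suites ...` in a free slot. A stripped-down sketch of the polling loop (the shell command is illustrative):

    import os
    import subprocess
    import sys
    import time

    proc = subprocess.Popen(['sh', '-c', 'sleep 1; echo done'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    # non-blocking stdout: readline() yields '' instead of waiting for output
    os.set_blocking(proc.stdout.fileno(), False)

    while proc.poll() is None:
        line = proc.stdout.readline()
        if line:
            sys.stdout.write(line)
        time.sleep(0.01)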
@@ -264,8 +383,9 @@ def run_in_subprocess(args):
     with tempfile.TemporaryDirectory() as temp_result_dir:
         for env in set(test_suite_env_map.values()):
-            run_case_in_env(env, run_config['envs'][env], test_suite_env_map,
-                            isolated_cases, temp_result_dir)
+            parallel_run_case_in_env(env, run_config['envs'][env],
+                                     test_suite_env_map, isolated_cases,
+                                     temp_result_dir, args.parallel)

     result_dfs = []
     result_path = Path(temp_result_dir)
@@ -312,6 +432,10 @@ class TimeCostTextTestResult(TextTestResult):
         self.stream.writeln(
             'Test case: %s stop at: %s, cost time: %s(seconds)' %
             (test.test_full_name, test.stop_time, test.time_cost))
+        if torch.cuda.is_available(
+        ) and test.time_cost > 5.0:  # print nvidia-smi
+            cmd = ['nvidia-smi']
+            run_command_with_popen(cmd)
         super(TimeCostTextTestResult, self).stopTest(test)

     def addSuccess(self, test):
@@ -383,6 +507,8 @@ def main(args):
         os.path.abspath(args.test_dir), args.pattern, args.list_tests)
     if not args.list_tests:
         result = runner.run(test_suite)
+        logger.info('Running case completed, pid: %s, suites: %s' %
+                    (os.getpid(), args.suites))
         result = collect_test_results(result)
         df = test_cases_result_to_df(result)
         if args.result_dir is not None:
@@ -417,6 +543,12 @@ if __name__ == '__main__':
         '--result_dir',
         default=None,
         help='Save result to directory, internal use only')
+    parser.add_argument(
+        '--parallel',
+        default=1,
+        type=int,
+        help='Set case parallelism; defaults to a single process, typically set to the GPU count.'
+    )
     parser.add_argument(
         '--suites',
         nargs='*',
@@ -1,5 +1,5 @@
 # isolate cases per env; we can install different dependencies in each env.
-isolated:  # test cases that may require an excessive amount of GPU memory, which will be executed in a dedicated process.
+isolated:  # test cases that may require an excessive amount of GPU memory or take long to run, which will be executed in a dedicated process.
   - test_text_to_speech.py
   - test_multi_modal_embedding.py
   - test_ofa_tasks.py
@@ -13,6 +13,33 @@ isolated: # test cases that may require an excessive amount of GPU memory
   - test_movie_scene_segmentation.py
   - test_image_inpainting.py
   - test_mglm_text_summarization.py
+  - test_team_transfer_trainer.py
+  - test_image_denoise_trainer.py
+  - test_dialog_intent_trainer.py
+  - test_finetune_mplug.py
+  - test_image_instance_segmentation_trainer.py
+  - test_image_portrait_enhancement_trainer.py
+  - test_translation_trainer.py
+  - test_unifold.py
+  - test_automatic_post_editing.py
+  - test_mplug_tasks.py
+  - test_movie_scene_segmentation.py
+  - test_body_3d_keypoints.py
+  - test_finetune_text_generation.py
+  - test_clip_trainer.py
+  - test_ofa_trainer.py
+  - test_fill_mask.py
+  - test_hand_2d_keypoints.py
+  - test_referring_video_object_segmentation.py
+  - test_easycv_trainer_hand_2d_keypoints.py
+  - test_card_detection_scrfd_trainer.py
+  - test_referring_video_object_segmentation_trainer.py
+  - test_person_image_cartoon.py
+  - test_image_style_transfer.py
+  - test_ocr_detection.py
+  - test_automatic_speech_recognition.py
+  - test_image_matting.py
+  - test_skin_retouching.py

 envs:
   default:  # default env; cases not assigned to another env run here (pytorch).
@@ -94,7 +94,7 @@ class TestDialogIntentTrainer(unittest.TestCase):
             cfg.Model.update(config['Model'])
         if self.debugging:
             cfg.Trainer.save_checkpoint = False
-            cfg.Trainer.num_epochs = 5
+            cfg.Trainer.num_epochs = 1
             cfg.Trainer.batch_size_label = 64
         return cfg