@@ -1,6 +1,3 @@
-echo "Testing envs"
-printenv
-echo "ENV END"
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 pip install -r requirements/tests.txt
 git config --global --add safe.directory /Maas-lib
@@ -23,7 +20,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 awk -F: '/^[^#]/ { print $1 }' requirements/science.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-pip install -r requirements/tests.txt
 # test with install
 python setup.py install
 else

@@ -3,30 +3,32 @@ MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
 CODE_DIR=$PWD
 CODE_DIR_IN_CONTAINER=/Maas-lib
 echo "$USER"
-gpus='7 6 5 4 3 2 1 0'
-cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
+gpus='0,1 2,3 4,5 6,7'
+cpu_sets='45-58 31-44 16-30 0-15'
 cpu_sets_arr=($cpu_sets)
 is_get_file_lock=false
-# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
-CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
+CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml'
 echo "ci command: $CI_COMMAND"
+idx=0
 for gpu in $gpus
 do
 exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
-flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; continue; }
+flock -n "$lock_fd" || { echo "WARN: gpu $gpu is in use!" >&2; idx=$((idx+1)); continue; }
 echo "get gpu lock $gpu"
-CONTAINER_NAME="modelscope-ci-$gpu"
+CONTAINER_NAME="modelscope-ci-$idx"
 let is_get_file_lock=true
 # pull image if there are updates
 docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
 if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
+echo 'debugging'
 docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
---cpuset-cpus=${cpu_sets_arr[$gpu]} \
---gpus="device=$gpu" \
+--cpuset-cpus=${cpu_sets_arr[$idx]} \
+--gpus='"'"device=$gpu"'"' \
 -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
 -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
--v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+-v $MODELSCOPE_HOME_CACHE/$idx:/root \
 -v /home/admin/pre-commit:/home/admin/pre-commit \
 -e CI_TEST=True \
 -e TEST_LEVEL=$TEST_LEVEL \

@@ -41,16 +43,15 @@ do
 -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
 -e MODEL_TAG_URL=$MODEL_TAG_URL \
 --workdir=$CODE_DIR_IN_CONTAINER \
---net host \
 ${IMAGE_NAME}:${IMAGE_VERSION} \
 $CI_COMMAND
 else
 docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
---cpuset-cpus=${cpu_sets_arr[$gpu]} \
---gpus="device=$gpu" \
+--cpuset-cpus=${cpu_sets_arr[$idx]} \
+--gpus='"'"device=$gpu"'"' \
 -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \
 -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \
--v $MODELSCOPE_HOME_CACHE/$gpu:/root \
+-v $MODELSCOPE_HOME_CACHE/$idx:/root \
 -v /home/admin/pre-commit:/home/admin/pre-commit \
 -e CI_TEST=True \
 -e TEST_LEVEL=$TEST_LEVEL \

@@ -64,7 +65,6 @@ do
 -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
 -e MODEL_TAG_URL=$MODEL_TAG_URL \
 --workdir=$CODE_DIR_IN_CONTAINER \
---net host \
 ${IMAGE_NAME}:${IMAGE_VERSION} \
 $CI_COMMAND
 fi
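
Review note: the loop above claims one free GPU pair per run by taking an exclusive, non-blocking lock on a file under /tmp, and the unusual --gpus='"'"device=$gpu"'"' quoting wraps the value in literal double quotes so docker receives --gpus "device=0,1" with the comma-separated device list intact. A minimal Python sketch of the same lock pattern (the lock-file paths and GPU pairs mirror the script; everything else is illustrative):

import fcntl

def try_lock_gpus(gpu):
    """Return an open handle holding the lock if this GPU pair is free, else None."""
    f = open('/tmp/gpu%s' % gpu, 'w')  # same path scheme as the script
    try:
        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)  # like `flock -n`
        return f  # keep the handle open to hold the lock
    except BlockingIOError:  # another CI run holds it
        f.close()
        return None

for gpu in ['0,1', '2,3', '4,5', '6,7']:
    lock = try_lock_gpus(gpu)
    if lock is not None:
        print('get gpu lock %s' % gpu)
        break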

@@ -20,7 +20,6 @@ class MogFaceDetector(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device

@@ -21,7 +21,6 @@ class MtcnnFaceDetector(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device

@@ -18,7 +18,6 @@ class RetinaFaceDetection(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.cfg = Config.from_file(

@@ -24,7 +24,6 @@ class UlfdFaceDetector(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device

@@ -24,7 +24,6 @@ class FacialExpressionRecognition(TorchModel):
     def __init__(self, model_path, device='cuda'):
         super().__init__(model_path)
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device

@@ -31,7 +31,6 @@ cfg_re50 = {
 class RetinaFaceDetection(object):
     def __init__(self, model_path, device='cuda'):
-        torch.set_grad_enabled(False)
         cudnn.benchmark = True
         self.model_path = model_path
         self.device = device
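
Review note: the six identical removals above take torch.set_grad_enabled(False) out of the detector constructors. The likely rationale (not stated in the diff): the call switches autograd off for the entire process, so once any of these models is constructed, every later test sharing that worker process silently stops computing gradients; with suites now batched into shared workers, the global toggle becomes a cross-test hazard. A minimal sketch of the scoped alternative, with an nn.Linear standing in for a detector:

import torch
from torch import nn

model = nn.Linear(4, 2)  # stand-in for a face detector
inputs = torch.randn(1, 4)

with torch.no_grad():  # autograd disabled only inside this block
    output = model(inputs)

print(output.requires_grad)  # False: produced without grad tracking
print(model(inputs).requires_grad)  # True: tracking is restored outside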

@@ -3,11 +3,13 @@
 import argparse
 import datetime
+import math
 import multiprocessing
 import os
 import subprocess
 import sys
 import tempfile
+import time
 import unittest
 from fnmatch import fnmatch
 from multiprocessing.managers import BaseManager

@@ -158,6 +160,21 @@ def run_command_with_popen(cmd):
         sys.stdout.write(line)


+def async_run_command_with_popen(cmd, device_id):
+    logger.info('Worker id: %s args: %s' % (device_id, cmd))
+    env = os.environ.copy()
+    env['CUDA_VISIBLE_DEVICES'] = '%s' % device_id
+    sub_process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        bufsize=1,
+        universal_newlines=True,
+        env=env,
+        encoding='utf8')
+    return sub_process
+
+
 def save_test_result(df, args):
     if args.result_dir is not None:
         file_name = str(int(datetime.datetime.now().timestamp() * 1000))
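
A hedged usage sketch for the new helper: the worker index doubles as the CUDA device id through CUDA_VISIBLE_DEVICES, so each child process sees exactly one device. This assumes the helper above is importable from tests/run.py; the suite names are taken from the isolated list later in this PR:

suites = ['test_fill_mask.py', 'test_ofa_tasks.py']
workers = []
for device_id, suite in enumerate(suites):
    cmd = ['python', 'tests/run.py', '--pattern', suite]
    workers.append(async_run_command_with_popen(cmd, device_id))
for worker in workers:
    worker.wait()  # block until both suites finish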

@@ -199,6 +216,108 @@ def install_requirements(requirements):
         run_command(cmd)


+def wait_for_free_worker(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                logger.info('return free worker: %s' % (idx))
+                return idx
+            if worker.poll() is None:  # running, drain its output
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:  # worker process completed.
+                logger.info('Process end: %s' % (idx))
+                workers[idx] = None
+                return idx
+        time.sleep(0.001)
+
+
+def wait_for_workers(workers):
+    while True:
+        for idx, worker in enumerate(workers):
+            if worker is None:
+                continue
+            # check whether the worker has completed.
+            if worker.poll() is None:
+                for line in iter(worker.stdout.readline, ''):
+                    if line != '':
+                        sys.stdout.write(line)
+                    else:
+                        break
+            else:
+                logger.info('Process idx: %s end!' % (idx))
+                workers[idx] = None
+        is_all_completed = True
+        for idx, worker in enumerate(workers):
+            if worker is not None:
+                is_all_completed = False
+                break
+        if is_all_completed:
+            logger.info('All subprocesses are completed!')
+            break
+        time.sleep(0.001)
+
+
+def parallel_run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
+                             result_dir, parallel):
+    logger.info('Running case in env: %s' % env_name)
+    # install requirements and deps # run_config['envs'][env]
+    if 'requirements' in env:
+        install_requirements(env['requirements'])
+    if 'dependencies' in env:
+        install_packages(env['dependencies'])
+    # case worker processes
+    worker_processes = [None] * parallel
+    for test_suite_file in isolated_cases:  # run each isolated case in its own subprocess
+        if test_suite_file in test_suite_env_map and test_suite_env_map[
+                test_suite_file] == env_name:
+            cmd = [
+                'python',
+                'tests/run.py',
+                '--pattern',
+                test_suite_file,
+                '--result_dir',
+                result_dir,
+            ]
+            worker_idx = wait_for_free_worker(worker_processes)
+            worker_process = async_run_command_with_popen(cmd, worker_idx)
+            os.set_blocking(worker_process.stdout.fileno(), False)
+            worker_processes[worker_idx] = worker_process
+        else:
+            pass  # case not in the run list.
+
+    # run the remaining cases, split across workers.
+    remain_suite_files = []
+    for k, v in test_suite_env_map.items():
+        if k not in isolated_cases and v == env_name:
+            remain_suite_files.append(k)
+    if len(remain_suite_files) == 0:
+        return
+    # roughly split the cases into `parallel` chunks.
+    part_count = math.ceil(len(remain_suite_files) / parallel)
+    suites_chunks = [
+        remain_suite_files[x:x + part_count]
+        for x in range(0, len(remain_suite_files), part_count)
+    ]
+    for suites_chunk in suites_chunks:
+        worker_idx = wait_for_free_worker(worker_processes)
+        cmd = [
+            'python', 'tests/run.py', '--result_dir', result_dir, '--suites'
+        ]
+        for suite in suites_chunk:
+            cmd.append(suite)
+        worker_process = async_run_command_with_popen(cmd, worker_idx)
+        os.set_blocking(worker_process.stdout.fileno(), False)
+        worker_processes[worker_idx] = worker_process
+
+    wait_for_workers(worker_processes)
+
+
 def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases,
                     result_dir):
     # install requirements and deps # run_config['envs'][env]
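
A worked example of the splitting arithmetic in parallel_run_case_in_env: 7 remaining suites with --parallel 2 give part_count = ceil(7/2) = 4, i.e. chunks of 4 and 3 suites, each dispatched as one tests/run.py --suites invocation. Note also the os.set_blocking(..., False) calls: they put each worker's stdout into non-blocking mode so the readline polling in wait_for_free_worker can return immediately on a quiet child instead of stalling the scheduler.

import math

remain_suite_files = ['s%d.py' % i for i in range(7)]  # 7 illustrative suites
parallel = 2
part_count = math.ceil(len(remain_suite_files) / parallel)  # -> 4
suites_chunks = [
    remain_suite_files[x:x + part_count]
    for x in range(0, len(remain_suite_files), part_count)
]
print([len(chunk) for chunk in suites_chunks])  # [4, 3]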

@@ -264,8 +383,9 @@ def run_in_subprocess(args):
     with tempfile.TemporaryDirectory() as temp_result_dir:
         for env in set(test_suite_env_map.values()):
-            run_case_in_env(env, run_config['envs'][env], test_suite_env_map,
-                            isolated_cases, temp_result_dir)
+            parallel_run_case_in_env(env, run_config['envs'][env],
+                                     test_suite_env_map, isolated_cases,
+                                     temp_result_dir, args.parallel)

         result_dfs = []
         result_path = Path(temp_result_dir)

@@ -312,6 +432,10 @@ class TimeCostTextTestResult(TextTestResult):
         self.stream.writeln(
             'Test case: %s stop at: %s, cost time: %s(seconds)' %
             (test.test_full_name, test.stop_time, test.time_cost))
+        if torch.cuda.is_available(
+        ) and test.time_cost > 5.0:  # print nvidia-smi
+            cmd = ['nvidia-smi']
+            run_command_with_popen(cmd)
         super(TimeCostTextTestResult, self).stopTest(test)

     def addSuccess(self, test):

@@ -383,6 +507,8 @@ def main(args):
         os.path.abspath(args.test_dir), args.pattern, args.list_tests)
     if not args.list_tests:
         result = runner.run(test_suite)
+        logger.info('Running case completed, pid: %s, suites: %s' %
+                    (os.getpid(), args.suites))
         result = collect_test_results(result)
         df = test_cases_result_to_df(result)
         if args.result_dir is not None:

@@ -417,6 +543,12 @@ if __name__ == '__main__':
         '--result_dir',
         default=None,
         help='Save result to directory, internal use only')
+    parser.add_argument(
+        '--parallel',
+        default=1,
+        type=int,
+        help='Number of parallel case workers; defaults to a single process, '
+        'typically set to the number of GPUs.')
     parser.add_argument(
         '--suites',
         nargs='*',
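
With the flag wired in, the runner invocation matches the CI_COMMAND set in the docker script above:

python tests/run.py --parallel 2 --run_config tests/run_config.yaml

--parallel 2 lines up with the two-device pairs each container receives via --gpus, presumably so each case worker lands on one device of the pair (CUDA_VISIBLE_DEVICES is set per worker index in async_run_command_with_popen).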

@@ -1,5 +1,5 @@
 # isolate cases per env; we can install different dependencies in each env.
-isolated: # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process.
+isolated: # test cases that may require an excessive amount of GPU memory or run for a long time; these are executed in a dedicated process.
 - test_text_to_speech.py
 - test_multi_modal_embedding.py
 - test_ofa_tasks.py

@@ -12,6 +12,33 @@ isolated: # test cases that may require excessive anmount of GPU memory, which
 - test_segmentation_pipeline.py
 - test_image_inpainting.py
 - test_mglm_text_summarization.py
+- test_team_transfer_trainer.py
+- test_image_denoise_trainer.py
+- test_dialog_intent_trainer.py
+- test_finetune_mplug.py
+- test_image_instance_segmentation_trainer.py
+- test_image_portrait_enhancement_trainer.py
+- test_translation_trainer.py
+- test_unifold.py
+- test_automatic_post_editing.py
+- test_mplug_tasks.py
+- test_movie_scene_segmentation.py
+- test_body_3d_keypoints.py
+- test_finetune_text_generation.py
+- test_clip_trainer.py
+- test_ofa_trainer.py
+- test_fill_mask.py
+- test_hand_2d_keypoints.py
+- test_referring_video_object_segmentation.py
+- test_easycv_trainer_hand_2d_keypoints.py
+- test_card_detection_scrfd_trainer.py
+- test_referring_video_object_segmentation_trainer.py
+- test_person_image_cartoon.py
+- test_image_style_transfer.py
+- test_ocr_detection.py
+- test_automatic_speech_recognition.py
+- test_image_matting.py
+- test_skin_retouching.py

 envs:
   default: # default env; cases not assigned to another env run here (pytorch).
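
A hedged sketch of how this config is consumed on the Python side (the key names follow run.py above; the yaml.safe_load loader is an assumption, not shown in the diff):

import yaml

with open('tests/run_config.yaml') as f:  # path from --run_config
    run_config = yaml.safe_load(f)  # assumed loader

isolated_cases = run_config['isolated']  # each gets a dedicated worker process
envs = run_config['envs']  # per-env requirements/dependencies
print('%d isolated suites, %d envs' % (len(isolated_cases), len(envs)))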

@@ -94,7 +94,7 @@ class TestDialogIntentTrainer(unittest.TestCase):
         cfg.Model.update(config['Model'])
         if self.debugging:
             cfg.Trainer.save_checkpoint = False
-            cfg.Trainer.num_epochs = 5
+            cfg.Trainer.num_epochs = 1
             cfg.Trainer.batch_size_label = 64
         return cfg