From a318f27247ad8436e2716109be8877565e558d06 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Fri, 2 Dec 2022 10:06:24 +0800 Subject: [PATCH] [to #42322933] speed up the ast indexing during editing Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10907357 --- modelscope/utils/ast_utils.py | 137 +++++++++++++++++++++++++++------- tests/utils/test_ast.py | 111 ++++++++++++++++++++++++--- 2 files changed, 212 insertions(+), 36 deletions(-) diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index f59100cb..65218a1c 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -31,6 +31,7 @@ p = Path(__file__) # get the path of package 'modelscope' MODELSCOPE_PATH = p.resolve().parents[1] +INDEXER_FILE_DIR = get_default_cache_dir() REGISTER_MODULE = 'register_module' IGNORED_PACKAGES = ['modelscope', '.'] SCAN_SUB_FOLDERS = [ @@ -42,9 +43,11 @@ EXPRESS_KEY = 'express' FROM_IMPORT_KEY = 'from_imports' IMPORT_KEY = 'imports' FILE_NAME_KEY = 'filepath' +MODELSCOPE_PATH_KEY = 'modelscope_path' VERSION_KEY = 'version' MD5_KEY = 'md5' INDEX_KEY = 'index' +FILES_MTIME_KEY = 'files_mtime' REQUIREMENT_KEY = 'requirements' MODULE_KEY = 'module' CLASS_NAME = 'class_name' @@ -502,9 +505,11 @@ class FilesAstScaning(object): except Exception as e: detail = traceback.extract_tb(e.__traceback__) raise Exception( - f'During ast indexing, error is in the file {detail[-1].filename}' - f' line: {detail[-1].lineno}: "{detail[-1].line}" with error msg: ' - f'"{type(e).__name__}: {e}"') + f'During ast indexing the file {file}, a related error excepted ' + f'in the file {detail[-1].filename} at line: ' + f'{detail[-1].lineno}: "{detail[-1].line}" with error msg: ' + f'"{type(e).__name__}: {e}", please double check the origin file {file} ' + f'to see whether the file is correctly edited.') import_list = self.parse_import(output) return output[DECORATOR_KEY], import_list @@ -534,11 +539,13 @@ class FilesAstScaning(object): return inverted_index def get_files_scan_results(self, + target_file_list=None, target_dir=MODELSCOPE_PATH, target_folders=SCAN_SUB_FOLDERS): """the entry method of the ast scan method Args: + target_file_list can override the dir and folders combine target_dir (str, optional): the absolute path of the target directory to be scaned. Defaults to None. target_folder (list, optional): the list of sub-folders to be scaned in the target folder. @@ -547,9 +554,11 @@ class FilesAstScaning(object): Returns: dict: indexer of registry """ - - self.traversal_files(target_dir, target_folders) start = time.time() + if target_file_list is not None: + self.file_dirs = target_file_list + else: + self.traversal_files(target_dir, target_folders) logger.info( f'AST-Scaning the path "{target_dir}" with the following sub folders {target_folders}' ) @@ -574,31 +583,41 @@ class FilesAstScaning(object): REQUIREMENT_KEY: module_import } logger.info( - f'Scaning done! A number of {len(inverted_index_with_results)}' - f' files indexed! Time consumed {time.time()-start}s') + f'Scaning done! A number of {len(inverted_index_with_results)} ' + f'components indexed or updated! Time consumed {time.time()-start}s' + ) return index def files_mtime_md5(self, target_path=MODELSCOPE_PATH, - target_subfolder=SCAN_SUB_FOLDERS): + target_subfolder=SCAN_SUB_FOLDERS, + file_list=None): self.file_dirs = [] - self.traversal_files(target_path, target_subfolder) + if file_list and isinstance(file_list, list): + self.file_dirs = file_list + else: + self.traversal_files(target_path, target_subfolder) files_mtime = [] + files_mtime_dict = dict() for item in self.file_dirs: - files_mtime.append(os.path.getmtime(item)) + mtime = os.path.getmtime(item) + files_mtime.append(mtime) + files_mtime_dict[item] = mtime result_str = reduce(lambda x, y: str(x) + str(y), files_mtime, '') md5 = hashlib.md5(result_str.encode()) - return md5.hexdigest() + return md5.hexdigest(), files_mtime_dict file_scanner = FilesAstScaning() -def _save_index(index, file_path): +def _save_index(index, file_path, file_list=None): # convert tuple key to str key index[INDEX_KEY] = {str(k): v for k, v in index[INDEX_KEY].items()} index[VERSION_KEY] = __version__ - index[MD5_KEY] = file_scanner.files_mtime_md5() + index[MD5_KEY], index[FILES_MTIME_KEY] = file_scanner.files_mtime_md5( + file_list=file_list) + index[MODELSCOPE_PATH_KEY] = MODELSCOPE_PATH.as_posix() json_index = json.dumps(index) storage.write(json_index.encode(), file_path) index[INDEX_KEY] = { @@ -618,15 +637,56 @@ def _load_index(file_path): return wrapped_index -def load_index(force_rebuild=False): +def _update_index(index, files_mtime): + # inplace update index + origin_files_mtime = index[FILES_MTIME_KEY] + new_files = list(set(files_mtime) - set(origin_files_mtime)) + removed_files = list(set(origin_files_mtime) - set(files_mtime)) + updated_files = [] + for file in origin_files_mtime: + if file not in removed_files and \ + (origin_files_mtime[file] != files_mtime[file]): + updated_files.append(file) + updated_files.extend(new_files) + + # remove deleted index + if len(removed_files) > 0: + remove_index_keys = [] + remove_requirement_keys = [] + for key in index[INDEX_KEY]: + if index[INDEX_KEY][key][FILE_NAME_KEY] in removed_files: + remove_index_keys.append(key) + remove_requirement_keys.append( + index[INDEX_KEY][key][MODULE_KEY]) + for key in remove_index_keys: + del index[INDEX_KEY][key] + for key in remove_requirement_keys: + if key in index[REQUIREMENT_KEY]: + del index[REQUIREMENT_KEY][key] + + # add new index + updated_index = file_scanner.get_files_scan_results(updated_files) + index[INDEX_KEY].update(updated_index[INDEX_KEY]) + index[REQUIREMENT_KEY].update(updated_index[REQUIREMENT_KEY]) + + +def load_index( + file_list=None, + force_rebuild=False, + indexer_file_dir=INDEXER_FILE_DIR, + indexer_file=INDEXER_FILE, +): """get the index from scan results or cache Args: - force_rebuild: If set true, rebuild and load index + file_list: load indexer only from the file lists if provided, default as None + force_rebuild: If set true, rebuild and load index, default as False, + indexer_file_dir: The dir where the indexer file saved, default as INDEXER_FILE_DIR + indexer_file: The indexer file name, default as INDEXER_FILE Returns: dict: the index information for all registred modules, including key: - index, requirments, version and md5, the detail is shown below example: - { + index, requirments, files last modified time, modelscope home path, + version and md5, the detail is shown below example: { 'index': { ('MODELS', 'nlp', 'bert'):{ 'filepath' : 'path/to/the/registered/model', 'imports': @@ -638,32 +698,56 @@ def load_index(force_rebuild=False): 'modelscope.models.nlp.bert': ['os', 'torch', 'typeing'], 'modelscope.models.nlp.structbert': ['os', 'torch', 'typeing'], ... - }, 'version': '0.2.3', 'md5': '8616924970fe6bc119d1562832625612', + }, 'files_mtime' : { + '/User/Path/To/Your/Modelscope/modelscope/preprocessors/nlp/text_generation_preprocessor.py': + 16554565445, ... + },'version': '0.2.3', 'md5': '8616924970fe6bc119d1562832625612', + 'modelscope_path': '/User/Path/To/Your/Modelscope' } """ - cache_dir = os.getenv('MODELSCOPE_CACHE', get_default_cache_dir()) - file_path = os.path.join(cache_dir, INDEXER_FILE) + # env variable override + cache_dir = os.getenv('MODELSCOPE_CACHE', indexer_file_dir) + index_file = os.getenv('MODELSCOPE_INDEX_FILE', indexer_file) + file_path = os.path.join(cache_dir, index_file) logger.info(f'Loading ast index from {file_path}') index = None + local_changed = False if not force_rebuild and os.path.exists(file_path): wrapped_index = _load_index(file_path) - md5 = file_scanner.files_mtime_md5() - if (wrapped_index[VERSION_KEY] == __version__ - and wrapped_index[MD5_KEY] == md5): + md5, files_mtime = file_scanner.files_mtime_md5(file_list=file_list) + if (wrapped_index[VERSION_KEY] == __version__): index = wrapped_index + if (wrapped_index[MD5_KEY] != md5): + local_changed = True + full_index_flag = False if index is None: + full_index_flag = True + elif index and local_changed and FILES_MTIME_KEY not in index: + full_index_flag = True + elif index and local_changed and MODELSCOPE_PATH_KEY not in index: + full_index_flag = True + elif index and local_changed and index[ + MODELSCOPE_PATH_KEY] != MODELSCOPE_PATH.as_posix(): + full_index_flag = True + + if full_index_flag: if force_rebuild: logger.info('Force rebuilding ast index') else: logger.info( f'No valid ast index found from {file_path}, rebuilding ast index!' ) - index = file_scanner.get_files_scan_results() - _save_index(index, file_path) + index = file_scanner.get_files_scan_results(file_list) + _save_index(index, file_path, file_list) + elif local_changed and not full_index_flag: + _update_index(index, files_mtime) + _save_index(index, file_path, file_list) + logger.info( f'Loading done! Current index file version is {index[VERSION_KEY]}, ' - f'with md5 {index[MD5_KEY]}') + f'with md5 {index[MD5_KEY]} and a total number of ' + f'{len(index[INDEX_KEY])} components indexed') return index @@ -678,4 +762,3 @@ def check_import_module_avaliable(module_dicts: dict) -> list: if __name__ == '__main__': index = load_index() - print(index) diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index 2db61637..850945b9 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -7,7 +7,10 @@ import time import unittest from pathlib import Path -from modelscope.utils.ast_utils import AstScaning, FilesAstScaning, load_index +from modelscope.utils.ast_utils import (FILES_MTIME_KEY, INDEX_KEY, MD5_KEY, + MODELSCOPE_PATH_KEY, REQUIREMENT_KEY, + VERSION_KEY, AstScaning, + FilesAstScaning, load_index) p = Path(__file__) @@ -55,10 +58,14 @@ class AstScaningTest(unittest.TestCase): def test_files_scaning_method(self): fileScaner = FilesAstScaning() - output = fileScaner.get_files_scan_results() - self.assertTrue(output['index'] is not None) - self.assertTrue(output['requirements'] is not None) - index, requirements = output['index'], output['requirements'] + # case of pass in files directly + pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp', + 'text_generation_pipeline.py') + file_list = [pipeline_file] + output = fileScaner.get_files_scan_results(file_list) + self.assertTrue(output[INDEX_KEY] is not None) + self.assertTrue(output[REQUIREMENT_KEY] is not None) + index, requirements = output[INDEX_KEY], output[REQUIREMENT_KEY] self.assertIsInstance(index, dict) self.assertIsInstance(requirements, dict) self.assertIsInstance(list(index.keys())[0], tuple) @@ -77,24 +84,110 @@ class AstScaningTest(unittest.TestCase): with open(self.test_file, 'w', encoding='utf-8') as f: f.write('This is the new test!') - md5_1 = fileScaner.files_mtime_md5(self.tmp_dir, []) - md5_2 = fileScaner.files_mtime_md5(self.tmp_dir, []) + md5_1, mtime_1 = fileScaner.files_mtime_md5(self.tmp_dir, []) + md5_2, mtime_2 = fileScaner.files_mtime_md5(self.tmp_dir, []) self.assertEqual(md5_1, md5_2) + self.assertEqual(mtime_1, mtime_2) + self.assertIsInstance(mtime_1, dict) + self.assertEqual(list(mtime_1.keys()), [self.test_file]) + self.assertEqual(mtime_1[self.test_file], mtime_2[self.test_file]) + time.sleep(2) # case of revise with open(self.test_file, 'w', encoding='utf-8') as f: f.write('test again') - md5_3 = fileScaner.files_mtime_md5(self.tmp_dir, []) + md5_3, mtime_3 = fileScaner.files_mtime_md5(self.tmp_dir, []) self.assertNotEqual(md5_1, md5_3) + self.assertNotEqual(mtime_1[self.test_file], mtime_3[self.test_file]) # case of create self.test_file_new = os.path.join(self.tmp_dir, 'test_1.py') time.sleep(2) with open(self.test_file_new, 'w', encoding='utf-8') as f: f.write('test again') - md5_4 = fileScaner.files_mtime_md5(self.tmp_dir, []) + md5_4, mtime_4 = fileScaner.files_mtime_md5(self.tmp_dir, []) self.assertNotEqual(md5_1, md5_4) self.assertNotEqual(md5_3, md5_4) + self.assertEqual( + set(mtime_4.keys()) - set([self.test_file, self.test_file_new]), + set()) + + def test_load_index_method(self): + # test full indexing case + output = load_index() + self.assertTrue(output[INDEX_KEY] is not None) + self.assertTrue(output[REQUIREMENT_KEY] is not None) + index, requirements = output[INDEX_KEY], output[REQUIREMENT_KEY] + self.assertIsInstance(index, dict) + self.assertIsInstance(requirements, dict) + self.assertIsInstance(list(index.keys())[0], tuple) + index_0 = list(index.keys())[0] + self.assertIsInstance(index[index_0], dict) + self.assertTrue(index[index_0]['imports'] is not None) + self.assertIsInstance(index[index_0]['imports'], list) + self.assertTrue(index[index_0]['module'] is not None) + self.assertIsInstance(index[index_0]['module'], str) + index_0 = list(requirements.keys())[0] + self.assertIsInstance(requirements[index_0], list) + self.assertIsInstance(output[MD5_KEY], str) + self.assertIsInstance(output[MODELSCOPE_PATH_KEY], str) + self.assertIsInstance(output[VERSION_KEY], str) + self.assertIsInstance(output[FILES_MTIME_KEY], dict) + + def test_update_load_index_method(self): + file_number = 20 + file_list = [] + for i in range(file_number): + filename = os.path.join(self.tmp_dir, f'test_{i}.py') + with open(filename, 'w', encoding='utf-8') as f: + f.write('import os') + file_list.append(filename) + + index_file = 'ast_indexer_1' + + start = time.time() + index = load_index( + file_list=file_list, + indexer_file_dir=self.tmp_dir, + indexer_file=index_file) + duration_1 = time.time() - start + self.assertEqual(len(index[FILES_MTIME_KEY]), file_number) + + # no changing case, time should be less than original + start = time.time() + index = load_index( + file_list=file_list, + indexer_file_dir=self.tmp_dir, + indexer_file=index_file) + duration_2 = time.time() - start + self.assertGreater(duration_1, duration_2) + self.assertEqual(len(index[FILES_MTIME_KEY]), file_number) + + # adding new file, time should be less than original + test_file_new_2 = os.path.join(self.tmp_dir, 'test_new.py') + with open(test_file_new_2, 'w', encoding='utf-8') as f: + f.write('import os') + file_list.append(test_file_new_2) + + start = time.time() + index = load_index( + file_list=file_list, + indexer_file_dir=self.tmp_dir, + indexer_file=index_file) + duration_3 = time.time() - start + self.assertGreater(duration_1, duration_3) + self.assertEqual(len(index[FILES_MTIME_KEY]), file_number + 1) + + # deleting one file, time should be less than original + file_list.pop() + start = time.time() + index = load_index( + file_list=file_list, + indexer_file_dir=self.tmp_dir, + indexer_file=index_file) + duration_4 = time.time() - start + self.assertGreater(duration_1, duration_4) + self.assertEqual(len(index[FILES_MTIME_KEY]), file_number) if __name__ == '__main__':