Browse Source

[to #42322933] speed up the ast indexing during editing

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10907357
master^2
zhangzhicheng.zzc wenmeng.zwm 3 years ago
parent
commit
a318f27247
2 changed files with 212 additions and 36 deletions
  1. +110
    -27
      modelscope/utils/ast_utils.py
  2. +102
    -9
      tests/utils/test_ast.py

+ 110
- 27
modelscope/utils/ast_utils.py View File

@@ -31,6 +31,7 @@ p = Path(__file__)

# get the path of package 'modelscope'
MODELSCOPE_PATH = p.resolve().parents[1]
INDEXER_FILE_DIR = get_default_cache_dir()
REGISTER_MODULE = 'register_module'
IGNORED_PACKAGES = ['modelscope', '.']
SCAN_SUB_FOLDERS = [
@@ -42,9 +43,11 @@ EXPRESS_KEY = 'express'
FROM_IMPORT_KEY = 'from_imports'
IMPORT_KEY = 'imports'
FILE_NAME_KEY = 'filepath'
MODELSCOPE_PATH_KEY = 'modelscope_path'
VERSION_KEY = 'version'
MD5_KEY = 'md5'
INDEX_KEY = 'index'
FILES_MTIME_KEY = 'files_mtime'
REQUIREMENT_KEY = 'requirements'
MODULE_KEY = 'module'
CLASS_NAME = 'class_name'
@@ -502,9 +505,11 @@ class FilesAstScaning(object):
except Exception as e:
detail = traceback.extract_tb(e.__traceback__)
raise Exception(
f'During ast indexing, error is in the file {detail[-1].filename}'
f' line: {detail[-1].lineno}: "{detail[-1].line}" with error msg: '
f'"{type(e).__name__}: {e}"')
f'During ast indexing the file {file}, a related error excepted '
f'in the file {detail[-1].filename} at line: '
f'{detail[-1].lineno}: "{detail[-1].line}" with error msg: '
f'"{type(e).__name__}: {e}", please double check the origin file {file} '
f'to see whether the file is correctly edited.')

import_list = self.parse_import(output)
return output[DECORATOR_KEY], import_list
@@ -534,11 +539,13 @@ class FilesAstScaning(object):
return inverted_index

def get_files_scan_results(self,
target_file_list=None,
target_dir=MODELSCOPE_PATH,
target_folders=SCAN_SUB_FOLDERS):
"""the entry method of the ast scan method

Args:
target_file_list (list, optional): if provided, overrides the combination of target_dir and target_folders.
target_dir (str, optional): the absolute path of the target directory to be scanned. Defaults to MODELSCOPE_PATH.
target_folders (list, optional): the list of
sub-folders to be scanned in the target directory.
@@ -547,9 +554,11 @@ class FilesAstScaning(object):
Returns:
dict: indexer of registry
"""

self.traversal_files(target_dir, target_folders)
start = time.time()
if target_file_list is not None:
self.file_dirs = target_file_list
else:
self.traversal_files(target_dir, target_folders)
logger.info(
f'AST-Scaning the path "{target_dir}" with the following sub folders {target_folders}'
)
@@ -574,31 +583,41 @@ class FilesAstScaning(object):
REQUIREMENT_KEY: module_import
}
logger.info(
f'Scaning done! A number of {len(inverted_index_with_results)}'
f' files indexed! Time consumed {time.time()-start}s')
f'Scaning done! A number of {len(inverted_index_with_results)} '
f'components indexed or updated! Time consumed {time.time()-start}s'
)
return index

def files_mtime_md5(self,
target_path=MODELSCOPE_PATH,
target_subfolder=SCAN_SUB_FOLDERS):
target_subfolder=SCAN_SUB_FOLDERS,
file_list=None):
self.file_dirs = []
self.traversal_files(target_path, target_subfolder)
if file_list and isinstance(file_list, list):
self.file_dirs = file_list
else:
self.traversal_files(target_path, target_subfolder)
files_mtime = []
files_mtime_dict = dict()
for item in self.file_dirs:
files_mtime.append(os.path.getmtime(item))
mtime = os.path.getmtime(item)
files_mtime.append(mtime)
files_mtime_dict[item] = mtime
result_str = reduce(lambda x, y: str(x) + str(y), files_mtime, '')
md5 = hashlib.md5(result_str.encode())
return md5.hexdigest()
return md5.hexdigest(), files_mtime_dict


file_scanner = FilesAstScaning()


def _save_index(index, file_path):
def _save_index(index, file_path, file_list=None):
# convert tuple key to str key
index[INDEX_KEY] = {str(k): v for k, v in index[INDEX_KEY].items()}
index[VERSION_KEY] = __version__
index[MD5_KEY] = file_scanner.files_mtime_md5()
index[MD5_KEY], index[FILES_MTIME_KEY] = file_scanner.files_mtime_md5(
file_list=file_list)
index[MODELSCOPE_PATH_KEY] = MODELSCOPE_PATH.as_posix()
json_index = json.dumps(index)
storage.write(json_index.encode(), file_path)
index[INDEX_KEY] = {
@@ -618,15 +637,56 @@ def _load_index(file_path):
return wrapped_index


def load_index(force_rebuild=False):
def _update_index(index, files_mtime):
    """Incrementally update ``index`` in place from the current file mtimes.

    Compares ``files_mtime`` (current file path -> last-modified-time mapping)
    against the mtimes recorded in ``index[FILES_MTIME_KEY]`` to classify
    files as added, removed, or modified since the index was built.  Entries
    belonging to removed files are dropped, and only the added/modified files
    are re-scanned and merged back, which is much faster than a full rebuild.

    Args:
        index (dict): a previously loaded index containing at least
            INDEX_KEY, REQUIREMENT_KEY and FILES_MTIME_KEY; mutated in place.
        files_mtime (dict): mapping of file path to last modified time for
            the files that should currently be indexed.

    Returns:
        None: the update happens in place.
    """
    origin_files_mtime = index[FILES_MTIME_KEY]
    # sets give O(1) membership tests below instead of O(n) list scans
    current_files = set(files_mtime)
    origin_files = set(origin_files_mtime)
    new_files = current_files - origin_files
    removed_files = origin_files - current_files

    # files present in both snapshots whose mtime changed must be re-scanned;
    # the removed-files guard avoids a KeyError on files_mtime[file]
    updated_files = [
        file for file in origin_files_mtime
        if file not in removed_files
        and origin_files_mtime[file] != files_mtime[file]
    ]
    updated_files.extend(new_files)

    # drop index and requirement entries that belong to deleted files
    if removed_files:
        remove_index_keys = []
        remove_requirement_keys = []
        for key in index[INDEX_KEY]:
            if index[INDEX_KEY][key][FILE_NAME_KEY] in removed_files:
                remove_index_keys.append(key)
                remove_requirement_keys.append(
                    index[INDEX_KEY][key][MODULE_KEY])
        for key in remove_index_keys:
            del index[INDEX_KEY][key]
        for key in remove_requirement_keys:
            if key in index[REQUIREMENT_KEY]:
                del index[REQUIREMENT_KEY][key]

    # re-scan only the added/modified files and merge the fresh results
    updated_index = file_scanner.get_files_scan_results(updated_files)
    index[INDEX_KEY].update(updated_index[INDEX_KEY])
    index[REQUIREMENT_KEY].update(updated_index[REQUIREMENT_KEY])


def load_index(
file_list=None,
force_rebuild=False,
indexer_file_dir=INDEXER_FILE_DIR,
indexer_file=INDEXER_FILE,
):
"""get the index from scan results or cache

Args:
force_rebuild: If set true, rebuild and load index
file_list: if provided, build the index only from these files, defaults to None
force_rebuild: if set to True, rebuild and load the index, defaults to False
indexer_file_dir: the dir where the indexer file is saved, defaults to INDEXER_FILE_DIR
indexer_file: the indexer file name, defaults to INDEXER_FILE
Returns:
dict: the index information for all registered modules, including key:
index, requirements, version and md5, the detail is shown below example:
{
index, requirements, files last modified time, modelscope home path,
version and md5, the detail is shown below example: {
'index': {
('MODELS', 'nlp', 'bert'):{
'filepath' : 'path/to/the/registered/model', 'imports':
@@ -638,32 +698,56 @@ def load_index(force_rebuild=False):
'modelscope.models.nlp.bert': ['os', 'torch', 'typeing'],
'modelscope.models.nlp.structbert': ['os', 'torch', 'typeing'],
...
}, 'version': '0.2.3', 'md5': '8616924970fe6bc119d1562832625612',
}, 'files_mtime' : {
'/User/Path/To/Your/Modelscope/modelscope/preprocessors/nlp/text_generation_preprocessor.py':
16554565445, ...
},'version': '0.2.3', 'md5': '8616924970fe6bc119d1562832625612',
'modelscope_path': '/User/Path/To/Your/Modelscope'
}
"""
cache_dir = os.getenv('MODELSCOPE_CACHE', get_default_cache_dir())
file_path = os.path.join(cache_dir, INDEXER_FILE)
# env variable override
cache_dir = os.getenv('MODELSCOPE_CACHE', indexer_file_dir)
index_file = os.getenv('MODELSCOPE_INDEX_FILE', indexer_file)
file_path = os.path.join(cache_dir, index_file)
logger.info(f'Loading ast index from {file_path}')
index = None
local_changed = False
if not force_rebuild and os.path.exists(file_path):
wrapped_index = _load_index(file_path)
md5 = file_scanner.files_mtime_md5()
if (wrapped_index[VERSION_KEY] == __version__
and wrapped_index[MD5_KEY] == md5):
md5, files_mtime = file_scanner.files_mtime_md5(file_list=file_list)
if (wrapped_index[VERSION_KEY] == __version__):
index = wrapped_index
if (wrapped_index[MD5_KEY] != md5):
local_changed = True
full_index_flag = False

if index is None:
full_index_flag = True
elif index and local_changed and FILES_MTIME_KEY not in index:
full_index_flag = True
elif index and local_changed and MODELSCOPE_PATH_KEY not in index:
full_index_flag = True
elif index and local_changed and index[
MODELSCOPE_PATH_KEY] != MODELSCOPE_PATH.as_posix():
full_index_flag = True

if full_index_flag:
if force_rebuild:
logger.info('Force rebuilding ast index')
else:
logger.info(
f'No valid ast index found from {file_path}, rebuilding ast index!'
)
index = file_scanner.get_files_scan_results()
_save_index(index, file_path)
index = file_scanner.get_files_scan_results(file_list)
_save_index(index, file_path, file_list)
elif local_changed and not full_index_flag:
_update_index(index, files_mtime)
_save_index(index, file_path, file_list)

logger.info(
f'Loading done! Current index file version is {index[VERSION_KEY]}, '
f'with md5 {index[MD5_KEY]}')
f'with md5 {index[MD5_KEY]} and a total number of '
f'{len(index[INDEX_KEY])} components indexed')
return index


@@ -678,4 +762,3 @@ def check_import_module_avaliable(module_dicts: dict) -> list:

if __name__ == '__main__':
index = load_index()
print(index)

+ 102
- 9
tests/utils/test_ast.py View File

@@ -7,7 +7,10 @@ import time
import unittest
from pathlib import Path

from modelscope.utils.ast_utils import AstScaning, FilesAstScaning, load_index
from modelscope.utils.ast_utils import (FILES_MTIME_KEY, INDEX_KEY, MD5_KEY,
MODELSCOPE_PATH_KEY, REQUIREMENT_KEY,
VERSION_KEY, AstScaning,
FilesAstScaning, load_index)

p = Path(__file__)

@@ -55,10 +58,14 @@ class AstScaningTest(unittest.TestCase):

def test_files_scaning_method(self):
fileScaner = FilesAstScaning()
output = fileScaner.get_files_scan_results()
self.assertTrue(output['index'] is not None)
self.assertTrue(output['requirements'] is not None)
index, requirements = output['index'], output['requirements']
# case of pass in files directly
pipeline_file = os.path.join(MODELSCOPE_PATH, 'pipelines', 'nlp',
'text_generation_pipeline.py')
file_list = [pipeline_file]
output = fileScaner.get_files_scan_results(file_list)
self.assertTrue(output[INDEX_KEY] is not None)
self.assertTrue(output[REQUIREMENT_KEY] is not None)
index, requirements = output[INDEX_KEY], output[REQUIREMENT_KEY]
self.assertIsInstance(index, dict)
self.assertIsInstance(requirements, dict)
self.assertIsInstance(list(index.keys())[0], tuple)
@@ -77,24 +84,110 @@ class AstScaningTest(unittest.TestCase):
with open(self.test_file, 'w', encoding='utf-8') as f:
f.write('This is the new test!')

md5_1 = fileScaner.files_mtime_md5(self.tmp_dir, [])
md5_2 = fileScaner.files_mtime_md5(self.tmp_dir, [])
md5_1, mtime_1 = fileScaner.files_mtime_md5(self.tmp_dir, [])
md5_2, mtime_2 = fileScaner.files_mtime_md5(self.tmp_dir, [])
self.assertEqual(md5_1, md5_2)
self.assertEqual(mtime_1, mtime_2)
self.assertIsInstance(mtime_1, dict)
self.assertEqual(list(mtime_1.keys()), [self.test_file])
self.assertEqual(mtime_1[self.test_file], mtime_2[self.test_file])

time.sleep(2)
# case of revise
with open(self.test_file, 'w', encoding='utf-8') as f:
f.write('test again')
md5_3 = fileScaner.files_mtime_md5(self.tmp_dir, [])
md5_3, mtime_3 = fileScaner.files_mtime_md5(self.tmp_dir, [])
self.assertNotEqual(md5_1, md5_3)
self.assertNotEqual(mtime_1[self.test_file], mtime_3[self.test_file])

# case of create
self.test_file_new = os.path.join(self.tmp_dir, 'test_1.py')
time.sleep(2)
with open(self.test_file_new, 'w', encoding='utf-8') as f:
f.write('test again')
md5_4 = fileScaner.files_mtime_md5(self.tmp_dir, [])
md5_4, mtime_4 = fileScaner.files_mtime_md5(self.tmp_dir, [])
self.assertNotEqual(md5_1, md5_4)
self.assertNotEqual(md5_3, md5_4)
self.assertEqual(
set(mtime_4.keys()) - set([self.test_file, self.test_file_new]),
set())

def test_load_index_method(self):
    # exercise the full-indexing path and validate the result structure
    result = load_index()
    index = result[INDEX_KEY]
    requirements = result[REQUIREMENT_KEY]
    self.assertIsNotNone(index)
    self.assertIsNotNone(requirements)
    self.assertIsInstance(index, dict)
    self.assertIsInstance(requirements, dict)

    # every index key is a registry tuple mapping to a per-module record
    first_key = list(index)[0]
    self.assertIsInstance(first_key, tuple)
    entry = index[first_key]
    self.assertIsInstance(entry, dict)
    self.assertIsNotNone(entry['imports'])
    self.assertIsInstance(entry['imports'], list)
    self.assertIsNotNone(entry['module'])
    self.assertIsInstance(entry['module'], str)

    # requirements map a module name to its import list
    first_requirement = list(requirements)[0]
    self.assertIsInstance(requirements[first_requirement], list)

    # top-level metadata fields carry the expected types
    for key, expected_type in ((MD5_KEY, str), (MODELSCOPE_PATH_KEY, str),
                               (VERSION_KEY, str), (FILES_MTIME_KEY, dict)):
        self.assertIsInstance(result[key], expected_type)

def test_update_load_index_method(self):
    # create a batch of trivial python files to index
    file_number = 20
    file_list = []
    for idx in range(file_number):
        path = os.path.join(self.tmp_dir, f'test_{idx}.py')
        with open(path, 'w', encoding='utf-8') as f:
            f.write('import os')
        file_list.append(path)

    index_file = 'ast_indexer_1'

    def timed_load():
        # load the index over file_list and report the elapsed seconds
        begin = time.time()
        loaded = load_index(
            file_list=file_list,
            indexer_file_dir=self.tmp_dir,
            indexer_file=index_file)
        return loaded, time.time() - begin

    # first call performs the full indexing
    index, duration_full = timed_load()
    self.assertEqual(len(index[FILES_MTIME_KEY]), file_number)

    # no changes: cached load should take less time than the full build
    index, duration_cached = timed_load()
    self.assertGreater(duration_full, duration_cached)
    self.assertEqual(len(index[FILES_MTIME_KEY]), file_number)

    # one new file added: incremental update should still beat the full build
    extra_file = os.path.join(self.tmp_dir, 'test_new.py')
    with open(extra_file, 'w', encoding='utf-8') as f:
        f.write('import os')
    file_list.append(extra_file)
    index, duration_added = timed_load()
    self.assertGreater(duration_full, duration_added)
    self.assertEqual(len(index[FILES_MTIME_KEY]), file_number + 1)

    # one file removed: incremental update should still beat the full build
    file_list.pop()
    index, duration_removed = timed_load()
    self.assertGreater(duration_full, duration_removed)
    self.assertEqual(len(index[FILES_MTIME_KEY]), file_number)


if __name__ == '__main__':


Loading…
Cancel
Save