From 87290ed6f011219c7a12391a256e1935a2edd537 Mon Sep 17 00:00:00 2001 From: "piaoyu.lxy" Date: Wed, 17 Aug 2022 21:16:13 +0800 Subject: [PATCH] [to #42322933] fix punkt file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 删除模型仓库中多余的文件,只保留punkt.zip文件,运行时解压 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9800149 --- modelscope/preprocessors/star/fields/common_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modelscope/preprocessors/star/fields/common_utils.py b/modelscope/preprocessors/star/fields/common_utils.py index 2d33b7ab..431e66b6 100644 --- a/modelscope/preprocessors/star/fields/common_utils.py +++ b/modelscope/preprocessors/star/fields/common_utils.py @@ -193,6 +193,15 @@ class SubPreprocessor(): from nltk import data data.path.append(os.path.join(self.model_dir, 'nltk_data')) + + zippath = os.path.join(self.model_dir, 'nltk_data/tokenizers/punkt') + if os.path.exists(zippath): + print('punkt has already exist!') + else: + import zipfile + with zipfile.ZipFile(zippath + '.zip') as zf: + zf.extractall( + os.path.join(self.model_dir, 'nltk_data/tokenizers/')) question = nltk.word_tokenize(question) question = mwtokenizer.tokenize(question)