From 9390b1cda156126bcb48a2ca134817256cfe9dd8 Mon Sep 17 00:00:00 2001 From: gjl <2802427218@qq.com> Date: Wed, 11 Sep 2024 15:31:27 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20PDF,DOC,EXCEL,PPT=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 5 +++ src/mindpilot/app/configs/__init__.py | 2 +- src/mindpilot/app/configs/kb_config.py | 2 +- .../file_rag/document_loaders/myimgloader.py | 2 +- .../file_rag/document_loaders/mypdfloader.py | 4 +-- src/mindpilot/app/knowledge_base/utils.py | 35 ++++++------------- 6 files changed, 20 insertions(+), 30 deletions(-) diff --git a/requirements.txt b/requirements.txt index 34c5859..12485d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,3 +34,8 @@ unstructured~=0.15.8 rank_bm25~=0.2.2 langchain-huggingface~=0.0.3 python-magic-bin~=0.4.14 +python-docx~=1.1.2 +rapidocr_onnxruntime~=1.3.24 +python-pptx~=1.0.2 +pyMuPDF~=1.24.10 +openpyxl~=3.1.5 diff --git a/src/mindpilot/app/configs/__init__.py b/src/mindpilot/app/configs/__init__.py index a3328df..a5e2369 100644 --- a/src/mindpilot/app/configs/__init__.py +++ b/src/mindpilot/app/configs/__init__.py @@ -22,7 +22,7 @@ __all__ = [ "DEFAULT_SEARCH_ENGINE", "SEARCH_ENGINE_TOP_K", "ZH_TITLE_ENHANCE", - # "PDF_OCR_THRESHOLD", + "PDF_OCR_THRESHOLD", "KB_INFO", "CHATCHAT_ROOT", "KB_ROOT_PATH", diff --git a/src/mindpilot/app/configs/kb_config.py b/src/mindpilot/app/configs/kb_config.py index 4909fa9..9c7da75 100644 --- a/src/mindpilot/app/configs/kb_config.py +++ b/src/mindpilot/app/configs/kb_config.py @@ -105,7 +105,7 @@ text_splitter_dict = { } # TEXT_SPLITTER 名称 -TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter" +TEXT_SPLITTER_NAME = "RecursiveCharacterTextSplitter" # Embedding模型定制词语的词表文件 EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt" diff --git a/src/mindpilot/app/knowledge_base/file_rag/document_loaders/myimgloader.py b/src/mindpilot/app/knowledge_base/file_rag/document_loaders/myimgloader.py index f11b6c5..cab9b49 100644 --- a/src/mindpilot/app/knowledge_base/file_rag/document_loaders/myimgloader.py +++ b/src/mindpilot/app/knowledge_base/file_rag/document_loaders/myimgloader.py @@ -2,7 +2,7 @@ from typing import List from langchain_community.document_loaders.unstructured import UnstructuredFileLoader -from chatchat.server.file_rag.document_loaders.ocr import get_ocr +from .ocr import get_ocr class RapidOCRLoader(UnstructuredFileLoader): diff --git a/src/mindpilot/app/knowledge_base/file_rag/document_loaders/mypdfloader.py b/src/mindpilot/app/knowledge_base/file_rag/document_loaders/mypdfloader.py index aa981a2..7284a3d 100644 --- a/src/mindpilot/app/knowledge_base/file_rag/document_loaders/mypdfloader.py +++ b/src/mindpilot/app/knowledge_base/file_rag/document_loaders/mypdfloader.py @@ -6,8 +6,8 @@ import tqdm from langchain_community.document_loaders.unstructured import UnstructuredFileLoader from PIL import Image -from chatchat.configs import PDF_OCR_THRESHOLD -from chatchat.server.file_rag.document_loaders.ocr import get_ocr +from ....configs import PDF_OCR_THRESHOLD +from .ocr import get_ocr class RapidOCRPDFLoader(UnstructuredFileLoader): diff --git a/src/mindpilot/app/knowledge_base/utils.py b/src/mindpilot/app/knowledge_base/utils.py index 4bec933..1285b88 100644 --- a/src/mindpilot/app/knowledge_base/utils.py +++ b/src/mindpilot/app/knowledge_base/utils.py @@ -107,14 +107,14 @@ LOADER_DICT = { "JSONLoader": [".json"], "JSONLinesLoader": [".jsonl"], "CSVLoader": [".csv"], - # "FilteredCSVLoader": [".csv"], 如果使用自定义分割csv - # "RapidOCRPDFLoader": [".pdf"], - # "RapidOCRDocLoader": [".docx", ".doc"], - # "RapidOCRPPTLoader": [ - # ".ppt", - # ".pptx", - # ], - # "RapidOCRLoader": [".png", ".jpg", ".jpeg", ".bmp"], + "FilteredCSVLoader": [".csv"], # 如果使用自定义分割csv + "RapidOCRPDFLoader": [".pdf"], + "RapidOCRDocLoader": [".docx", ".doc"], + "RapidOCRPPTLoader": [ + ".ppt", + ".pptx", + ], + "RapidOCRLoader": [".png", ".jpg", ".jpeg", ".bmp"], "PyPDFLoader": [".pdf"], "UnstructuredFileLoader": [ ".eml", @@ -186,7 +186,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None): "RapidOCRPPTLoader", ]: document_loaders_module = importlib.import_module( - "chatchat.server.file_rag.document_loaders" + "app.knowledge_base.file_rag.document_loaders" ) else: document_loaders_module = importlib.import_module( @@ -221,7 +221,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None): loader_kwargs.setdefault("jq_schema", ".") loader_kwargs.setdefault("text_content", False) - loader = DocumentLoader(file_path, **loader_kwargs) + loader = DocumentLoader(str(Path(file_path).resolve()), **loader_kwargs) return loader @@ -467,18 +467,3 @@ def files2docs_in_thread( func=files2docs_in_thread_file2docs, params=kwargs_list ): yield result - - -if __name__ == "__main__": - from pprint import pprint - - kb_file = KnowledgeFile( - filename="E:\\LLM\\Data\\Test.md", knowledge_base_name="samples" - ) - # kb_file.text_splitter_name = "RecursiveCharacterTextSplitter" - kb_file.text_splitter_name = "MarkdownHeaderTextSplitter" - docs = kb_file.file2docs() - # pprint(docs[-1]) - texts = kb_file.docs2texts(docs) - for text in texts: - print(text)