Browse Source

fix: PDF,DOC,EXCEL,PPT无法解析问题

main
gjl 1 year ago
parent
commit
9390b1cda1
6 changed files with 20 additions and 30 deletions
  1. +5
    -0
      requirements.txt
  2. +1
    -1
      src/mindpilot/app/configs/__init__.py
  3. +1
    -1
      src/mindpilot/app/configs/kb_config.py
  4. +1
    -1
      src/mindpilot/app/knowledge_base/file_rag/document_loaders/myimgloader.py
  5. +2
    -2
      src/mindpilot/app/knowledge_base/file_rag/document_loaders/mypdfloader.py
  6. +10
    -25
      src/mindpilot/app/knowledge_base/utils.py

+ 5
- 0
requirements.txt View File

@@ -34,3 +34,8 @@ unstructured~=0.15.8
rank_bm25~=0.2.2
langchain-huggingface~=0.0.3
python-magic-bin~=0.4.14
python-docx~=1.1.2
rapidocr_onnxruntime~=1.3.24
python-pptx~=1.0.2
pyMuPDF~=1.24.10
openpyxl~=3.1.5

+ 1
- 1
src/mindpilot/app/configs/__init__.py View File

@@ -22,7 +22,7 @@ __all__ = [
"DEFAULT_SEARCH_ENGINE",
"SEARCH_ENGINE_TOP_K",
"ZH_TITLE_ENHANCE",
# "PDF_OCR_THRESHOLD",
"PDF_OCR_THRESHOLD",
"KB_INFO",
"CHATCHAT_ROOT",
"KB_ROOT_PATH",


+ 1
- 1
src/mindpilot/app/configs/kb_config.py View File

@@ -105,7 +105,7 @@ text_splitter_dict = {
}

# TEXT_SPLITTER 名称
TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter"
TEXT_SPLITTER_NAME = "RecursiveCharacterTextSplitter"

# Embedding模型定制词语的词表文件
EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt"


+ 1
- 1
src/mindpilot/app/knowledge_base/file_rag/document_loaders/myimgloader.py View File

@@ -2,7 +2,7 @@ from typing import List

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

from chatchat.server.file_rag.document_loaders.ocr import get_ocr
from .ocr import get_ocr


class RapidOCRLoader(UnstructuredFileLoader):


+ 2
- 2
src/mindpilot/app/knowledge_base/file_rag/document_loaders/mypdfloader.py View File

@@ -6,8 +6,8 @@ import tqdm
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from PIL import Image

from chatchat.configs import PDF_OCR_THRESHOLD
from chatchat.server.file_rag.document_loaders.ocr import get_ocr
from ....configs import PDF_OCR_THRESHOLD
from .ocr import get_ocr


class RapidOCRPDFLoader(UnstructuredFileLoader):


+ 10
- 25
src/mindpilot/app/knowledge_base/utils.py View File

@@ -107,14 +107,14 @@ LOADER_DICT = {
"JSONLoader": [".json"],
"JSONLinesLoader": [".jsonl"],
"CSVLoader": [".csv"],
# "FilteredCSVLoader": [".csv"], 如果使用自定义分割csv
# "RapidOCRPDFLoader": [".pdf"],
# "RapidOCRDocLoader": [".docx", ".doc"],
# "RapidOCRPPTLoader": [
# ".ppt",
# ".pptx",
# ],
# "RapidOCRLoader": [".png", ".jpg", ".jpeg", ".bmp"],
"FilteredCSVLoader": [".csv"], # 如果使用自定义分割csv
"RapidOCRPDFLoader": [".pdf"],
"RapidOCRDocLoader": [".docx", ".doc"],
"RapidOCRPPTLoader": [
".ppt",
".pptx",
],
"RapidOCRLoader": [".png", ".jpg", ".jpeg", ".bmp"],
"PyPDFLoader": [".pdf"],
"UnstructuredFileLoader": [
".eml",
@@ -186,7 +186,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None):
"RapidOCRPPTLoader",
]:
document_loaders_module = importlib.import_module(
"chatchat.server.file_rag.document_loaders"
"app.knowledge_base.file_rag.document_loaders"
)
else:
document_loaders_module = importlib.import_module(
@@ -221,7 +221,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None):
loader_kwargs.setdefault("jq_schema", ".")
loader_kwargs.setdefault("text_content", False)

loader = DocumentLoader(file_path, **loader_kwargs)
loader = DocumentLoader(str(Path(file_path).resolve()), **loader_kwargs)
return loader


@@ -467,18 +467,3 @@ def files2docs_in_thread(
func=files2docs_in_thread_file2docs, params=kwargs_list
):
yield result


if __name__ == "__main__":
from pprint import pprint

kb_file = KnowledgeFile(
filename="E:\\LLM\\Data\\Test.md", knowledge_base_name="samples"
)
# kb_file.text_splitter_name = "RecursiveCharacterTextSplitter"
kb_file.text_splitter_name = "MarkdownHeaderTextSplitter"
docs = kb_file.file2docs()
# pprint(docs[-1])
texts = kb_file.docs2texts(docs)
for text in texts:
print(text)

Loading…
Cancel
Save