diff --git a/requirements.txt b/requirements.txt index de5adf0..34c5859 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,5 @@ pymilvus~=2.4.5 python-dateutil~=2.9.0post0 unstructured~=0.15.8 rank_bm25~=0.2.2 -langchain-huggingface~=0.0.3 \ No newline at end of file +langchain-huggingface~=0.0.3 +python-magic-bin~=0.4.14 diff --git a/src/mindpilot/app/configs/kb_config.py b/src/mindpilot/app/configs/kb_config.py index 0afe341..4909fa9 100644 --- a/src/mindpilot/app/configs/kb_config.py +++ b/src/mindpilot/app/configs/kb_config.py @@ -83,7 +83,7 @@ kbs_config = { # TextSplitter配置项,如果你不明白其中的含义,就不要修改。 text_splitter_dict = { "ChineseRecursiveTextSplitter": { - "source": "", # 选择tiktoken则使用openai的方法 "huggingface" + "source": "", "tokenizer_name_or_path": "", }, "SpacyTextSplitter": { diff --git a/src/mindpilot/app/knowledge_base/kb_doc_api.py b/src/mindpilot/app/knowledge_base/kb_doc_api.py index fc7ff46..6f31b4d 100644 --- a/src/mindpilot/app/knowledge_base/kb_doc_api.py +++ b/src/mindpilot/app/knowledge_base/kb_doc_api.py @@ -3,6 +3,7 @@ import os import urllib from typing import Dict, List +import chardet from fastapi import Body, File, Form, Query, UploadFile from fastapi.responses import FileResponse from langchain.docstore.document import Document @@ -107,6 +108,11 @@ def _save_files_in_thread( data = {"knowledge_base_name": knowledge_base_name, "file_name": filename} file_content = file.file.read() # 读取上传文件的内容 + # 检测文件编码 + detected_encoding = chardet.detect(file_content)['encoding'] + if detected_encoding and 'gb2312' in detected_encoding.lower(): + file_content = file_content.decode('GB18030').encode('utf-8') + if ( os.path.isfile(file_path) and not override