Browse Source

fix: txt编码问题

main
gjl 1 year ago
parent
commit
612075666f
3 changed files with 9 additions and 2 deletions
  1. requirements.txt (+2, -1)
  2. src/mindpilot/app/configs/kb_config.py (+1, -1)
  3. src/mindpilot/app/knowledge_base/kb_doc_api.py (+6, -0)

+ 2
- 1
requirements.txt View File

@@ -32,4 +32,5 @@ pymilvus~=2.4.5
 python-dateutil~=2.9.0post0
 unstructured~=0.15.8
 rank_bm25~=0.2.2
-langchain-huggingface~=0.0.3
+langchain-huggingface~=0.0.3
+python-magic-bin~=0.4.14

+ 1
- 1
src/mindpilot/app/configs/kb_config.py View File

@@ -83,7 +83,7 @@ kbs_config = {
 # TextSplitter配置项,如果你不明白其中的含义,就不要修改。
 text_splitter_dict = {
 "ChineseRecursiveTextSplitter": {
-"source": "", # 选择tiktoken则使用openai的方法 "huggingface"
+"source": "",
 "tokenizer_name_or_path": "",
 },
 "SpacyTextSplitter": {


+ 6
- 0
src/mindpilot/app/knowledge_base/kb_doc_api.py View File

@@ -3,6 +3,7 @@ import os
 import urllib
 from typing import Dict, List
 
+import chardet
 from fastapi import Body, File, Form, Query, UploadFile
 from fastapi.responses import FileResponse
 from langchain.docstore.document import Document
@@ -107,6 +108,11 @@ def _save_files_in_thread(
 data = {"knowledge_base_name": knowledge_base_name, "file_name": filename}
 
 file_content = file.file.read() # 读取上传文件的内容
+# 检测文件编码
+detected_encoding = chardet.detect(file_content)['encoding']
+if detected_encoding and 'gb2312' in detected_encoding.lower():
+    file_content = file_content.decode('GB18030').encode('utf-8')
+
 if (
     os.path.isfile(file_path)
     and not override


Loading…
Cancel
Save