Browse Source

fix: txt编码问题

main
gjl 1 year ago
parent
commit
612075666f
3 changed files with 9 additions and 2 deletions
  1. +2
    -1
      requirements.txt
  2. +1
    -1
      src/mindpilot/app/configs/kb_config.py
  3. +6
    -0
      src/mindpilot/app/knowledge_base/kb_doc_api.py

+ 2
- 1
requirements.txt View File

@@ -32,4 +32,5 @@ pymilvus~=2.4.5
python-dateutil~=2.9.0post0
unstructured~=0.15.8
rank_bm25~=0.2.2
-langchain-huggingface~=0.0.3
+langchain-huggingface~=0.0.3
+python-magic-bin~=0.4.14

+ 1
- 1
src/mindpilot/app/configs/kb_config.py View File

@@ -83,7 +83,7 @@ kbs_config = {
# TextSplitter配置项,如果你不明白其中的含义,就不要修改。
text_splitter_dict = {
"ChineseRecursiveTextSplitter": {
-"source": "", # 选择tiktoken则使用openai的方法 "huggingface"
+"source": "",
"tokenizer_name_or_path": "",
},
"SpacyTextSplitter": {


+ 6
- 0
src/mindpilot/app/knowledge_base/kb_doc_api.py View File

@@ -3,6 +3,7 @@ import os
import urllib
from typing import Dict, List

+import chardet
from fastapi import Body, File, Form, Query, UploadFile
from fastapi.responses import FileResponse
from langchain.docstore.document import Document
@@ -107,6 +108,11 @@ def _save_files_in_thread(
data = {"knowledge_base_name": knowledge_base_name, "file_name": filename}

file_content = file.file.read() # 读取上传文件的内容
+# 检测文件编码
+detected_encoding = chardet.detect(file_content)['encoding']
+if detected_encoding and 'gb2312' in detected_encoding.lower():
+file_content = file_content.decode('GB18030').encode('utf-8')

if (
os.path.isfile(file_path)
and not override


Loading…
Cancel
Save