增加知识库,知识图谱,文件上传的字数限制
This commit is contained in:
parent
1343dbbdcb
commit
329ef962ff
|
|
@ -17,6 +17,7 @@ from core.database import get_db_pool
|
||||||
from services.chat_thread_file_service import ChatThreadFileService
|
from services.chat_thread_file_service import ChatThreadFileService
|
||||||
from services.vector_service import get_vector_service
|
from services.vector_service import get_vector_service
|
||||||
from services.oss_service import get_oss_service
|
from services.oss_service import get_oss_service
|
||||||
|
from services.kb_text_limits import decode_txt_char_count, validate_chat_file_text_length
|
||||||
from models.chat_thread_file import (
|
from models.chat_thread_file import (
|
||||||
ChatThreadFileUploadResponse,
|
ChatThreadFileUploadResponse,
|
||||||
ChatThreadFileListResponse
|
ChatThreadFileListResponse
|
||||||
|
|
@ -438,6 +439,14 @@ async def upload_chat_file(
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)")
|
logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)")
|
||||||
|
|
||||||
|
# .txt 文件上传时直接校验字符数,其他格式在后台处理时校验
|
||||||
|
if file_ext == ".txt":
|
||||||
|
try:
|
||||||
|
validate_chat_file_text_length(decode_txt_char_count(content))
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning(f"❌ 文本文件字数超限: {file.filename}, {e}")
|
||||||
|
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
|
||||||
|
|
||||||
# 生成唯一文件名(使用时间戳)
|
# 生成唯一文件名(使用时间戳)
|
||||||
timestamp = int(time.time() * 1000)
|
timestamp = int(time.time() * 1000)
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ from services.knowledge_base_service import KnowledgeBaseService
|
||||||
from services.knowledge_base_file_service import KnowledgeBaseFileService
|
from services.knowledge_base_file_service import KnowledgeBaseFileService
|
||||||
from services.audit_service import AuditService
|
from services.audit_service import AuditService
|
||||||
from services.vector_service import get_vector_service
|
from services.vector_service import get_vector_service
|
||||||
|
from services.kb_text_limits import decode_txt_char_count, validate_kb_text_length
|
||||||
from services.oss_service import get_oss_service
|
from services.oss_service import get_oss_service
|
||||||
from utils.helpers import BaseResponse
|
from utils.helpers import BaseResponse
|
||||||
from logger.logging import get_logger
|
from logger.logging import get_logger
|
||||||
|
|
@ -356,6 +357,13 @@ async def upload_file(
|
||||||
raise BadRequestError(f"文件大小超过限制,当前: {file_size_mb:.2f}MB,最大允许: 15MB")
|
raise BadRequestError(f"文件大小超过限制,当前: {file_size_mb:.2f}MB,最大允许: 15MB")
|
||||||
|
|
||||||
logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)")
|
logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)")
|
||||||
|
|
||||||
|
if file_ext == ".txt":
|
||||||
|
try:
|
||||||
|
validate_kb_text_length(decode_txt_char_count(content))
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning(f"❌ 文本文件字数超限: {file.filename}, {e}")
|
||||||
|
raise BadRequestError(str(e)) from e
|
||||||
|
|
||||||
# 生成唯一文件名
|
# 生成唯一文件名
|
||||||
timestamp = int(time.time() * 1000)
|
timestamp = int(time.time() * 1000)
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ from services import neo4j_service
|
||||||
from services.novel_kg_service import (
|
from services.novel_kg_service import (
|
||||||
extract_and_import_knowledge_graph,
|
extract_and_import_knowledge_graph,
|
||||||
extract_knowledge_document_text,
|
extract_knowledge_document_text,
|
||||||
|
validate_knowledge_graph_text_length,
|
||||||
)
|
)
|
||||||
from utils.helpers import BaseResponse
|
from utils.helpers import BaseResponse
|
||||||
from logger.logging import get_logger
|
from logger.logging import get_logger
|
||||||
|
|
@ -139,6 +140,7 @@ async def create_knowledge_graph(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text = await extract_knowledge_document_text(file.filename, raw)
|
text = await extract_knowledge_document_text(file.filename, raw)
|
||||||
|
validate_knowledge_graph_text_length(text)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
raise HTTPException(status_code=400, detail=str(e)) from e
|
raise HTTPException(status_code=400, detail=str(e)) from e
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,57 @@
|
||||||
|
"""
|
||||||
|
知识库与对话文件的正文长度限制。
|
||||||
|
|
||||||
|
处理流程对比:
|
||||||
|
知识图谱 每块 1 次串行 LLM 抽关系 → 最严格,8 万字
|
||||||
|
知识库 批量 embedding + 单次摘要 → 较宽松,30 万字
|
||||||
|
对话文件 批量 embedding + 单次摘要(临时会话上下文)→ 20 万字
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ---------- 知识库 ----------
|
||||||
|
# chunk_size=4096、overlap=200 → 有效步长约 3896 字/块
|
||||||
|
# 30 万字 ≈ 77 块向量化,通常 1-2 分钟可完成
|
||||||
|
MAX_KB_INPUT_CHARS = 300_000
|
||||||
|
MAX_KB_VECTOR_CHUNKS = 80
|
||||||
|
|
||||||
|
# ---------- 对话文件(聊天时上传) ----------
|
||||||
|
# 对话文件是单次会话临时上下文,精度要求高于知识库;过长会稀释检索精度
|
||||||
|
# 20 万字 ≈ 51 块,兼顾处理速度与上下文覆盖
|
||||||
|
MAX_CHAT_FILE_INPUT_CHARS = 200_000
|
||||||
|
MAX_CHAT_FILE_VECTOR_CHUNKS = 60
|
||||||
|
|
||||||
|
|
||||||
|
def decode_txt_char_count(raw: bytes) -> int:
    """Estimate the number of characters in a plain-text upload.

    Intended as a quick check before upload, so the result is an estimate
    rather than an exact figure.

    Args:
        raw: Undecoded file content.

    Returns:
        Length of the decoded text after stripping surrounding whitespace.
        A leading UTF-8 byte order mark is not counted.
    """
    try:
        # "utf-8-sig" decodes like "utf-8" but drops a leading BOM; str.strip()
        # does not treat U+FEFF as whitespace, so it would otherwise be counted.
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to GB18030 and substitute undecodable
        # bytes instead of failing, since only an approximate count is needed.
        text = raw.decode("gb18030", errors="replace")
    return len(text.strip())
|
||||||
|
|
||||||
|
|
||||||
|
def validate_kb_text_length(char_count: int, *, chunk_count: int | None = None) -> None:
    """Reject knowledge-base input that is too large to process.

    Args:
        char_count: Number of characters in the extracted text.
        chunk_count: Number of chunks after splitting. The chunk check is
            skipped when this is None.

    Raises:
        ValueError: If char_count exceeds MAX_KB_INPUT_CHARS, or if
            chunk_count is given and exceeds MAX_KB_VECTOR_CHUNKS. The
            character check runs first.
    """
    if char_count > MAX_KB_INPUT_CHARS:
        too_long_message = (
            f"提取的正文过长(约 {char_count:,} 字),知识库单文件上限为 {MAX_KB_INPUT_CHARS:,} 字。"
            "请将文档拆分为多个文件分别上传。"
        )
        raise ValueError(too_long_message)

    # No chunk count supplied: only the character limit applies.
    if chunk_count is None:
        return

    if chunk_count > MAX_KB_VECTOR_CHUNKS:
        too_many_chunks_message = (
            f"文本分块过多({chunk_count} 块,上限 {MAX_KB_VECTOR_CHUNKS} 块),"
            f"请将正文控制在约 {MAX_KB_INPUT_CHARS:,} 字以内后重试。"
        )
        raise ValueError(too_many_chunks_message)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_chat_file_text_length(char_count: int, *, chunk_count: int | None = None) -> None:
    """Reject chat-file input that is too large to process.

    Args:
        char_count: Number of characters in the extracted text.
        chunk_count: Number of chunks after splitting. The chunk check is
            skipped when this is None.

    Raises:
        ValueError: If char_count exceeds MAX_CHAT_FILE_INPUT_CHARS, or if
            chunk_count is given and exceeds MAX_CHAT_FILE_VECTOR_CHUNKS. The
            character check runs first.
    """
    if char_count > MAX_CHAT_FILE_INPUT_CHARS:
        too_long_message = (
            f"提取的正文过长(约 {char_count:,} 字),对话文件上限为 {MAX_CHAT_FILE_INPUT_CHARS:,} 字。"
            "请将文档拆分后上传,或使用知识库功能处理长文档。"
        )
        raise ValueError(too_long_message)

    # No chunk count supplied: only the character limit applies.
    if chunk_count is None:
        return

    if chunk_count > MAX_CHAT_FILE_VECTOR_CHUNKS:
        too_many_chunks_message = (
            f"文本分块过多({chunk_count} 块,上限 {MAX_CHAT_FILE_VECTOR_CHUNKS} 块),"
            f"请将正文控制在约 {MAX_CHAT_FILE_INPUT_CHARS:,} 字以内后重试。"
        )
        raise ValueError(too_many_chunks_message)
|
||||||
|
|
@ -23,7 +23,10 @@ from logger.logging import get_logger
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
MAX_INPUT_CHARS = 800_000
|
# 知识图谱抽取上限:每块约 900 字、重叠 120,每块 1 次 DeepSeek 调用(串行)。
|
||||||
|
# 8 万字按有效步长约 780 字/块(900 − 120)估算约 103 次调用,后台约 5–10 分钟可完成;50 万字需 600+ 次,基本不可行。
# 注意:按此估算,接近 8 万字的文本分块数可能略高于下方的 MAX_KG_EXTRACT_CHUNKS,实际取决于分块实现。
|
||||||
|
MAX_INPUT_CHARS = 80_000
|
||||||
|
MAX_KG_EXTRACT_CHUNKS = 100
|
||||||
CHUNK_SIZE = 900
|
CHUNK_SIZE = 900
|
||||||
CHUNK_OVERLAP = 120
|
CHUNK_OVERLAP = 120
|
||||||
|
|
||||||
|
|
@ -644,17 +647,32 @@ def merge_triplets(chunks: list[list[dict[str, Any]]]) -> list[dict[str, Any]]:
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def validate_knowledge_graph_text_length(text: str) -> None:
    """Check that the text is short enough for knowledge-graph extraction.

    Args:
        text: Extracted document text.

    Raises:
        ValueError: If len(text) exceeds MAX_INPUT_CHARS.
    """
    char_count = len(text)
    # Guard clause: anything within the limit passes without further checks.
    if char_count <= MAX_INPUT_CHARS:
        return

    message = (
        f"提取的正文过长(约 {char_count:,} 字),知识图谱构建上限为 {MAX_INPUT_CHARS:,} 字。"
        "请拆分为多个文件分别上传,或使用知识库功能处理长文档问答。"
    )
    raise ValueError(message)
|
||||||
|
|
||||||
|
|
||||||
async def extract_and_import_knowledge_graph(text: str, graph_id: str) -> dict[str, Any]:
|
async def extract_and_import_knowledge_graph(text: str, graph_id: str) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
对整篇文本分块调用 LLM,合并三元组后写入 Neo4j。
|
对整篇文本分块调用 LLM,合并三元组后写入 Neo4j。
|
||||||
"""
|
"""
|
||||||
if len(text) > MAX_INPUT_CHARS:
|
validate_knowledge_graph_text_length(text)
|
||||||
raise ValueError(f"文本过长,请控制在约 {MAX_INPUT_CHARS} 字以内")
|
|
||||||
|
|
||||||
chunks = split_novel_text(text)
|
chunks = split_novel_text(text)
|
||||||
if not chunks:
|
if not chunks:
|
||||||
raise ValueError("文本为空")
|
raise ValueError("文本为空")
|
||||||
|
|
||||||
|
if len(chunks) > MAX_KG_EXTRACT_CHUNKS:
|
||||||
|
raise ValueError(
|
||||||
|
f"文本分块过多({len(chunks)} 块,上限 {MAX_KG_EXTRACT_CHUNKS} 块),"
|
||||||
|
f"请将正文控制在约 {MAX_INPUT_CHARS:,} 字以内后重试。"
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("知识图谱:共 {} 个文本块", len(chunks))
|
logger.info("知识图谱:共 {} 个文本块", len(chunks))
|
||||||
batch_results: list[list[dict[str, Any]]] = []
|
batch_results: list[list[dict[str, Any]]] = []
|
||||||
for i, ch in enumerate(chunks):
|
for i, ch in enumerate(chunks):
|
||||||
|
|
|
||||||
|
|
@ -78,6 +78,7 @@ import bs4
|
||||||
|
|
||||||
from logger.logging import get_logger
|
from logger.logging import get_logger
|
||||||
from core.config import settings
|
from core.config import settings
|
||||||
|
from services.kb_text_limits import validate_kb_text_length, validate_chat_file_text_length
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
@ -859,6 +860,14 @@ class VectorService:
|
||||||
error_msg = "未能从文件加载到任何内容"
|
error_msg = "未能从文件加载到任何内容"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
|
total_chars = sum(len(d.page_content or "") for d in docs)
|
||||||
|
try:
|
||||||
|
validate_kb_text_length(total_chars)
|
||||||
|
except ValueError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.warning(error_msg)
|
||||||
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
# 2. 分割文本
|
# 2. 分割文本
|
||||||
all_splits = self.text_splitter.split_documents(docs)
|
all_splits = self.text_splitter.split_documents(docs)
|
||||||
|
|
@ -869,6 +878,13 @@ class VectorService:
|
||||||
error_msg = "文档分割后没有内容,可能是空白文档"
|
error_msg = "文档分割后没有内容,可能是空白文档"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
|
try:
|
||||||
|
validate_kb_text_length(total_chars, chunk_count=len(all_splits))
|
||||||
|
except ValueError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.warning(error_msg)
|
||||||
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
# 3. 向量化并存储
|
# 3. 向量化并存储
|
||||||
collection_name = f"kb_{knowledge_base_id}"
|
collection_name = f"kb_{knowledge_base_id}"
|
||||||
|
|
@ -961,6 +977,14 @@ class VectorService:
|
||||||
error_msg = "未能从 URL 加载到任何内容"
|
error_msg = "未能从 URL 加载到任何内容"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
|
total_chars = sum(len(d.page_content or "") for d in docs)
|
||||||
|
try:
|
||||||
|
validate_kb_text_length(total_chars)
|
||||||
|
except ValueError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.warning(error_msg)
|
||||||
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
# 2. 分割文本
|
# 2. 分割文本
|
||||||
all_splits = self.text_splitter.split_documents(docs)
|
all_splits = self.text_splitter.split_documents(docs)
|
||||||
|
|
@ -971,6 +995,13 @@ class VectorService:
|
||||||
error_msg = "网页分割后没有内容,可能是空白页面或无法提取文本"
|
error_msg = "网页分割后没有内容,可能是空白页面或无法提取文本"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
|
try:
|
||||||
|
validate_kb_text_length(total_chars, chunk_count=len(all_splits))
|
||||||
|
except ValueError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.warning(error_msg)
|
||||||
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
# 3. 向量化并存储
|
# 3. 向量化并存储
|
||||||
collection_name = f"kb_{knowledge_base_id}"
|
collection_name = f"kb_{knowledge_base_id}"
|
||||||
|
|
@ -1081,6 +1112,14 @@ class VectorService:
|
||||||
error_msg = "未能加载到任何内容"
|
error_msg = "未能加载到任何内容"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
|
total_chars = sum(len(d.page_content or "") for d in docs)
|
||||||
|
try:
|
||||||
|
validate_chat_file_text_length(total_chars)
|
||||||
|
except ValueError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.warning(error_msg)
|
||||||
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
# 分割文本
|
# 分割文本
|
||||||
all_splits = self.text_splitter.split_documents(docs)
|
all_splits = self.text_splitter.split_documents(docs)
|
||||||
|
|
@ -1091,6 +1130,13 @@ class VectorService:
|
||||||
error_msg = "文档分割后没有内容,可能是空白文档或无法提取文本"
|
error_msg = "文档分割后没有内容,可能是空白文档或无法提取文本"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
|
try:
|
||||||
|
validate_chat_file_text_length(total_chars, chunk_count=len(all_splits))
|
||||||
|
except ValueError as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
logger.warning(error_msg)
|
||||||
|
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
|
||||||
|
|
||||||
# 向量化并存储(使用 thread_id 作为集合名)
|
# 向量化并存储(使用 thread_id 作为集合名)
|
||||||
collection_name = f"thread_{thread_id}"
|
collection_name = f"thread_{thread_id}"
|
||||||
|
|
|
||||||
|
|
@ -216,7 +216,7 @@
|
||||||
ref="fileInput"
|
ref="fileInput"
|
||||||
>
|
>
|
||||||
</label>
|
</label>
|
||||||
<small class="upload-hint">支持 PDF、DOCX、Excel(xlsx/xls)、CSV、TXT、图片(PNG、JPG、BMP)格式,文件将自动进行向量化处理</small>
|
<small class="upload-hint">支持 PDF、DOCX、Excel(xlsx/xls)、CSV、TXT、图片(PNG、JPG、BMP)格式,单文件不超过 15MB,提取正文不超过 30 万字(过长请拆分)</small>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- URL 上传 -->
|
<!-- URL 上传 -->
|
||||||
|
|
|
||||||
|
|
@ -219,8 +219,7 @@
|
||||||
class="form-control bg-dark text-white border-secondary"
|
class="form-control bg-dark text-white border-secondary"
|
||||||
@change="onFileSelect"
|
@change="onFileSelect"
|
||||||
/>
|
/>
|
||||||
<div class="mt-2 small text-muted">支持 .txt、.pdf、.docx 及常见图片;扫描件将尝试 OCR + 通义视觉。单文件不超过约 15MB</div>
|
<div class="mt-2 small text-muted">支持 .txt、.pdf、.docx 及常见图片;扫描件将尝试 OCR + 通义视觉。单文件不超过约 15MB,提取正文不超过 8 万字(过长请拆分或改用知识库)</div> </div>
|
||||||
</div>
|
|
||||||
<div v-if="uploadError" class="alert alert-danger py-2 small">{{ uploadError }}</div>
|
<div v-if="uploadError" class="alert alert-danger py-2 small">{{ uploadError }}</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="modal-footer-custom">
|
<div class="modal-footer-custom">
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue