增加知识库,知识图谱,文件上传的字数限制

This commit is contained in:
silk 2026-06-06 13:23:40 +08:00
parent 1343dbbdcb
commit 329ef962ff
8 changed files with 145 additions and 6 deletions

View File

@ -17,6 +17,7 @@ from core.database import get_db_pool
from services.chat_thread_file_service import ChatThreadFileService from services.chat_thread_file_service import ChatThreadFileService
from services.vector_service import get_vector_service from services.vector_service import get_vector_service
from services.oss_service import get_oss_service from services.oss_service import get_oss_service
from services.kb_text_limits import decode_txt_char_count, validate_chat_file_text_length
from models.chat_thread_file import ( from models.chat_thread_file import (
ChatThreadFileUploadResponse, ChatThreadFileUploadResponse,
ChatThreadFileListResponse ChatThreadFileListResponse
@ -438,6 +439,14 @@ async def upload_chat_file(
) )
logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)") logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)")
# .txt 文件上传时直接校验字符数,其他格式在后台处理时校验
if file_ext == ".txt":
try:
validate_chat_file_text_length(decode_txt_char_count(content))
except ValueError as e:
logger.warning(f"❌ 文本文件字数超限: {file.filename}, {e}")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
# 生成唯一文件名(使用时间戳) # 生成唯一文件名(使用时间戳)
timestamp = int(time.time() * 1000) timestamp = int(time.time() * 1000)

View File

@ -23,6 +23,7 @@ from services.knowledge_base_service import KnowledgeBaseService
from services.knowledge_base_file_service import KnowledgeBaseFileService from services.knowledge_base_file_service import KnowledgeBaseFileService
from services.audit_service import AuditService from services.audit_service import AuditService
from services.vector_service import get_vector_service from services.vector_service import get_vector_service
from services.kb_text_limits import decode_txt_char_count, validate_kb_text_length
from services.oss_service import get_oss_service from services.oss_service import get_oss_service
from utils.helpers import BaseResponse from utils.helpers import BaseResponse
from logger.logging import get_logger from logger.logging import get_logger
@ -356,6 +357,13 @@ async def upload_file(
raise BadRequestError(f"文件大小超过限制,当前: {file_size_mb:.2f}MB最大允许: 15MB") raise BadRequestError(f"文件大小超过限制,当前: {file_size_mb:.2f}MB最大允许: 15MB")
logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)") logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)")
if file_ext == ".txt":
try:
validate_kb_text_length(decode_txt_char_count(content))
except ValueError as e:
logger.warning(f"❌ 文本文件字数超限: {file.filename}, {e}")
raise BadRequestError(str(e)) from e
# 生成唯一文件名 # 生成唯一文件名
timestamp = int(time.time() * 1000) timestamp = int(time.time() * 1000)

View File

@ -22,6 +22,7 @@ from services import neo4j_service
from services.novel_kg_service import ( from services.novel_kg_service import (
extract_and_import_knowledge_graph, extract_and_import_knowledge_graph,
extract_knowledge_document_text, extract_knowledge_document_text,
validate_knowledge_graph_text_length,
) )
from utils.helpers import BaseResponse from utils.helpers import BaseResponse
from logger.logging import get_logger from logger.logging import get_logger
@ -139,6 +140,7 @@ async def create_knowledge_graph(
try: try:
text = await extract_knowledge_document_text(file.filename, raw) text = await extract_knowledge_document_text(file.filename, raw)
validate_knowledge_graph_text_length(text)
except ValueError as e: except ValueError as e:
raise HTTPException(status_code=400, detail=str(e)) from e raise HTTPException(status_code=400, detail=str(e)) from e

View File

@ -0,0 +1,57 @@
"""
知识库与对话文件的正文长度限制
处理流程对比:
- 知识图谱:每块 1 次串行 LLM 抽关系 → 最严格(8 万字)
- 知识库:批量 embedding + 单次摘要 → 较宽松(30 万字)
- 对话文件:批量 embedding + 单次摘要(临时会话上下文)→ 20 万字
"""
# ---------- 知识库 ----------
# chunk_size=4096、overlap=200 → 有效步长约 3896 字/块
# 30 万字 ≈ 77 块向量化,通常 1-2 分钟可完成
MAX_KB_INPUT_CHARS = 300_000
MAX_KB_VECTOR_CHUNKS = 80
# ---------- 对话文件(聊天时上传) ----------
# 对话文件是单次会话临时上下文,精度要求高于知识库;过长会稀释检索精度
# 20 万字 ≈ 51 块,兼顾处理速度与上下文覆盖
MAX_CHAT_FILE_INPUT_CHARS = 200_000
MAX_CHAT_FILE_VECTOR_CHUNKS = 60
def decode_txt_char_count(raw: bytes) -> int:
    """Estimate the character count of a plain-text upload (fast pre-check).

    The bytes are decoded with the first encoding that applies:

    1. UTF-16, when the data starts with a UTF-16 byte-order mark.
    2. UTF-8 (a leading BOM, if any, is dropped).
    3. GB18030 with undecodable bytes replaced, as a last resort.

    Args:
        raw: Undecoded file content.

    Returns:
        Length of the decoded text after stripping leading and trailing
        whitespace. A byte-order mark is never counted.
    """
    # Neither UTF-16 BOM is a valid start of UTF-8 or GB18030 data, so such
    # files would otherwise always end up in the lossy GB18030 fallback and
    # report a distorted character count.
    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        text = raw.decode("utf-16", errors="replace")
    else:
        try:
            # "utf-8-sig" removes a leading BOM; plain "utf-8" would keep it
            # as U+FEFF, which str.strip() does not treat as whitespace.
            text = raw.decode("utf-8-sig")
        except UnicodeDecodeError:
            text = raw.decode("gb18030", errors="replace")
    return len(text.strip())
def validate_kb_text_length(char_count: int, *, chunk_count: int | None = None) -> None:
    """Reject knowledge-base text that exceeds the configured size limits.

    Args:
        char_count: Number of characters in the extracted text.
        chunk_count: Number of chunks after splitting. The chunk limit is
            only enforced when this is provided.

    Raises:
        ValueError: If ``char_count`` is greater than ``MAX_KB_INPUT_CHARS``,
            or ``chunk_count`` is given and greater than
            ``MAX_KB_VECTOR_CHUNKS``. The message is user-facing.
    """
    # The character limit is checked first so its message takes precedence.
    if char_count > MAX_KB_INPUT_CHARS:
        too_long_message = (
            f"提取的正文过长(约 {char_count:,} 字),知识库单文件上限为 {MAX_KB_INPUT_CHARS:,} 字。"
            "请将文档拆分为多个文件分别上传。"
        )
        raise ValueError(too_long_message)

    over_chunk_limit = chunk_count is not None and chunk_count > MAX_KB_VECTOR_CHUNKS
    if over_chunk_limit:
        too_many_chunks_message = (
            f"文本分块过多({chunk_count} 块,上限 {MAX_KB_VECTOR_CHUNKS} 块),"
            f"请将正文控制在约 {MAX_KB_INPUT_CHARS:,} 字以内后重试。"
        )
        raise ValueError(too_many_chunks_message)
def validate_chat_file_text_length(char_count: int, *, chunk_count: int | None = None) -> None:
    """Reject chat-file text that exceeds the configured size limits.

    Args:
        char_count: Number of characters in the extracted text.
        chunk_count: Number of chunks after splitting. The chunk limit is
            only enforced when this is provided.

    Raises:
        ValueError: If ``char_count`` is greater than
            ``MAX_CHAT_FILE_INPUT_CHARS``, or ``chunk_count`` is given and
            greater than ``MAX_CHAT_FILE_VECTOR_CHUNKS``. The message is
            user-facing.
    """
    # The character limit is checked first so its message takes precedence.
    if char_count > MAX_CHAT_FILE_INPUT_CHARS:
        too_long_message = (
            f"提取的正文过长(约 {char_count:,} 字),对话文件上限为 {MAX_CHAT_FILE_INPUT_CHARS:,} 字。"
            "请将文档拆分后上传,或使用知识库功能处理长文档。"
        )
        raise ValueError(too_long_message)

    over_chunk_limit = chunk_count is not None and chunk_count > MAX_CHAT_FILE_VECTOR_CHUNKS
    if over_chunk_limit:
        too_many_chunks_message = (
            f"文本分块过多({chunk_count} 块,上限 {MAX_CHAT_FILE_VECTOR_CHUNKS} 块),"
            f"请将正文控制在约 {MAX_CHAT_FILE_INPUT_CHARS:,} 字以内后重试。"
        )
        raise ValueError(too_many_chunks_message)

View File

@ -23,7 +23,10 @@ from logger.logging import get_logger
logger = get_logger(__name__) logger = get_logger(__name__)
MAX_INPUT_CHARS = 800_000 # 知识图谱抽取上限:每块约 900 字、重叠 120每块 1 次 DeepSeek 调用(串行)。
# 8 万字 ≈ 100 次调用,后台约 5~10 分钟可完成;50 万字需 600+ 次,基本不可行。
MAX_INPUT_CHARS = 80_000
MAX_KG_EXTRACT_CHUNKS = 100
CHUNK_SIZE = 900 CHUNK_SIZE = 900
CHUNK_OVERLAP = 120 CHUNK_OVERLAP = 120
@ -644,17 +647,32 @@ def merge_triplets(chunks: list[list[dict[str, Any]]]) -> list[dict[str, Any]]:
return merged return merged
def validate_knowledge_graph_text_length(text: str) -> None:
    """Check the text length before it enters the graph-extraction pipeline.

    Args:
        text: Extracted document text to be used for knowledge-graph building.

    Raises:
        ValueError: If ``len(text)`` is greater than ``MAX_INPUT_CHARS``.
            The message is user-facing.
    """
    char_count = len(text)
    if char_count <= MAX_INPUT_CHARS:
        return

    too_long_message = (
        f"提取的正文过长(约 {char_count:,} 字),知识图谱构建上限为 {MAX_INPUT_CHARS:,} 字。"
        "请拆分为多个文件分别上传,或使用知识库功能处理长文档问答。"
    )
    raise ValueError(too_long_message)
async def extract_and_import_knowledge_graph(text: str, graph_id: str) -> dict[str, Any]: async def extract_and_import_knowledge_graph(text: str, graph_id: str) -> dict[str, Any]:
""" """
对整篇文本分块调用 LLM合并三元组后写入 Neo4j 对整篇文本分块调用 LLM合并三元组后写入 Neo4j
""" """
if len(text) > MAX_INPUT_CHARS: validate_knowledge_graph_text_length(text)
raise ValueError(f"文本过长,请控制在约 {MAX_INPUT_CHARS} 字以内")
chunks = split_novel_text(text) chunks = split_novel_text(text)
if not chunks: if not chunks:
raise ValueError("文本为空") raise ValueError("文本为空")
if len(chunks) > MAX_KG_EXTRACT_CHUNKS:
raise ValueError(
f"文本分块过多({len(chunks)} 块,上限 {MAX_KG_EXTRACT_CHUNKS} 块),"
f"请将正文控制在约 {MAX_INPUT_CHARS:,} 字以内后重试。"
)
logger.info("知识图谱:共 {} 个文本块", len(chunks)) logger.info("知识图谱:共 {} 个文本块", len(chunks))
batch_results: list[list[dict[str, Any]]] = [] batch_results: list[list[dict[str, Any]]] = []
for i, ch in enumerate(chunks): for i, ch in enumerate(chunks):

View File

@ -78,6 +78,7 @@ import bs4
from logger.logging import get_logger from logger.logging import get_logger
from core.config import settings from core.config import settings
from services.kb_text_limits import validate_kb_text_length, validate_chat_file_text_length
logger = get_logger(__name__) logger = get_logger(__name__)
@ -859,6 +860,14 @@ class VectorService:
error_msg = "未能从文件加载到任何内容" error_msg = "未能从文件加载到任何内容"
logger.warning(error_msg) logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
total_chars = sum(len(d.page_content or "") for d in docs)
try:
validate_kb_text_length(total_chars)
except ValueError as e:
error_msg = str(e)
logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
# 2. 分割文本 # 2. 分割文本
all_splits = self.text_splitter.split_documents(docs) all_splits = self.text_splitter.split_documents(docs)
@ -869,6 +878,13 @@ class VectorService:
error_msg = "文档分割后没有内容,可能是空白文档" error_msg = "文档分割后没有内容,可能是空白文档"
logger.warning(error_msg) logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
try:
validate_kb_text_length(total_chars, chunk_count=len(all_splits))
except ValueError as e:
error_msg = str(e)
logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
# 3. 向量化并存储 # 3. 向量化并存储
collection_name = f"kb_{knowledge_base_id}" collection_name = f"kb_{knowledge_base_id}"
@ -961,6 +977,14 @@ class VectorService:
error_msg = "未能从 URL 加载到任何内容" error_msg = "未能从 URL 加载到任何内容"
logger.warning(error_msg) logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
total_chars = sum(len(d.page_content or "") for d in docs)
try:
validate_kb_text_length(total_chars)
except ValueError as e:
error_msg = str(e)
logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
# 2. 分割文本 # 2. 分割文本
all_splits = self.text_splitter.split_documents(docs) all_splits = self.text_splitter.split_documents(docs)
@ -971,6 +995,13 @@ class VectorService:
error_msg = "网页分割后没有内容,可能是空白页面或无法提取文本" error_msg = "网页分割后没有内容,可能是空白页面或无法提取文本"
logger.warning(error_msg) logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
try:
validate_kb_text_length(total_chars, chunk_count=len(all_splits))
except ValueError as e:
error_msg = str(e)
logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
# 3. 向量化并存储 # 3. 向量化并存储
collection_name = f"kb_{knowledge_base_id}" collection_name = f"kb_{knowledge_base_id}"
@ -1081,6 +1112,14 @@ class VectorService:
error_msg = "未能加载到任何内容" error_msg = "未能加载到任何内容"
logger.warning(error_msg) logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
total_chars = sum(len(d.page_content or "") for d in docs)
try:
validate_chat_file_text_length(total_chars)
except ValueError as e:
error_msg = str(e)
logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
# 分割文本 # 分割文本
all_splits = self.text_splitter.split_documents(docs) all_splits = self.text_splitter.split_documents(docs)
@ -1091,6 +1130,13 @@ class VectorService:
error_msg = "文档分割后没有内容,可能是空白文档或无法提取文本" error_msg = "文档分割后没有内容,可能是空白文档或无法提取文本"
logger.warning(error_msg) logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
try:
validate_chat_file_text_length(total_chars, chunk_count=len(all_splits))
except ValueError as e:
error_msg = str(e)
logger.warning(error_msg)
return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg)
# 向量化并存储(使用 thread_id 作为集合名) # 向量化并存储(使用 thread_id 作为集合名)
collection_name = f"thread_{thread_id}" collection_name = f"thread_{thread_id}"

View File

@ -216,7 +216,7 @@
ref="fileInput" ref="fileInput"
> >
</label> </label>
<small class="upload-hint">支持 PDFDOCXExcelxlsx/xlsCSVTXT图片PNGJPGBMP格式文件将自动进行向量化处理</small> <small class="upload-hint">支持 PDFDOCXExcelxlsx/xlsCSVTXT图片PNGJPGBMP格式单文件不超过 15MB提取正文不超过 30 万字过长请拆分</small>
</div> </div>
<!-- URL 上传 --> <!-- URL 上传 -->

View File

@ -219,8 +219,7 @@
class="form-control bg-dark text-white border-secondary" class="form-control bg-dark text-white border-secondary"
@change="onFileSelect" @change="onFileSelect"
/> />
<div class="mt-2 small text-muted">支持 .txt.pdf.docx 及常见图片扫描件将尝试 OCR + 通义视觉单文件不超过约 15MB</div> <div class="mt-2 small text-muted">支持 .txt.pdf.docx 及常见图片扫描件将尝试 OCR + 通义视觉单文件不超过约 15MB提取正文建议不超过 8 万字过长请拆分或改用知识库</div> </div>
</div>
<div v-if="uploadError" class="alert alert-danger py-2 small">{{ uploadError }}</div> <div v-if="uploadError" class="alert alert-danger py-2 small">{{ uploadError }}</div>
</div> </div>
<div class="modal-footer-custom"> <div class="modal-footer-custom">