diff --git a/backend/api/chat_file.py b/backend/api/chat_file.py index 21b23af..f901f7d 100644 --- a/backend/api/chat_file.py +++ b/backend/api/chat_file.py @@ -17,6 +17,7 @@ from core.database import get_db_pool from services.chat_thread_file_service import ChatThreadFileService from services.vector_service import get_vector_service from services.oss_service import get_oss_service +from services.kb_text_limits import decode_txt_char_count, validate_chat_file_text_length from models.chat_thread_file import ( ChatThreadFileUploadResponse, ChatThreadFileListResponse @@ -438,6 +439,14 @@ async def upload_chat_file( ) logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)") + + # .txt 文件上传时直接校验字符数,其他格式在后台处理时校验 + if file_ext == ".txt": + try: + validate_chat_file_text_length(decode_txt_char_count(content)) + except ValueError as e: + logger.warning(f"❌ 文本文件字数超限: {file.filename}, {e}") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) # 生成唯一文件名(使用时间戳) timestamp = int(time.time() * 1000) diff --git a/backend/api/kb_file_router.py b/backend/api/kb_file_router.py index 614ede6..85315bf 100644 --- a/backend/api/kb_file_router.py +++ b/backend/api/kb_file_router.py @@ -23,6 +23,7 @@ from services.knowledge_base_service import KnowledgeBaseService from services.knowledge_base_file_service import KnowledgeBaseFileService from services.audit_service import AuditService from services.vector_service import get_vector_service +from services.kb_text_limits import decode_txt_char_count, validate_kb_text_length from services.oss_service import get_oss_service from utils.helpers import BaseResponse from logger.logging import get_logger @@ -356,6 +357,13 @@ async def upload_file( raise BadRequestError(f"文件大小超过限制,当前: {file_size_mb:.2f}MB,最大允许: 15MB") logger.info(f"✅ 文件大小验证通过: {file_size_mb:.2f}MB ({file_size} bytes)") + + if file_ext == ".txt": + try: + validate_kb_text_length(decode_txt_char_count(content)) + except ValueError as e: + logger.warning(f"❌ 文本文件字数超限: {file.filename}, {e}") + raise BadRequestError(str(e)) from e # 生成唯一文件名 timestamp = int(time.time() * 1000) diff --git a/backend/api/knowledge_graph_router.py b/backend/api/knowledge_graph_router.py index b9f180b..724896e 100644 --- a/backend/api/knowledge_graph_router.py +++ b/backend/api/knowledge_graph_router.py @@ -22,6 +22,7 @@ from services import neo4j_service from services.novel_kg_service import ( extract_and_import_knowledge_graph, extract_knowledge_document_text, + validate_knowledge_graph_text_length, ) from utils.helpers import BaseResponse from logger.logging import get_logger @@ -139,6 +140,7 @@ async def create_knowledge_graph( try: text = await extract_knowledge_document_text(file.filename, raw) + validate_knowledge_graph_text_length(text) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) from e diff --git a/backend/services/kb_text_limits.py b/backend/services/kb_text_limits.py new file mode 100644 index 0000000..0f932b2 --- /dev/null +++ b/backend/services/kb_text_limits.py @@ -0,0 +1,57 @@ +""" +知识库与对话文件的正文长度限制。 + +处理流程对比: + 知识图谱 每块 1 次串行 LLM 抽关系 → 最严格,8 万字 + 知识库 批量 embedding + 单次摘要 → 较宽松,30 万字 + 对话文件 批量 embedding + 单次摘要(临时会话上下文)→ 20 万字 +""" + +# ---------- 知识库 ---------- +# chunk_size=4096、overlap=200 → 有效步长约 3896 字/块 +# 30 万字 ≈ 77 块向量化,通常 1-2 分钟可完成 +MAX_KB_INPUT_CHARS = 300_000 +MAX_KB_VECTOR_CHUNKS = 80 + +# ---------- 对话文件(聊天时上传) ---------- +# 对话文件是单次会话临时上下文,精度要求高于知识库;过长会稀释检索精度 +# 20 万字 ≈ 51 块,兼顾处理速度与上下文覆盖 +MAX_CHAT_FILE_INPUT_CHARS = 200_000 +MAX_CHAT_FILE_VECTOR_CHUNKS = 60 + + +def decode_txt_char_count(raw: bytes) -> int: + """估算纯文本文件字符数(上传前快速校验)。""" + try: + text = raw.decode("utf-8") + except UnicodeDecodeError: + text = raw.decode("gb18030", errors="replace") + return len(text.strip()) + + +def validate_kb_text_length(char_count: int, *, chunk_count: int | None = None) -> None: + """校验知识库可处理的正文规模。""" + if char_count > MAX_KB_INPUT_CHARS: + raise ValueError( + f"提取的正文过长(约 {char_count:,} 字),知识库单文件上限为 {MAX_KB_INPUT_CHARS:,} 字。" + "请将文档拆分为多个文件分别上传。" + ) + if chunk_count is not None and chunk_count > MAX_KB_VECTOR_CHUNKS: + raise ValueError( + f"文本分块过多({chunk_count} 块,上限 {MAX_KB_VECTOR_CHUNKS} 块)," + f"请将正文控制在约 {MAX_KB_INPUT_CHARS:,} 字以内后重试。" + ) + + +def validate_chat_file_text_length(char_count: int, *, chunk_count: int | None = None) -> None: + """校验对话文件可处理的正文规模。""" + if char_count > MAX_CHAT_FILE_INPUT_CHARS: + raise ValueError( + f"提取的正文过长(约 {char_count:,} 字),对话文件上限为 {MAX_CHAT_FILE_INPUT_CHARS:,} 字。" + "请将文档拆分后上传,或使用知识库功能处理长文档。" + ) + if chunk_count is not None and chunk_count > MAX_CHAT_FILE_VECTOR_CHUNKS: + raise ValueError( + f"文本分块过多({chunk_count} 块,上限 {MAX_CHAT_FILE_VECTOR_CHUNKS} 块)," + f"请将正文控制在约 {MAX_CHAT_FILE_INPUT_CHARS:,} 字以内后重试。" + ) diff --git a/backend/services/novel_kg_service.py b/backend/services/novel_kg_service.py index 7fd3743..1843c7d 100644 --- a/backend/services/novel_kg_service.py +++ b/backend/services/novel_kg_service.py @@ -23,7 +23,10 @@ from logger.logging import get_logger logger = get_logger(__name__) -MAX_INPUT_CHARS = 800_000 +# 知识图谱抽取上限:每块约 900 字、重叠 120,每块 1 次 DeepSeek 调用(串行)。 +# 8 万字 ≈ 100 次调用,后台约 5–10 分钟可完成;50 万字需 600+ 次,基本不可行。 +MAX_INPUT_CHARS = 80_000 +MAX_KG_EXTRACT_CHUNKS = 100 CHUNK_SIZE = 900 CHUNK_OVERLAP = 120 @@ -644,17 +647,32 @@ def merge_triplets(chunks: list[list[dict[str, Any]]]) -> list[dict[str, Any]]: return merged +def validate_knowledge_graph_text_length(text: str) -> None: + """上传/构建前校验正文长度,避免超长文本进入图谱抽取流水线。""" + char_count = len(text) + if char_count > MAX_INPUT_CHARS: + raise ValueError( + f"提取的正文过长(约 {char_count:,} 字),知识图谱构建上限为 {MAX_INPUT_CHARS:,} 字。" + "请拆分为多个文件分别上传,或使用知识库功能处理长文档问答。" + ) + + async def extract_and_import_knowledge_graph(text: str, graph_id: str) -> dict[str, Any]: """ 对整篇文本分块调用 LLM,合并三元组后写入 Neo4j。 """ - if len(text) > MAX_INPUT_CHARS: - raise ValueError(f"文本过长,请控制在约 {MAX_INPUT_CHARS} 字以内") + validate_knowledge_graph_text_length(text) chunks = split_novel_text(text) if not chunks: raise ValueError("文本为空") + if len(chunks) > MAX_KG_EXTRACT_CHUNKS: + raise ValueError( + f"文本分块过多({len(chunks)} 块,上限 {MAX_KG_EXTRACT_CHUNKS} 块)," + f"请将正文控制在约 {MAX_INPUT_CHARS:,} 字以内后重试。" + ) + logger.info("知识图谱:共 {} 个文本块", len(chunks)) batch_results: list[list[dict[str, Any]]] = [] for i, ch in enumerate(chunks): diff --git a/backend/services/vector_service.py b/backend/services/vector_service.py index 33f5760..e17486c 100644 --- a/backend/services/vector_service.py +++ b/backend/services/vector_service.py @@ -78,6 +78,7 @@ import bs4 from logger.logging import get_logger from core.config import settings +from services.kb_text_limits import validate_kb_text_length, validate_chat_file_text_length logger = get_logger(__name__) @@ -859,6 +860,14 @@ class VectorService: error_msg = "未能从文件加载到任何内容" logger.warning(error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) + + total_chars = sum(len(d.page_content or "") for d in docs) + try: + validate_kb_text_length(total_chars) + except ValueError as e: + error_msg = str(e) + logger.warning(error_msg) + return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) # 2. 分割文本 all_splits = self.text_splitter.split_documents(docs) @@ -869,6 +878,13 @@ class VectorService: error_msg = "文档分割后没有内容,可能是空白文档" logger.warning(error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) + + try: + validate_kb_text_length(total_chars, chunk_count=len(all_splits)) + except ValueError as e: + error_msg = str(e) + logger.warning(error_msg) + return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) # 3. 向量化并存储 collection_name = f"kb_{knowledge_base_id}" @@ -961,6 +977,14 @@ class VectorService: error_msg = "未能从 URL 加载到任何内容" logger.warning(error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) + + total_chars = sum(len(d.page_content or "") for d in docs) + try: + validate_kb_text_length(total_chars) + except ValueError as e: + error_msg = str(e) + logger.warning(error_msg) + return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) # 2. 分割文本 all_splits = self.text_splitter.split_documents(docs) @@ -971,6 +995,13 @@ class VectorService: error_msg = "网页分割后没有内容,可能是空白页面或无法提取文本" logger.warning(error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) + + try: + validate_kb_text_length(total_chars, chunk_count=len(all_splits)) + except ValueError as e: + error_msg = str(e) + logger.warning(error_msg) + return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) # 3. 向量化并存储 collection_name = f"kb_{knowledge_base_id}" @@ -1081,6 +1112,14 @@ class VectorService: error_msg = "未能加载到任何内容" logger.warning(error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) + + total_chars = sum(len(d.page_content or "") for d in docs) + try: + validate_chat_file_text_length(total_chars) + except ValueError as e: + error_msg = str(e) + logger.warning(error_msg) + return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) # 分割文本 all_splits = self.text_splitter.split_documents(docs) @@ -1091,6 +1130,13 @@ class VectorService: error_msg = "文档分割后没有内容,可能是空白文档或无法提取文本" logger.warning(error_msg) return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) + + try: + validate_chat_file_text_length(total_chars, chunk_count=len(all_splits)) + except ValueError as e: + error_msg = str(e) + logger.warning(error_msg) + return ProcessResult(success=False, chunks=[], chunk_count=0, error_message=error_msg) # 向量化并存储(使用 thread_id 作为集合名) collection_name = f"thread_{thread_id}" diff --git a/frontend/src/views/KnowledgeBase.vue b/frontend/src/views/KnowledgeBase.vue index 1642c1e..3d996c4 100644 --- a/frontend/src/views/KnowledgeBase.vue +++ b/frontend/src/views/KnowledgeBase.vue @@ -216,7 +216,7 @@ ref="fileInput" > - 支持 PDF、DOCX、Excel(xlsx/xls)、CSV、TXT、图片(PNG、JPG、BMP)格式,文件将自动进行向量化处理 + 支持 PDF、DOCX、Excel(xlsx/xls)、CSV、TXT、图片(PNG、JPG、BMP)格式,单文件不超过 15MB,提取正文不超过 30 万字(过长请拆分) diff --git a/frontend/src/views/KnowledgeGraph.vue b/frontend/src/views/KnowledgeGraph.vue index 2decb7c..8d18b81 100644 --- a/frontend/src/views/KnowledgeGraph.vue +++ b/frontend/src/views/KnowledgeGraph.vue @@ -219,8 +219,7 @@ class="form-control bg-dark text-white border-secondary" @change="onFileSelect" /> -
支持 .txt、.pdf、.docx 及常见图片;扫描件将尝试 OCR + 通义视觉。单文件不超过约 15MB
- +
支持 .txt、.pdf、.docx 及常见图片;扫描件将尝试 OCR + 通义视觉。单文件不超过约 15MB,提取正文建议不超过 8 万字(过长请拆分或改用知识库)
{{ uploadError }}