# huoyan-enterprise/backend/services/kb_text_limits.py

"""
知识库与对话文件的正文长度限制。

处理流程对比：
  知识图谱  每块 1 次串行 LLM 抽关系 → 最严格，8 万字
  知识库    批量 embedding + 单次摘要 → 较宽松，30 万字
  对话文件  批量 embedding + 单次摘要（临时会话上下文）→ 20 万字
"""

# ---------- Knowledge base ----------
# With chunk_size=4096 and overlap=200 the effective stride is about 3896
# characters per chunk (NOTE: the chunker settings live outside this module;
# keep these figures in sync with them).
# 300,000 characters ≈ 77 chunks to embed, which usually finishes in 1-2 minutes.
MAX_KB_INPUT_CHARS = 300_000
MAX_KB_VECTOR_CHUNKS = 80

# ---------- Chat files (uploaded during a conversation) ----------
# A chat file is temporary context for a single session and needs higher
# retrieval precision than the knowledge base; overly long input dilutes it.
# 200,000 characters ≈ 51 chunks, balancing processing speed against
# context coverage.
MAX_CHAT_FILE_INPUT_CHARS = 200_000
MAX_CHAT_FILE_VECTOR_CHUNKS = 60


def decode_txt_char_count(raw: bytes) -> int:
    """Estimate the character count of a plain-text file for a quick pre-upload check.

    The bytes are decoded as UTF-8 first; if that fails they are decoded as
    GB18030 with undecodable bytes replaced, so this function never raises on
    malformed input. Leading and trailing whitespace is not counted.

    Args:
        raw: Raw file content.

    Returns:
        Number of characters in the decoded text after stripping surrounding
        whitespace.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but drops a leading byte-order mark.
        # With plain "utf-8" the BOM survives as U+FEFF, which is counted as a
        # character and, not being whitespace, also prevents strip() from
        # trimming any leading whitespace that follows it.
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Legacy Chinese encodings (GBK/GB2312) are subsets of GB18030.
        text = raw.decode("gb18030", errors="replace")
    return len(text.strip())


def validate_kb_text_length(char_count: int, *, chunk_count: int | None = None) -> None:
    """Reject knowledge-base text that exceeds the configured size limits.

    Args:
        char_count: Number of characters in the extracted body text.
        chunk_count: Number of chunks the text was split into, if known. The
            chunk check is skipped when this is ``None``.

    Raises:
        ValueError: If ``char_count`` is greater than ``MAX_KB_INPUT_CHARS``,
            or ``chunk_count`` is given and is greater than
            ``MAX_KB_VECTOR_CHUNKS``. The character check runs first, so its
            message wins when both limits are exceeded.
    """
    char_limit = MAX_KB_INPUT_CHARS
    if char_count > char_limit:
        too_long_msg = (
            f"提取的正文过长（约 {char_count:,} 字），知识库单文件上限为 {char_limit:,} 字。"
            "请将文档拆分为多个文件分别上传。"
        )
        raise ValueError(too_long_msg)

    # No chunk information supplied: nothing further to validate.
    if chunk_count is None:
        return

    chunk_limit = MAX_KB_VECTOR_CHUNKS
    if chunk_count > chunk_limit:
        too_many_msg = (
            f"文本分块过多（{chunk_count} 块，上限 {chunk_limit} 块），"
            f"请将正文控制在约 {char_limit:,} 字以内后重试。"
        )
        raise ValueError(too_many_msg)


def validate_chat_file_text_length(char_count: int, *, chunk_count: int | None = None) -> None:
    """Reject chat-file text that exceeds the configured size limits.

    Args:
        char_count: Number of characters in the extracted body text.
        chunk_count: Number of chunks the text was split into, if known. The
            chunk check is skipped when this is ``None``.

    Raises:
        ValueError: If ``char_count`` is greater than
            ``MAX_CHAT_FILE_INPUT_CHARS``, or ``chunk_count`` is given and is
            greater than ``MAX_CHAT_FILE_VECTOR_CHUNKS``. The character check
            runs first, so its message wins when both limits are exceeded.
    """
    char_limit = MAX_CHAT_FILE_INPUT_CHARS
    if char_count > char_limit:
        too_long_msg = (
            f"提取的正文过长（约 {char_count:,} 字），对话文件上限为 {char_limit:,} 字。"
            "请将文档拆分后上传，或使用知识库功能处理长文档。"
        )
        raise ValueError(too_long_msg)

    # No chunk information supplied: nothing further to validate.
    if chunk_count is None:
        return

    chunk_limit = MAX_CHAT_FILE_VECTOR_CHUNKS
    if chunk_count > chunk_limit:
        too_many_msg = (
            f"文本分块过多（{chunk_count} 块，上限 {chunk_limit} 块），"
            f"请将正文控制在约 {char_limit:,} 字以内后重试。"
        )
        raise ValueError(too_many_msg)