huoyan-enterprise/backend/services/vision_service.py

"""
视觉模型服务

基于阿里云通义千问视觉模型 (qwen-vl-max-latest) 提供图片理解能力。
参考 server/aaa/jenius_attachment_knowledge_base/jenius_rag_util.py 的实现。
"""
import asyncio
import base64
from typing import Optional

from openai import OpenAI, AsyncOpenAI
from core.llm_env import tongyi_openai_compatible_base_url
from core.config import settings
from logger.logging import get_logger

logger = get_logger(__name__)


def _is_vision_image_url(url: str) -> bool:
    if not url:
        return False
    if url.startswith(("http://", "https://")):
        return True
    if url.startswith("data:image/") and "base64," in url:
        return True
    return False


def image_bytes_to_data_url(image_bytes: bytes, mime_hint: Optional[str] = None) -> str:
    """将本地图片字节转为 OpenAI/DashScope 兼容的 data URL。"""
    mime = mime_hint or "image/jpeg"
    if mime_hint is None:
        if len(image_bytes) >= 8 and image_bytes[:8] == b"\x89PNG\r\n\x1a\n":
            mime = "image/png"
        elif len(image_bytes) >= 3 and image_bytes[:3] == b"\xff\xd8\xff":
            mime = "image/jpeg"
        elif len(image_bytes) >= 6 and image_bytes[:6] in (b"GIF87a", b"GIF89a"):
            mime = "image/gif"
        elif len(image_bytes) >= 2 and image_bytes[:2] == b"BM":
            mime = "image/bmp"
        elif len(image_bytes) >= 12 and image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
            mime = "image/webp"
    b64 = base64.standard_b64encode(image_bytes).decode("ascii")
    return f"data:{mime};base64,{b64}"


class VisionService:
    """视觉模型服务类

    使用阿里云通义千问视觉模型进行图片理解和描述。
    """

    _client_cache: Optional[AsyncOpenAI] = None
    _sync_client_cache: Optional[OpenAI] = None
    _lock = asyncio.Lock()

    @classmethod
    async def _get_async_client(cls) -> AsyncOpenAI:
        """获取或创建异步客户端（单例模式）"""
        if cls._client_cache is not None:
            return cls._client_cache

        async with cls._lock:
            if cls._client_cache is None:
                cls._client_cache = AsyncOpenAI(
                    api_key=settings.dashscope_api_key,
                    base_url=tongyi_openai_compatible_base_url(),
                )
            return cls._client_cache

    @classmethod
    def _get_sync_client(cls) -> OpenAI:
        """获取或创建同步客户端（单例模式）"""
        if cls._sync_client_cache is not None:
            return cls._sync_client_cache

        cls._sync_client_cache = OpenAI(
            api_key=settings.dashscope_api_key,
            base_url=tongyi_openai_compatible_base_url(),
        )
        return cls._sync_client_cache

    @classmethod
    async def get_image_description(
        cls,
        image_url: str,
        prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内"
    ) -> str:
        """
        获取图片的描述（异步）

        Args:
            image_url: 图片的 URL 地址（必须是 http/https 开头）
            prompt: 提示词，用于引导模型生成描述

        Returns:
            str: 图片描述文本
        """
        if not _is_vision_image_url(image_url):
            logger.warning(f"无效的图片 URL: {image_url[:80] if image_url else ''}")
            return ""

        try:
            client = await cls._get_async_client()

            completion = await client.chat.completions.create(
                model="qwen-vl-max-latest",
                messages=[
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": "You are a helpful assistant."}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url}
                            },
                            {"type": "text", "text": prompt}
                        ]
                    }
                ]
            )

            description = completion.choices[0].message.content
            logger.info(f"成功获取图片描述: {description[:50]}...")
            return description

        except Exception as e:
            logger.error(f"获取图片描述失败: {e}")
            return ""

    @classmethod
    async def get_image_description_from_bytes(
        cls,
        image_bytes: bytes,
        prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内",
        mime_hint: Optional[str] = None,
    ) -> str:
        """
        从内存中的图片字节获取描述（异步），使用 data URL 调用通义 VL。
        用于知识图谱上传等无公网 URL 的场景。
        """
        if not settings.dashscope_api_key:
            logger.warning("未配置 DASHSCOPE_API_KEY，无法进行视觉理解")
            return ""
        if not image_bytes:
            return ""
        data_url = image_bytes_to_data_url(image_bytes, mime_hint)
        return await cls.get_image_description(data_url, prompt=prompt)

    @classmethod
    def get_image_description_sync(
        cls,
        image_url: str,
        prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内"
    ) -> str:
        """
        获取图片的描述（同步）

        Args:
            image_url: 图片的 URL 地址（必须是 http/https 开头）
            prompt: 提示词，用于引导模型生成描述

        Returns:
            str: 图片描述文本
        """
        if not _is_vision_image_url(image_url):
            logger.warning(f"无效的图片 URL: {image_url[:80] if image_url else ''}")
            return ""

        try:
            client = cls._get_sync_client()

            completion = client.chat.completions.create(
                model="qwen-vl-max-latest",
                messages=[
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": "You are a helpful assistant."}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url}
                            },
                            {"type": "text", "text": prompt}
                        ]
                    }
                ]
            )

            description = completion.choices[0].message.content
            logger.info(f"成功获取图片描述: {description[:50]}...")
            return description

        except Exception as e:
            logger.error(f"获取图片描述失败: {e}")
            return ""

    @classmethod
    async def analyze_image_with_question(
        cls,
        image_url: str,
        question: str
    ) -> str:
        """
        基于问题分析图片

        Args:
            image_url: 图片的 URL 地址
            question: 用户的问题

        Returns:
            str: 分析结果
        """
        if not _is_vision_image_url(image_url):
            logger.warning(f"无效的图片 URL: {image_url[:80] if image_url else ''}")
            return ""

        try:
            client = await cls._get_async_client()

            completion = await client.chat.completions.create(
                model="qwen-vl-max-latest",
                messages=[
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": "You are a helpful assistant that can analyze images and answer questions about them."}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url}
                            },
                            {"type": "text", "text": question}
                        ]
                    }
                ]
            )

            answer = completion.choices[0].message.content
            logger.info(f"成功分析图片并回答问题")
            return answer

        except Exception as e:
            logger.error(f"分析图片失败: {e}")
            return ""


# 批量处理辅助函数
async def batch_get_image_descriptions(
    image_urls: list[str],
    prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内"
) -> dict[str, str]:
    """
    批量获取图片描述

    Args:
        image_urls: 图片 URL 列表
        prompt: 提示词

    Returns:
        dict: URL 到描述的映射
    """
    tasks = [
        VisionService.get_image_description(url, prompt)
        for url in image_urls
    ]

    descriptions = await asyncio.gather(*tasks, return_exceptions=True)

    result = {}
    for url, desc in zip(image_urls, descriptions):
        if isinstance(desc, Exception):
            logger.error(f"获取图片描述失败 {url}: {desc}")
            result[url] = ""
        else:
            result[url] = desc

    return result