huoyan-enterprise/backend/services/vision_service.py

287 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
视觉模型服务
基于阿里云通义千问视觉模型 (qwen-vl-max-latest) 提供图片理解能力。
参考 server/aaa/jenius_attachment_knowledge_base/jenius_rag_util.py 的实现。
"""
import asyncio
import base64
from typing import Optional
from openai import OpenAI, AsyncOpenAI
from core.llm_env import tongyi_openai_compatible_base_url
from core.config import settings
from logger.logging import get_logger
logger = get_logger(__name__)
def _is_vision_image_url(url: str) -> bool:
if not url:
return False
if url.startswith(("http://", "https://")):
return True
if url.startswith("data:image/") and "base64," in url:
return True
return False
def image_bytes_to_data_url(image_bytes: bytes, mime_hint: Optional[str] = None) -> str:
"""将本地图片字节转为 OpenAI/DashScope 兼容的 data URL。"""
mime = mime_hint or "image/jpeg"
if mime_hint is None:
if len(image_bytes) >= 8 and image_bytes[:8] == b"\x89PNG\r\n\x1a\n":
mime = "image/png"
elif len(image_bytes) >= 3 and image_bytes[:3] == b"\xff\xd8\xff":
mime = "image/jpeg"
elif len(image_bytes) >= 6 and image_bytes[:6] in (b"GIF87a", b"GIF89a"):
mime = "image/gif"
elif len(image_bytes) >= 2 and image_bytes[:2] == b"BM":
mime = "image/bmp"
elif len(image_bytes) >= 12 and image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
mime = "image/webp"
b64 = base64.standard_b64encode(image_bytes).decode("ascii")
return f"data:{mime};base64,{b64}"
class VisionService:
"""视觉模型服务类
使用阿里云通义千问视觉模型进行图片理解和描述。
"""
_client_cache: Optional[AsyncOpenAI] = None
_sync_client_cache: Optional[OpenAI] = None
_lock = asyncio.Lock()
@classmethod
async def _get_async_client(cls) -> AsyncOpenAI:
"""获取或创建异步客户端(单例模式)"""
if cls._client_cache is not None:
return cls._client_cache
async with cls._lock:
if cls._client_cache is None:
cls._client_cache = AsyncOpenAI(
api_key=settings.dashscope_api_key,
base_url=tongyi_openai_compatible_base_url(),
)
return cls._client_cache
@classmethod
def _get_sync_client(cls) -> OpenAI:
"""获取或创建同步客户端(单例模式)"""
if cls._sync_client_cache is not None:
return cls._sync_client_cache
cls._sync_client_cache = OpenAI(
api_key=settings.dashscope_api_key,
base_url=tongyi_openai_compatible_base_url(),
)
return cls._sync_client_cache
@classmethod
async def get_image_description(
cls,
image_url: str,
prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内"
) -> str:
"""
获取图片的描述(异步)
Args:
image_url: 图片的 URL 地址(必须是 http/https 开头)
prompt: 提示词,用于引导模型生成描述
Returns:
str: 图片描述文本
"""
if not _is_vision_image_url(image_url):
logger.warning(f"无效的图片 URL: {image_url[:80] if image_url else ''}")
return ""
try:
client = await cls._get_async_client()
completion = await client.chat.completions.create(
model="qwen-vl-max-latest",
messages=[
{
"role": "system",
"content": [{"type": "text", "text": "You are a helpful assistant."}]
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_url}
},
{"type": "text", "text": prompt}
]
}
]
)
description = completion.choices[0].message.content
logger.info(f"成功获取图片描述: {description[:50]}...")
return description
except Exception as e:
logger.error(f"获取图片描述失败: {e}")
return ""
@classmethod
async def get_image_description_from_bytes(
cls,
image_bytes: bytes,
prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内",
mime_hint: Optional[str] = None,
) -> str:
"""
从内存中的图片字节获取描述(异步),使用 data URL 调用通义 VL。
用于知识图谱上传等无公网 URL 的场景。
"""
if not settings.dashscope_api_key:
logger.warning("未配置 DASHSCOPE_API_KEY无法进行视觉理解")
return ""
if not image_bytes:
return ""
data_url = image_bytes_to_data_url(image_bytes, mime_hint)
return await cls.get_image_description(data_url, prompt=prompt)
@classmethod
def get_image_description_sync(
cls,
image_url: str,
prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内"
) -> str:
"""
获取图片的描述(同步)
Args:
image_url: 图片的 URL 地址(必须是 http/https 开头)
prompt: 提示词,用于引导模型生成描述
Returns:
str: 图片描述文本
"""
if not _is_vision_image_url(image_url):
logger.warning(f"无效的图片 URL: {image_url[:80] if image_url else ''}")
return ""
try:
client = cls._get_sync_client()
completion = client.chat.completions.create(
model="qwen-vl-max-latest",
messages=[
{
"role": "system",
"content": [{"type": "text", "text": "You are a helpful assistant."}]
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_url}
},
{"type": "text", "text": prompt}
]
}
]
)
description = completion.choices[0].message.content
logger.info(f"成功获取图片描述: {description[:50]}...")
return description
except Exception as e:
logger.error(f"获取图片描述失败: {e}")
return ""
@classmethod
async def analyze_image_with_question(
cls,
image_url: str,
question: str
) -> str:
"""
基于问题分析图片
Args:
image_url: 图片的 URL 地址
question: 用户的问题
Returns:
str: 分析结果
"""
if not _is_vision_image_url(image_url):
logger.warning(f"无效的图片 URL: {image_url[:80] if image_url else ''}")
return ""
try:
client = await cls._get_async_client()
completion = await client.chat.completions.create(
model="qwen-vl-max-latest",
messages=[
{
"role": "system",
"content": [{"type": "text", "text": "You are a helpful assistant that can analyze images and answer questions about them."}]
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_url}
},
{"type": "text", "text": question}
]
}
]
)
answer = completion.choices[0].message.content
logger.info(f"成功分析图片并回答问题")
return answer
except Exception as e:
logger.error(f"分析图片失败: {e}")
return ""
# 批量处理辅助函数
async def batch_get_image_descriptions(
image_urls: list[str],
prompt: str = "图中的主要内容是什么?回答以'图片'开头, 500字以内"
) -> dict[str, str]:
"""
批量获取图片描述
Args:
image_urls: 图片 URL 列表
prompt: 提示词
Returns:
dict: URL 到描述的映射
"""
tasks = [
VisionService.get_image_description(url, prompt)
for url in image_urls
]
descriptions = await asyncio.gather(*tasks, return_exceptions=True)
result = {}
for url, desc in zip(image_urls, descriptions):
if isinstance(desc, Exception):
logger.error(f"获取图片描述失败 {url}: {desc}")
result[url] = ""
else:
result[url] = desc
return result