diff --git a/vlmeval/api/__init__.py b/vlmeval/api/__init__.py
old mode 100644
new mode 100755
index 82ae29d50..e82cce0c2
--- a/vlmeval/api/__init__.py
+++ b/vlmeval/api/__init__.py
@@ -1,43 +1,44 @@
-from .gpt import OpenAIWrapper, GPT4V
-from .hf_chat_model import HFChatModel
-from .gemini import GeminiWrapper, Gemini
-from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI
-from .qwen_api import QwenAPI
-from .claude import Claude_Wrapper, Claude3V
-from .reka import Reka
-from .glm_vision import GLMVisionAPI
-from .cloudwalk import CWWrapper
-from .sensechat_vision import SenseChatVisionAPI
-from .siliconflow import SiliconFlowAPI, TeleMMAPI
-from .telemm import TeleMM2_API
-from .telemm_thinking import TeleMM2Thinking_API
-from .hunyuan import HunyuanVision
-from .bailingmm import bailingMMAPI
-from .bluelm_api import BlueLMWrapper, BlueLM_API
-from .jt_vl_chat import JTVLChatAPI
-from .jt_vl_chat_mini import JTVLChatAPI_Mini, JTVLChatAPI_2B
-from .video_chat_online_v2 import VideoChatOnlineV2API
-from .taiyi import TaiyiAPI
-from .lmdeploy import LMDeployAPI
-from .arm_thinker import ARM_thinker
-from .taichu import TaichuVLAPI, TaichuVLRAPI
-from .doubao_vl_api import DoubaoVL
-from .mug_u import MUGUAPI
-from .kimivl_api import KimiVLAPIWrapper, KimiVLAPI
-from .rbdashmm_chat3_api import RBdashMMChat3_API, RBdashChat3_5_API
-from .rbdashmm_chat3_5_api import RBdashMMChat3_78B_API, RBdashMMChat3_5_38B_API
-from .together import TogetherAPI
-from .gcp_vertex import GCPVertexAPI
-from .bedrock import BedrockAPI
-
-__all__ = [
-    'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini',
-    'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI', 'Claude3V', 'Claude_Wrapper',
-    'Reka', 'GLMVisionAPI', 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision',
-    'Qwen2VLAPI', 'BlueLMWrapper', 'BlueLM_API', 'JTVLChatAPI', 'JTVLChatAPI_Mini', 'JTVLChatAPI_2B',
-    'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI', 'ARM_thinker',
-    'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI", 'KimiVLAPIWrapper', 'KimiVLAPI',
-    'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API',
-    'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API',
-    'TogetherAPI', 'GCPVertexAPI', 'BedrockAPI',
-]
+from .gpt import OpenAIWrapper, GPT4V
+from .hf_chat_model import HFChatModel
+from .gemini import GeminiWrapper, Gemini
+from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI
+from .qwen_api import QwenAPI
+from .claude import Claude_Wrapper, Claude3V
+from .reka import Reka
+from .glm_vision import GLMVisionAPI
+from .cloudwalk import CWWrapper
+from .sensechat_vision import SenseChatVisionAPI
+from .siliconflow import SiliconFlowAPI, TeleMMAPI
+from .telemm import TeleMM2_API
+from .telemm_thinking import TeleMM2Thinking_API
+from .hunyuan import HunyuanVision
+from .bailingmm import bailingMMAPI
+from .bluelm_api import BlueLMWrapper, BlueLM_API
+from .jt_vl_chat import JTVLChatAPI
+from .jt_vl_chat_mini import JTVLChatAPI_Mini, JTVLChatAPI_2B
+from .video_chat_online_v2 import VideoChatOnlineV2API
+from .taiyi import TaiyiAPI
+from .lmdeploy import LMDeployAPI
+from .arm_thinker import ARM_thinker
+from .taichu import TaichuVLAPI, TaichuVLRAPI
+from .doubao_vl_api import DoubaoVL
+from .mug_u import MUGUAPI
+from .kimivl_api import KimiVLAPIWrapper, KimiVLAPI
+from .rbdashmm_chat3_api import RBdashMMChat3_API, RBdashChat3_5_API
+from .rbdashmm_chat3_5_api import RBdashMMChat3_78B_API, RBdashMMChat3_5_38B_API
+from .together import TogetherAPI
+from .gcp_vertex import GCPVertexAPI
+from .bedrock import BedrockAPI
+from .video_chat_online_v3 import VideoChatOnlineV3API
+
+__all__ = [
+    'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini',
+    'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI', 'Claude3V', 'Claude_Wrapper',
+    'Reka', 'GLMVisionAPI', 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision',
+    'Qwen2VLAPI', 'BlueLMWrapper', 'BlueLM_API', 'JTVLChatAPI', 'JTVLChatAPI_Mini', 'JTVLChatAPI_2B',
+    'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI', 'ARM_thinker',
+    'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', 'MUGUAPI', 'KimiVLAPIWrapper', 'KimiVLAPI',
+    'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API',
+    'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API',
+    'TogetherAPI', 'GCPVertexAPI', 'BedrockAPI', 'VideoChatOnlineV3API',
+]
diff --git a/vlmeval/api/base_client.py b/vlmeval/api/base_client.py
new file mode 100755
index 000000000..5647c2b18
--- /dev/null
+++ b/vlmeval/api/base_client.py
@@ -0,0 +1,250 @@
+import requests
+import json
+import base64
+import os
+from typing import List, Dict, Any
+
+
+class VLLMClient:
+    def __init__(self, base_url: str = "http://localhost:9100/v1/chat/completions", app_code: str = os.getenv('VLLM_APP_CODE', '')):  # SECURITY fix: token came from source, now from env
+        """
+        Initialize the vLLM client.
+
+        Args:
+            base_url: vLLM server endpoint; app_code: auth token for the Authorization header.
+        """
+        self.base_url = base_url
+        self.app_code = app_code
+
+    def encode_image_to_base64(self, image_path: str) -> str:
+        """
+        Encode an image file as a base64 string.
+
+        Args:
+            image_path: path of the image file.
+
+        Returns:
+            Base64-encoded image content.
+        """
+        with open(image_path, "rb") as image_file:
+            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
+        return encoded_string
+
+    def create_messages_with_images(
+        self,
+        prompt: str,
+        image_paths: List[str],
+        image_format: str = "base64"
+    ) -> List[Dict]:
+        """
+        Build an OpenAI-style message list that mixes text and images.
+
+        Args:
+            prompt: text prompt.
+            image_paths: list of image paths.
+            image_format: "base64" (embed file content) or "url" (pass path through).
+
+        Returns:
+            Message list.
+        """
+        messages = [
+            {
+                "role": "user",
+                "content": []
+            }
+        ]
+
+        # Text part first.
+        messages[0]["content"].append({
+            "type": "text",
+            "text": prompt
+        })
+
+        # Then one entry per image.
+        for image_path in image_paths:
+            if image_format == "base64":
+                # Read and base64-encode the file.
+                base64_image = self.encode_image_to_base64(image_path)
+                image_content = {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}"
+                    }
+                }
+            elif image_format == "url":
+                image_content = {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_path
+                    }
+                }
+            else:
+                raise ValueError(f"不支持的图片格式: {image_format}")
+
+            messages[0]["content"].append(image_content)
+
+        return messages
+
+    def stream_completion(
+        self,
+        prompt: str = None,
+        messages: List[Dict] = None,
+        image_paths: List[str] = None,
+        model: str = None,
+        max_tokens: int = None,
+        temperature: float = 0.7,
+        top_p: float = None,
+        stream: bool = True,
+        **kwargs
+    ):
+        """
+        Streaming chat-completion request.
+
+        Args:
+            prompt: text prompt (ignored when `messages` is given).
+            messages: message list (multi-modal supported).
+            image_paths: list of image paths.
+            model: model name.
+            max_tokens: max tokens to generate.
+            temperature: sampling temperature.
+            top_p: top-p sampling parameter.
+            stream: whether to request a streamed response.
+            **kwargs: extra request-body fields.
+
+        Yields:
+            Generated text fragments.
+        """
+        # Build the request body.
+
+        request_data = {
+            "model": model,
+            "stream": stream,
+            **kwargs
+        }
+        if temperature is not None:
+            request_data["temperature"] = temperature
+        if max_tokens is not None:
+            request_data["max_tokens"] = max_tokens
+        if top_p is not None:
+            request_data["top_p"] = top_p  # fix: was a duplicated max_tokens block; top_p was silently dropped
+        # print("request_data:", request_data)  # debug only
+        # Resolve the message payload.
+        if messages is not None:
+            request_data["messages"] = messages
+        elif image_paths is not None:
+            # Image paths given: build a multi-modal message.
+            if prompt is None:
+                prompt = "请描述图片内容"
+            request_data["messages"] = self.create_messages_with_images(prompt, image_paths)
+        elif prompt is not None:
+            # Plain-text message.
+            request_data["messages"] = [
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        else:
+            raise ValueError("必须提供prompt、messages或image_paths中的一个")
+
+        # Send the request.
+        response = requests.post(
+            self.base_url,
+            json=request_data,
+            stream=True,
+            headers={"Content-Type": "application/json", "Authorization": self.app_code}
+        )
+
+        if response.status_code != 200:
+            raise RuntimeError(f"请求失败,状态码: {response.status_code}, 响应: {response.text}")
+
+        # Consume the SSE stream.
+        full_response = ""
+        for line in response.iter_lines():
+            if line:
+                line = line.decode('utf-8')
+
+                # Only SSE data lines carry payload.
+                if line.startswith('data: '):
+                    data = line[6:]  # strip the "data: " prefix
+
+                    # End-of-stream marker.
+                    if data == '[DONE]':
+                        break
+
+                    try:
+                        # Parse the JSON chunk.
+                        json_data = json.loads(data)
+
+                        # Extract the delta content.
+                        if 'choices' in json_data and len(json_data['choices']) > 0:
+                            delta = json_data['choices'][0].get('delta', {})
+                            content = delta.get('content', '')
+
+                            if content:
+                                full_response += content
+                                yield content
+                    except json.JSONDecodeError as e:
+                        print(f"JSON解析错误: {e}, 原始数据: {data}")
+
+        # Generator return value (available via StopIteration.value).
+        return full_response
+
+    def non_stream_completion(
+        self,
+        prompt: str = None,
+        messages: List[Dict] = None,
+        image_paths: List[str] = None,
+        model: str = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Non-streaming chat-completion request.
+
+        Args:
+            Same as stream_completion.
+
+        Returns:
+            The full JSON response.
+        """
+        # Build the request body.
+        request_data = {
+            "model": model,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "stream": False,
+            **kwargs
+        }
+
+        # Resolve the message payload (same as the streaming path).
+        if messages is not None:
+            request_data["messages"] = messages
+        elif image_paths is not None:
+            if prompt is None:
+                prompt = "请描述图片内容"
+            request_data["messages"] = self.create_messages_with_images(prompt, image_paths)
+        elif prompt is not None:
+            request_data["messages"] = [
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        else:
+            raise ValueError("必须提供prompt、messages或image_paths中的一个")
+
+        # Send the request (fix: Authorization header was missing here, unlike the streaming path).
+        response = requests.post(
+            self.base_url,
+            json=request_data,
+            headers={"Content-Type": "application/json", "Authorization": self.app_code}
+        )
+
+        if response.status_code != 200:
+            raise RuntimeError(f"请求失败,状态码: {response.status_code}, 响应: {response.text}")
+
+        return response.json()
diff --git a/vlmeval/api/video_chat_online_v3.py b/vlmeval/api/video_chat_online_v3.py
new file mode 100755
index 000000000..7c42cc909
--- /dev/null
+++ b/vlmeval/api/video_chat_online_v3.py
@@ -0,0 +1,208 @@
+import pandas as pd
+import requests
+import json
+import os
+import base64
+from vlmeval.smp import *
+from vlmeval.api.base import BaseAPI
+from vlmeval.dataset import DATASET_TYPE
+from vlmeval.dataset import img_root_map
+from vlmeval.api.base_client import VLLMClient
+
+API_ENDPOINT = ''
+APP_CODE = ''
+
+class VideoChatOnlineV3Wrapper(BaseAPI):
+    is_api: bool = True
+    INTERLEAVE = False
+
+    def __init__(self,
+                 model: str = 'jtchat',
+                 retry: int = 5,
+                 wait: int = 5,
+                 api_base: str = '',
+                 app_code: str = '',
+                 verbose: bool = True,
+                 system_prompt: str = None,
+                 temperature: float = 0.7,
+                 max_tokens: int = 2048,
+                 proxy: str = None,
+                 **kwargs):
+        self.model = model
+
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.api_base = api_base
+        self.app_code = app_code
+
+
+        super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
+
+    def dump_image(self, line, dataset):
+        """Dump the image(s) of the input line to the corresponding dataset folder.
+
+        Args:
+            line (line of pd.DataFrame): The raw input line.
+            dataset (str): The name of the dataset.
+
+        Returns:
+            str | list[str]: The paths of the dumped images.
+        """
+        ROOT = LMUDataRoot()
+        assert isinstance(dataset, str)
+
+        img_root = os.path.join(ROOT, 'images', img_root_map(dataset))  # fix: old `dataset in img_root_map(dataset)` was a substring test on the mapped name
+        os.makedirs(img_root, exist_ok=True)
+        if 'image' in line:
+            if isinstance(line['image'], list):
+                tgt_path = []
+                assert 'image_path' in line
+                for img, im_name in zip(line['image'], line['image_path']):
+                    path = osp.join(img_root, im_name)
+                    if not read_ok(path):
+                        decode_base64_to_image_file(img, path)
+                    tgt_path.append(path)
+            else:
+                tgt_path = osp.join(img_root, f"{line['index']}.jpg")
+                if not read_ok(tgt_path):
+                    decode_base64_to_image_file(line['image'], tgt_path)
+                tgt_path = [tgt_path]
+        else:
+            assert 'image_path' in line
+            tgt_path = toliststr(line['image_path'])
+
+        return tgt_path
+
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if listinstr(['MMMU_DEV_VAL','MMMU_TEST'], dataset):
+            return False
+        else:
+            return True
+
+    def build_multi_choice_prompt(self, line, dataset=None):
+        question = line['question']
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+        if hint is not None:
+            question = hint + '\n' + question
+
+        # options = {
+        #     cand: line[cand]
+        #     for cand in string.ascii_uppercase
+        #     if cand in line and not pd.isna(line[cand])
+        # }
+        options = {
+            cand.upper(): line[cand]
+            for cand in string.ascii_letters  # string.ascii_letters covers both lower- and upper-case letters
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f'\n{key}. {item}'
+        prompt = question
+
+        if len(options):
+            prompt += '\n请直接回答选项字母。' if cn_string(
+                prompt) else "\nAnswer with the option's letter from the given choices directly."
+        else:
+            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
+
+        return prompt
+
+    def build_prompt(self, line, dataset=None):
+        assert self.use_custom_prompt(dataset)
+        assert dataset is None or isinstance(dataset, str)
+
+        tgt_path = self.dump_image(line, dataset)
+
+        if dataset is not None and listinstr(['MME'], dataset):
+            question = line['question']
+            prompt = question + ' Answer the question using a single word or phrase.'
+        elif dataset is not None and listinstr(['HallusionBench'], dataset):
+            question = line['question']
+            prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
+        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
+            prompt = self.build_multi_choice_prompt(line, dataset)
+        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
+            if listinstr(['MathVista', 'MathVision','LogicVista','MultimodalCreation','QA_CN',"VQU","Perception_ZJ","OCRBench_v2"], dataset):
+                prompt = line['question']
+            elif listinstr(['LLaVABench'], dataset):
+                question = line['question']
+                prompt = question + '\nAnswer this question in detail.'
+            elif listinstr(['MMVet'], dataset):
+                prompt = line['question']
+            else:
+                question = line['question']
+                prompt = question + '\nAnswer the question using a single word or phrase.'
+        else:
+            prompt = line['question']
+        message = [dict(type='text', value=prompt)]
+        message.extend([dict(type='image', value=s) for s in tgt_path])
+        return message
+
+    def message_to_promptimg(self, message, dataset=None):
+        assert not self.INTERLEAVE
+        model_name = self.__class__.__name__
+        import warnings
+        warnings.warn(
+            f'Model {model_name} does not support interleaved input. '
+            'Will use the first image and aggregated texts as prompt. ')
+        num_images = len([x for x in message if x['type'] == 'image'])
+        if num_images == 0:
+            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+            image = None
+        else:
+            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+            if dataset == 'BLINK':
+                image = concat_images_vlmeval(
+                    [x['value'] for x in message if x['type'] == 'image'],
+                    target_size=512)
+            else:
+                image = [x['value'] for x in message if x['type'] == 'image'][0]
+        return prompt, image
+
+    def generate_inner(self, inputs, **kwargs):
+        assert isinstance(inputs, str) or isinstance(inputs, list)
+        inputs = [inputs] if isinstance(inputs, str) else inputs
+        dataset = kwargs.get('dataset', None)
+        prompt, image_path = self.message_to_promptimg(message=inputs, dataset=dataset)
+
+        client = VLLMClient(base_url=API_ENDPOINT, app_code=self.app_code)  # pass the wrapper's credential through
+        print("\n=== 示例: 多图输入流式输出 ===")
+        print(API_ENDPOINT)
+        image_paths = [image_path]
+        full_response = ""  # fix: init before the branch so the final print/return cannot hit a NameError
+        for img_path in image_paths:
+            if img_path is None or not os.path.exists(img_path):
+                print(f"警告: 图片文件不存在: {img_path}")
+                # Fall back to a text-only request.
+                image_paths = []
+
+        if image_paths:
+            # prompt = "请描述这些图片的内容"
+            print(f"提示: {prompt}")
+            print(f"图片数量: {len(image_paths)}")
+            print("响应:", end=" ", flush=True)
+
+            # Accumulate the streamed chunks below.
+
+            try:
+                for chunk in client.stream_completion(
+                    prompt=prompt,
+                    image_paths=image_paths,
+                    model=self.model
+                ):
+                    print(chunk, end="", flush=True)
+                    full_response += chunk
+            except Exception as e:
+                print(f"错误: {e}")
+        else:
+            print("无有效图片,跳过示例2")
+
+        print("\n" + "="*50)
+        print("完整输出:",full_response)
+        return 0, full_response, 'Succeeded! '
+
+class VideoChatOnlineV3API(VideoChatOnlineV3Wrapper):
+
+    def generate(self, message, dataset=None):
+        return super(VideoChatOnlineV3API, self).generate(message, dataset=dataset)