import sys
import os
import re
import json
import base64
from io import BytesIO
from PIL import Image
import argparse
# from vis_python_exe import PythonExecutor
from shared_vis_python_exe import PythonExecutor
from openai import OpenAI
from typing import Optional, Union
import gradio as gr
import markdown


def encode_image(image):
    """
    Convert a PIL.Image object or an image file path into a base64-encoded string.

    Args:
        image: a PIL.Image object or a path to an image file

    Returns:
        The base64-encoded string.
    """
    if isinstance(image, str):
        # The input is a file path
        with open(image, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    else:
        # The input is a PIL.Image object
        buffered = BytesIO()
        image.save(buffered, format='PNG')
        return base64.b64encode(buffered.getvalue()).decode('utf-8')


def excute_codes(codes, messages, executor: PythonExecutor):
    # Run every non-empty code snippet and remember the indices that had no code
    no_code_idx = []
    codes_use = []
    for i, code in enumerate(codes):
        if code == "":
            no_code_idx.append(i)
        else:
            codes_use.append(code)
    batch_results = executor.batch_apply(codes_use, messages)
    return batch_results, no_code_idx


def process_prompt_init(question, image, prompt_template, prompt_type):
    prompt_prefix = prompt_template[prompt_type]

    image_base64 = encode_image(image)
    question_with_options = question

    # The <image_clue_*>, </code>, <interpreter> and <answer> tags used throughout this file
    # must match the tag names that the prompt template instructs the model to emit.
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": "<image_clue_0>"}]
            + [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}]
            + [{"type": "text", "text": "</image_clue_0>\n\n"}]
            + [{"type": "text", "text": prompt_prefix.format(query=question_with_options)}]
        }
    ]

    return messages


def update_messages_with_excu_content(messages, images_result, text_result, image_clue_idx):
    # Append the interpreter output (text and images) to the last assistant message
    new_messages = []
    image_content = []
    for message_item in messages[:-1]:
        new_messages.append(message_item)

    assistant_message_item = messages[-1]['content']
    interpreter_message_text_prefix = [{"type": "text", "text": f"<interpreter>\nText Result:\n{text_result}\nImage Result:\n"}]
    if images_result is not None:
        for image_base64_item in images_result[image_clue_idx-1:]:
            interpreter_message_images = (
                [{"type": "text", "text": f"<image_clue_{image_clue_idx}>"}]
                + [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64_item}"}}]
                + [{"type": "text", "text": f"</image_clue_{image_clue_idx}>"}]
            )
            image_content += interpreter_message_images
            image_clue_idx += 1
    else:
        image_content = [{"type": "text", "text": "None"}]
    interpreter_message_text_profill = [{"type": "text", "text": "</interpreter>\n"}]

    assistant_message_item = assistant_message_item + interpreter_message_text_prefix + image_content + interpreter_message_text_profill
    new_messages.append({"role": "assistant", "content": assistant_message_item})
    return new_messages, image_clue_idx


def update_messages_with_code(messages, generated_content):
    # Generation stopped at </code>, so append it back when recording the message
    message_item = {
        "role": "assistant",
        "content": [{"type": "text", "text": f"{generated_content}</code>\n"}]
    }

    messages.append(message_item)
    return messages


def update_messages_with_text(messages, generated_content):
    message_item = {
        "role": "assistant",
        "content": [{"type": "text", "text": f"{generated_content}"}]
    }

    messages.append(message_item)
    return messages
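# For reference, `prompt_template_vis.json` (loaded in o3_chat below) is expected to map each
# prompt type to a template string containing a `{query}` placeholder that process_prompt_init
# fills in. The real template file is not shown here; the snippet below is only an illustrative
# placeholder of the expected shape, with made-up prompt text:
#
#     prompt_template = {"vistool": "Answer the question about the given image. You may write "
#                                   "Python code to inspect the image.\nQuestion: {query}"}
#     msgs = process_prompt_init("What is shown?", "./examples/1.png", prompt_template, "vistool")
#     # msgs[0]["content"] interleaves text and image_url items in the format the OpenAI API accepts.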
def call_chatgpt_api(messages, client, max_tokens=10000, stop=None, temperature=0.6):
    """Call the ChatGPT API with the given messages."""
    try:
        response = client.chat.completions.create(
            model="gpt-4.1",  # a vision-capable model
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=1.0,
            stop=stop
        )

        response_text = response.choices[0].message.content

        # Check whether one of the stop markers was hit
        stop_reason = None
        if stop and any(s in response_text for s in stop):
            for s in stop:
                if s in response_text:
                    stop_reason = s
                    break
        else:
            stop_reason = response.choices[0].finish_reason

        if "<answer>" in response_text:
            stop_reason = "<answer>"

        return response_text, stop_reason
    except Exception as e:
        print(f"API Error: {str(e)}")
        return None, None


def evaluate_single_data(data, client, executor, prompt_template, prompt_type):
    messages = process_prompt_init(data["question"], data['image'], prompt_template, prompt_type)

    # Generate the initial response
    response_text, pred_stop_reason = call_chatgpt_api(
        messages,
        client,
        max_tokens=10000,
        stop=["</code>"]  # pause whenever the model closes a code block
    )

    # Iteratively process the response
    final_response = response_text
    code_execution_count = 0
    image_clue_idx = 1

    while True:
        # Check whether code needs to be executed
        if pred_stop_reason == "</code>":
            # Extract the code to execute
            messages = update_messages_with_code(messages, response_text)
            code_to_execute = response_text.split("```python")[-1].split("```")[0].strip()

            # Execute the code
            exe_result = excute_codes([code_to_execute], messages, executor)[0][0]
            if exe_result is None:
                text_result = "None"
                images_result = None
            else:
                output, report = exe_result
                try:
                    text_result = exe_result[0]['text']
                except Exception:
                    text_result = None
                    print("text result is none.")
                try:
                    images_result = exe_result[0]['images']
                except Exception:
                    images_result = None
                    print("image result is none.")

            messages, new_image_clue_idx = update_messages_with_excu_content(messages, images_result, text_result, image_clue_idx)
            image_clue_idx = new_image_clue_idx

            code_execution_count += 1
            print(f"Code Execution Count: {code_execution_count}")

            # Generate the next part of the response
            response_text, pred_stop_reason = call_chatgpt_api(
                messages,
                client,
                max_tokens=10000,
                stop=["</code>"]
            )
        else:
            final_response = response_text
            messages = update_messages_with_text(messages, response_text)
            print("GPT-4.1 finish.")
            break

    return messages
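# Example (illustrative only): running one question end-to-end without the Gradio UI.
# The API key, base URL, image path and question below are placeholders.
#
#     client = OpenAI(api_key="YOUR_API_KEY", base_url="https://api.openai.com/v1")
#     executor = PythonExecutor()
#     prompt_template = json.load(open("./prompt_template_vis.json", "r", encoding="utf-8"))
#     data = {"question": "How many objects are in the image?", "image": "./examples/1.png"}
#     messages = evaluate_single_data(data, client, executor, prompt_template, "vistool")
#     print(process_message(messages))  # render the full conversation as HTML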
def process_message(messages):
    # Build an HTML rendering of the conversation
    html_output = '<div style="color: black;">'  # wrapper div that forces black text
    for message_item in messages:
        role = message_item['role']
        content = message_item['content']

        # Style the message header according to the role
        if role == "user" or role == "human":
            html_output += '<div style="margin: 10px 0; padding: 10px; border-radius: 8px; background-color: #f0f0f0;">'
            html_output += '<strong>User:</strong><br>'
        elif role == "assistant":
            html_output += '<div style="margin: 10px 0; padding: 10px; border-radius: 8px; background-color: #e8f4ff;">'
            html_output += '<strong>Assistant:</strong><br>'
        else:
            html_output += '<div style="margin: 10px 0; padding: 10px; border-radius: 8px;">'
            html_output += f'<strong>{role.capitalize()}:</strong><br>'

        # Render the content items
        for content_item in content:
            content_type = content_item['type']
            if content_type == "text":
                # Convert the Markdown text to HTML
                md_text = content_item['text']
                html_text = markdown.markdown(md_text, extensions=['fenced_code', 'codehilite'])
                # html_text = markdown.markdown(md_text)
                # html_text = md_text
                html_output += f'<div>{html_text}</div>'
            elif content_type == "image_url":
                content_value = content_item['image_url']['url']
                # Base64 data URLs can be embedded directly
                if content_value.startswith("data:"):
                    html_output += f'<img src="{content_value}" style="max-width: 100%; margin: 10px 0;">'
                else:
                    html_output += f'<img src="{content_value}" style="max-width: 100%; margin: 10px 0;">'

        html_output += '</div>'  # close the message div

    html_output += '</div>'  # close the outermost div
    return html_output


def o3_chat(api_key, base_url, question, image):
    print("Request received, starting inference...")
    # Initialize components
    client = OpenAI(api_key=api_key, base_url=base_url)
    executor = PythonExecutor()
    # executor = SharedRuntimeExecutor(var_whitelist="RETAIN_ALL_VARS")

    prompt_template = json.load(open("./prompt_template_vis.json", "r", encoding="utf-8"))
    prompt_type = 'vistool'

    data = {
        "question": question,
        "image": image,
    }

    # Evaluate the single data point
    messages = evaluate_single_data(data, client, executor, prompt_template, prompt_type)
    html_output = process_message(messages)

    # Serialize the conversation to a JSON string (intended for download)
    json_str = json.dumps(messages, ensure_ascii=False, indent=4)

    return html_output


# Gradio UI
def create_demo():
    with gr.Blocks(title="GPT-4.1 with Python Interpreter", css="div.prose * {color: black !important;}") as demo:
        gr.Markdown("# GPT-4.1 with Python Interpreter")
        gr.Markdown("Please do not share this demo with others.")
        gr.Markdown("Upload an image and ask a question to get a response with code execution capabilities.")

        with gr.Row():
            with gr.Column(scale=1):
                api_key = gr.Textbox(label="OpenAI API Key", type="password", value="sk-kBQuM0gvNBhOHmKz43b3iQut01bsOgg8Pv76eMKguu6jvncm")
                base_url = gr.Textbox(label="Base URL (optional)", value="https://api.claudeshop.top/v1")
                image_input = gr.Image(label="Upload Image", type="pil")
                question = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                submit_btn = gr.Button("Submit")

        with gr.Row():
            output = gr.HTML(label="Response")

        # Handle submission
        submit_btn.click(
            fn=o3_chat,
            inputs=[api_key, base_url, question, image_input],
            outputs=[output]
        )

        # Examples
        examples = [
            [
                "./examples/1.png",
                "From the information on that advertising board, what is the type of this shop?\nA. The shop is a yoga studio.\nB. The shop is a cafe.\nC. The shop is a seven-eleven.\nD. The shop is a milk tea shop."
            ],
            [
                "./examples/2.png",
                "What is the diagnosis for the abnormality seen in this image?\nA. Pulmonary embolism.\nB. Tuberculosis.\nC. COVID-19 infection.\nD. Influenza."
            ],
            [
                "./examples/3.png",
                "What is the color of the liquid contained in the glass on the table?\nA. The color of the liquid contained in the glass on the table is green.\nB. The color of the liquid contained in the glass on the table is transparent.\nC. The color of the liquid contained in the glass on the table is white.\nD. The color of the liquid contained in the glass on the table is orange."
            ],
            [
                "./examples/4.png",
                "Is the dog on the left or right side of the bicycle?\nA. The dog is on the right side of the bicycle.\nB. The dog is on the left side of the bicycle."
            ],
            [
                "./examples/5.png",
                "Is the kid with black shirt on the left or right side of the kid with blue shirt?\nA. The kid with black shirt is on the left side of the kid with blue shirt.\nB. The kid with black shirt is on the right side of the kid with blue shirt."
            ],
            [
                "./examples/6.png",
                "What can be observed in this image?\nA. Nerve entrapment.\nB. Musculoskeletal abnormality.\nC. Arteriovenous anomaly.\nD. Renal cyst."
            ],
            [
                "./examples/7.png",
                "What is the specific stage of cancer depicted in the image? A)Stage Ib, B)Stage IIIb, C)Stage IIc, D)Stage IIIa"
            ],
            [
                "./examples/8.png",
                "A gymnast jotted down the number of cartwheels she did each day. What is the mode of the numbers?"
            ],
            [
                "./examples/9.png",
                "Does Virginia have the highest value in the USA ?"
            ],
            [
                "./examples/10.png",
                "AB is the diameter of ⊙O, PA is tangent to ⊙O at point A, and PO intersects ⊙O at point C; connect BC, if ∠P = 40.0, then ∠B is equal to ()"
            ],
            [
                "./examples/11.png",
                "How many single-color paths go from C to A?"
            ],
        ]

        gr.Examples(
            examples,
            [image_input, question],
            label="Click any example to try it out!"
        )

        gr.Markdown("""
        ### Tips
        1. Click the 'log' button at the top left to check the output log.
        2. A response may take 2~5 minutes.
        """)

    return demo


# Create and launch the app
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()
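# Note: each request can take several minutes (see the Tips above). If the demo is shared with
# several users, enabling Gradio's request queue is one way to keep the UI responsive; a minimal
# sketch, assuming a recent Gradio release:
#
#     demo.queue(max_size=8).launch()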