import os
import io
import base64
from typing import List, Tuple, Optional

import gradio as gr
from PIL import Image
from openai import OpenAI

BASE_URL = "https://api.stepfun.com/v1"
DEFAULT_MODEL = "step-3"  # can be switched to step-r1-v-mini
DEFAULT_DETAIL = "high"  # high | low | auto


def _get_api_key() -> Optional[str]:
    # Prefer the environment variable (set it under Settings -> Variables and secrets on HF Spaces)
    return os.environ.get("STEPFUN_API_KEY")


def pil_image_to_data_uri(img: Image.Image) -> str:
    buffer = io.BytesIO()
    # Re-encode everything as JPEG to reduce size and ensure browser/model compatibility
    rgb_img = img.convert("RGB")
    rgb_img.save(buffer, format="JPEG", quality=90)
    b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{b64}"


def build_messages(
    chat_history: List[Tuple[str, str]],
    user_text: str,
    image: Optional[Image.Image],
    system_prompt: Optional[str],
    detail: str,
) -> list:
    messages: List[dict] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    # Collapse previous turns to text-only content (simple and robust)
    for user_turn, assistant_turn in chat_history:
        if user_turn:
            messages.append({
                "role": "user",
                "content": [{"type": "text", "text": user_turn}],
            })
        if assistant_turn:
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": assistant_turn}],
            })

    # Current user input: may contain an image plus text
    content: List[dict] = []
    if image is not None:
        data_uri = pil_image_to_data_uri(image)
        content.append({
            "type": "image_url",
            "image_url": {"url": data_uri, "detail": detail},
        })
    if user_text:
        content.append({"type": "text", "text": user_text})
    if not content:
        # Fallback to avoid sending an empty message
        content.append({"type": "text", "text": ""})
    messages.append({"role": "user", "content": content})
    return messages


def stream_response(
    user_text: str,
    image: Optional[Image.Image],
    model: str,
    detail: str,
    system_prompt: str,
    chat_history: List[Tuple[str, str]],
):
    api_key = _get_api_key()
    if not api_key:
        error_text = (
            "STEPFUN_API_KEY not found. Please set it under the Space's "
            "Settings -> Variables and secrets and try again."
        )
        # Show the error as an assistant message
        display_user = (user_text or "") + ("\n[image attached]" if image is not None else "")
        new_history = chat_history + [(display_user, error_text)]
        yield new_history, ""
        return

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    # Append the user message to the chat first
    display_user = (user_text or "") + ("\n[image attached]" if image is not None else "")
    chat_history = chat_history + [(display_user, "")]
    # Pre-fill a placeholder assistant reply
    yield chat_history, ""

    try:
        messages = build_messages(
            chat_history[:-1],
            user_text=user_text,
            image=image,
            system_prompt=system_prompt,
            detail=detail,
        )
        stream = client.chat.completions.create(
            model=model or DEFAULT_MODEL,
            messages=messages,
            stream=True,
        )
        assistant_acc = []
        for chunk in stream:
            delta = None
            try:
                delta = chunk.choices[0].delta
            except Exception:
                pass
            if delta and getattr(delta, "content", None):
                assistant_acc.append(delta.content)
                # Update the last message in real time
                chat_history[-1] = (display_user, "".join(assistant_acc))
                yield chat_history, ""
    except Exception as e:
        chat_history[-1] = (display_user, f"[Request failed] {type(e).__name__}: {e}")
        yield chat_history, ""


with gr.Blocks(title="StepFun - Step3 Multimodal Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # StepFun Step-3 Multimodal Chat (Hugging Face Space)
    - Upload an image and ask questions in text; the backend is compatible with the OpenAI Chat Completions API.
    - When running as a Space, set `STEPFUN_API_KEY` under Settings -> Variables and secrets.
    - Switch to **dev mode** in the top-right corner to view build/runtime logs.
    """)

    with gr.Row():
        model = gr.Dropdown(
            label="Model",
            choices=["step-3", "step-r1-v-mini"],
            value=DEFAULT_MODEL,
            interactive=True,
        )
        detail = gr.Dropdown(
            label="Image detail",
            choices=["high", "low", "auto"],
            value=DEFAULT_DETAIL,
            interactive=True,
        )

    system_prompt = gr.Textbox(
        label="System prompt (optional)",
        placeholder="e.g. You are a food expert; keep your answers concise.",
        lines=2,
    )

    chatbot = gr.Chatbot(height=420, show_label=False)

    with gr.Row():
        image = gr.Image(label="Upload an image (optional)", type="pil")
        user_text = gr.Textbox(label="Your question", placeholder="Describe your question...", lines=4)

    with gr.Row():
        submit = gr.Button("Send", variant="primary")
        clear = gr.Button("Clear chat")

    # Clear the conversation, the image, and the text box
    def _clear_chat():
        return [], None, ""

    clear.click(_clear_chat, outputs=[chatbot, image, user_text])

    # Send and stream the response
    submit.click(
        fn=stream_response,
        inputs=[user_text, image, model, detail, system_prompt, chatbot],
        outputs=[chatbot, user_text],
    )


if __name__ == "__main__":
    # Local debugging: python app.py
    demo.queue().launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))