# GUI-Actor click-prediction service (FastAPI).
# NOTE(review): removed "Spaces: / Running / Running" lines — Hugging Face Space
# page chrome accidentally captured in a copy/paste; not part of the program.
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from PIL import Image
from io import BytesIO
import base64
import torch
from transformers import Qwen2VLProcessor
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

app = FastAPI()

# Load the model once at import time so every request reuses the same weights.
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # float32: half precision is poorly supported on CPU
    device_map=None,            # keep the model on CPU; don't auto-map to CUDA
    attn_implementation=None,   # default attention (flash-attn not available on CPU)
).eval()
class Base64Request(BaseModel):
    """Request payload: a base64-encoded screenshot plus an instruction to ground."""

    # Either a bare base64 string or a data URL ("data:image/png;base64,..." —
    # the handler strips everything up to the last comma before decoding).
    image_base64: str
    # Natural-language GUI task, e.g. "click the Submit button".
    instruction: str
# NOTE(review): the original paste had no route decorator, so this handler was
# never registered with `app` and the endpoint was unreachable. Restored it
# using the function name as the path — confirm the intended route.
@app.post("/predict_click_base64")
async def predict_click_base64(data: Base64Request):
    """Predict where to click for *data.instruction* on the supplied screenshot.

    Decodes the base64 image, builds a single-turn conversation for the
    GUI-Actor model, and returns JSON ``{"x": ..., "y": ...}`` — the model's
    top-ranked point, rounded to 4 decimal places (presumably coordinates
    normalized to [0, 1]; verify against `gui_actor.inference`).
    """
    # Accept both raw base64 and data-URL payloads ("data:image/png;base64,...").
    image_data = base64.b64decode(data.image_base64.split(",")[-1])
    pil_image = Image.open(BytesIO(image_data)).convert("RGB")

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": data.instruction,
                },
            ],
        },
    ]

    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
    px, py = pred["topk_points"][0]  # highest-confidence candidate point
    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})