# GUI-Agent / app.py
# abiyyufahri's picture
# Install error fix
# 55b2cb1
# raw
# history blame
# 1.95 kB
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from PIL import Image
from io import BytesIO
import base64
import torch
from transformers import Qwen2VLProcessor
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference
# ASGI application object; the route decorators below register against it.
app = FastAPI()

# Processor and model are created once at import time and reused per request.
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"

processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer

model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # full precision, intended for CPU
    device_map=None,  # no device mapping, so nothing is placed on cuda
    attn_implementation=None,
)
# Rebind to eval()'s return value, exactly as the original chained call did.
model = model.eval()
# JSON request body accepted by the POST /click/base64 endpoint.
class Base64Request(BaseModel):
    # Base64-encoded screenshot. A prefix ending in "," (e.g. a data URL
    # header) is tolerated: the handler keeps only the text after the last
    # comma before decoding.
    image_base64: str
    # Instruction text forwarded verbatim as the user's text turn in the
    # conversation given to the model.
    instruction: str
def _decode_base64_image(payload: str) -> Image.Image:
    """Decode a base64 string (optionally a data URL) into an RGB PIL image.

    Args:
        payload: Base64 text. Anything up to and including the last comma is
            discarded, which strips a ``data:image/...;base64,`` header.

    Returns:
        The decoded image converted to RGB mode.

    Raises:
        ValueError: The text is not valid base64 (``binascii.Error`` is a
            ``ValueError`` subclass).
        OSError: The decoded bytes are not an image Pillow can read
            (``UnidentifiedImageError`` is an ``OSError`` subclass).
    """
    encoded = payload.split(",")[-1]
    raw_bytes = base64.b64decode(encoded)
    # convert() forces the lazy Image.open() to actually read the pixel data,
    # so truncated files fail here rather than later during inference.
    return Image.open(BytesIO(raw_bytes)).convert("RGB")


@app.post("/click/base64")
async def predict_click_base64(data: Base64Request):
    """Predict a click point for an instruction on a base64-encoded screenshot.

    Args:
        data: Request body carrying the base64 image and the instruction.

    Returns:
        ``JSONResponse`` with ``{"x": ..., "y": ...}`` taken from the first
        entry of ``pred["topk_points"]`` and rounded to 4 decimals, or a
        400 response with a ``detail`` message when the image cannot be
        decoded. The coordinate system is whatever ``inference`` returns
        (presumably normalized to the image size; confirm in gui_actor).
    """
    try:
        pil_image = _decode_base64_image(data.image_base64)
    except (ValueError, OSError):
        # Bad client input should be a 400, not an unhandled 500.
        return JSONResponse(
            status_code=400,
            content={"detail": "image_base64 is not a valid base64-encoded image"},
        )

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": data.instruction,
                },
            ],
        },
    ]

    # NOTE(review): inference() is called synchronously inside an async
    # handler, so the event loop is blocked for the duration of the call.
    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)

    # Three candidates are requested (topk=3) but only the first is returned.
    px, py = pred["topk_points"][0]

    # float() is a no-op for builtin floats and guards against non-builtin
    # numeric types that the JSON encoder cannot serialize.
    return JSONResponse(content={"x": round(float(px), 4), "y": round(float(py), 4)})