Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Files Community

abiyyufahri committed on Jul 24

Commit

2cf117f

1 Parent(s): 665bdb7

Add base64 GUI click endpoint

Browse files

Files changed (4) hide show

Dockerfile +16 -0
README.md +5 -4
app.py +108 -0
requirements.txt +10 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Base image: python:3.9.
+FROM python:3.9
+# Install git (requirements.txt pulls one package from a git+https URL) and
+# create an unprivileged account named "user" with UID 1000.
+RUN apt-get update && apt-get install -y git && \
+    useradd -m -u 1000 user
+# All following instructions, and the container's command, run as that account.
+USER user
+# Put the account's ~/.local/bin first on PATH — presumably so console scripts
+# that pip installs for this user (e.g. uvicorn) can be found; confirm.
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Copy requirements.txt and install the dependencies before copying the rest
+# of the sources.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+COPY --chown=user . .
+# Serve the `app` object from app.py on all interfaces, port 7860 (the same
+# port the README front matter declares as app_port).
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,9 +1,10 @@
 ---
-title: GUI Agent
-emoji: 😻
-colorFrom: yellow
-colorTo: pink
 sdk: docker
 pinned: false
 ---

 ---
+title: GUI Actor VL Demo
+emoji: 🖱️
+colorFrom: gray
+colorTo: blue
 sdk: docker
+app_port: 7860
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from fastapi import FastAPI, UploadFile, Form
+from fastapi.responses import JSONResponse
+from PIL import Image
+from io import BytesIO
+import torch
+import base64
+from transformers import Qwen2VLProcessor
+from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
+from gui_actor.inference import inference
+app = FastAPI()  # ASGI application object; the Dockerfile's CMD serves it as "app:app"
+# Load the processor, tokenizer and model once at import time; both endpoints below reuse these module-level objects.
+model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
+processor = Qwen2VLProcessor.from_pretrained(model_name)
+tokenizer = processor.tokenizer
+model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,  # bfloat16 when CUDA is available, float32 otherwise
+    device_map="auto",
+    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,  # NOTE(review): flash_attention_2 presumably needs the separate flash-attn package, which requirements.txt does not list — confirm on GPU hosts
+).eval()
+@app.post("/click_base64")
+async def predict_click_base64(
+    image_base64: str = Form(...),
+    instruction: str = Form(...)
+):
+    # Decode base64 image
+    try:
+        if "," in image_base64:
+            image_base64 = image_base64.split(",")[1]
+        image_data = base64.b64decode(image_base64)
+        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
+    except Exception as e:
+        return JSONResponse(status_code=400, content={"error": f"Invalid image format: {str(e)}"})
+    # Prepare conversation
+    conversation = [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
+                }
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": pil_image,
+                },
+                {
+                    "type": "text",
+                    "text": instruction,
+                },
+            ],
+        },
+    ]
+    # Inference
+    try:
+        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
+        px, py = pred["topk_points"][0]
+        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
+@app.post("/click")
+async def predict_click(image: UploadFile, instruction: str = Form(...)):
+    # Load image
+    contents = await image.read()
+    pil_image = Image.open(BytesIO(contents)).convert("RGB")
+    conversation = [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
+                }
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": pil_image,
+                },
+                {
+                    "type": "text",
+                    "text": instruction,
+                },
+            ],
+        },
+    ]
+    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
+    px, py = pred["topk_points"][0]
+    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi
+uvicorn[standard]
+transformers
+torch
+datasets
+Pillow
+accelerate
+scipy
+# Additional library from the `gui_actor` repo
+git+https://github.com/microsoft/GUI-Actor.git