abiyyufahri committed on
Commit
2cf117f
·
1 Parent(s): 665bdb7

Add base64 GUI click endpoint

Browse files
Files changed (4) hide show
  1. Dockerfile +16 -0
  2. README.md +5 -4
  3. app.py +108 -0
  4. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the FastAPI service defined in app.py.
FROM python:3.9

# git is required because requirements.txt installs a package from a
# git+https URL. The apt package index is removed in the same layer so it
# does not end up in the final image. A non-root user with UID 1000 is
# created for everything that follows.
RUN apt-get update && apt-get install -y git && \
    rm -rf /var/lib/apt/lists/* && \
    useradd -m -u 1000 user
USER user
# pip installs run as the non-root user, so console scripts (e.g. uvicorn)
# land in ~/.local/bin; put that directory on PATH.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy requirements.txt on its own first so the dependency layer is reused
# when only the application code changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

COPY --chown=user . .

# Serve the FastAPI app on all interfaces at port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,9 +1,10 @@
1
  ---
2
- title: GUI Agent
3
- emoji: 😻
4
- colorFrom: yellow
5
- colorTo: pink
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
 
1
  ---
2
+ title: GUI Actor VL Demo
3
+ emoji: 🖱️
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  ---
10
 
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, Form
2
+ from fastapi.responses import JSONResponse
3
+ from PIL import Image
4
+ from io import BytesIO
5
+ import torch
6
+ import base64
7
+
8
+ from transformers import Qwen2VLProcessor
9
+ from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
10
+ from gui_actor.inference import inference
11
+
12
+ app = FastAPI()
13
+
14
# Load model
def _select_attention_backend(cuda_available):
    """Return the ``attn_implementation`` value to pass to ``from_pretrained``.

    ``"flash_attention_2"`` is only chosen when CUDA is available *and* the
    optional ``flash_attn`` package is importable. Requesting it without the
    package installed makes model loading fail, so in every other case
    ``None`` is returned and transformers picks its default implementation.
    """
    # Local import keeps this block self-contained.
    import importlib.util

    if cuda_available and importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"
    return None


model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
_cuda_available = torch.cuda.is_available()

processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    # bfloat16 on GPU, full precision on CPU.
    torch_dtype=torch.bfloat16 if _cuda_available else torch.float32,
    device_map="auto",
    attn_implementation=_select_attention_backend(_cuda_available),
).eval()
24
+
25
+
26
@app.post("/click_base64")
async def predict_click_base64(
    image_base64: str = Form(...),
    instruction: str = Form(...)
):
    """Predict a click point for a base64-encoded screenshot.

    Form fields:
        image_base64: The image as base64 text. If the value contains a
            comma, the segment after the first comma is decoded (this
            handles a ``data:...;base64,`` style prefix).
        instruction: The text instruction sent to the model with the image.

    Returns:
        JSONResponse with ``{"x": ..., "y": ...}`` taken from the first
        entry of the prediction's ``"topk_points"``, rounded to 4 decimals.
        Status 400 with an ``"error"`` key if the image cannot be decoded,
        status 500 with an ``"error"`` key if inference raises.
    """
    # Decode the payload into an RGB image; any failure is a client error.
    try:
        encoded = image_base64.split(",")[1] if "," in image_base64 else image_base64
        raw_bytes = base64.b64decode(encoded)
        screenshot = Image.open(BytesIO(raw_bytes)).convert("RGB")
    except Exception as exc:
        return JSONResponse(
            status_code=400,
            content={"error": f"Invalid image format: {str(exc)}"},
        )

    # Two-turn conversation: fixed system prompt, then image + instruction.
    system_turn = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
            }
        ],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": screenshot},
            {"type": "text", "text": instruction},
        ],
    }
    conversation = [system_turn, user_turn]

    # Run the model and report the first of the top-k points.
    try:
        prediction = inference(
            conversation, model, tokenizer, processor, use_placeholder=True, topk=3
        )
        point_x, point_y = prediction["topk_points"][0]
        return JSONResponse(content={"x": round(point_x, 4), "y": round(point_y, 4)})
    except Exception as exc:
        return JSONResponse(
            status_code=500,
            content={"error": f"Inference failed: {str(exc)}"},
        )
73
+
74
+
75
@app.post("/click")
async def predict_click(image: UploadFile, instruction: str = Form(...)):
    """Predict a click point for an uploaded screenshot.

    Args:
        image: The uploaded image file; its bytes are opened with PIL and
            converted to RGB.
        instruction: The text instruction sent to the model with the image.

    Returns:
        JSONResponse with ``{"x": ..., "y": ...}`` taken from the first
        entry of the prediction's ``"topk_points"``, rounded to 4 decimals.
        Status 400 with an ``"error"`` key if the upload cannot be read as
        an image, status 500 with an ``"error"`` key if inference raises.
        The error payloads use the same wording as ``/click_base64``.
    """
    # Load image; an unreadable upload is reported as a client error
    # instead of surfacing as an unhandled exception.
    try:
        contents = await image.read()
        pil_image = Image.open(BytesIO(contents)).convert("RGB")
    except Exception as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Invalid image format: {str(e)}"},
        )

    # Two-turn conversation: fixed system prompt, then image + instruction.
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": instruction,
                },
            ],
        },
    ]

    # Inference; failures are returned as a JSON error rather than raised.
    try:
        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
        px, py = pred["topk_points"][0]
        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": f"Inference failed: {str(e)}"},
        )
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ transformers
4
+ torch
5
+ datasets
6
+ Pillow
7
+ accelerate
8
+ scipy
9
+ # additional library from the `gui_actor` repo
10
+ git+https://github.com/microsoft/GUI-Actor.git