abiyyufahri committed on
Commit
e670b79
·
1 Parent(s): 755e79c

Install error fix attempt 6

Browse files
Files changed (3) hide show
  1. Dockerfile +28 -7
  2. app.py +93 -40
  3. requirements.txt +4 -1
Dockerfile CHANGED
@@ -1,21 +1,42 @@
1
- FROM python:3.10-slim
2
 
 
3
  RUN apt-get update && apt-get install -y --no-install-recommends \
4
- git gcc libglib2.0-0 libsm6 libxext6 libxrender-dev && \
 
 
5
  rm -rf /var/lib/apt/lists/*
6
 
 
 
 
 
7
  RUN useradd -m -u 1000 user
8
  USER user
9
  ENV PATH="/home/user/.local/bin:$PATH"
10
 
11
  WORKDIR /app
12
- COPY --chown=user requirements.txt ./
13
 
14
- # Install dependencies in stages to handle build dependencies
15
  RUN pip install --upgrade pip && \
16
- pip install --no-cache-dir packaging ninja wheel setuptools && \
17
- pip install --no-cache-dir torch==2.2.2 && \
18
- pip install --no-cache-dir -r requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  COPY --chown=user . .
21
 
 
1
+ FROM nvidia/cuda:12.1-devel-ubuntu22.04
2
 
3
+ # Install Python 3.10
4
  RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ python3.10 python3.10-dev python3-pip python3.10-venv \
6
+ git gcc g++ libglib2.0-0 libsm6 libxext6 libxrender-dev \
7
+ build-essential curl && \
8
  rm -rf /var/lib/apt/lists/*
9
 
10
+ # Create symbolic links for python
11
+ RUN ln -s /usr/bin/python3.10 /usr/bin/python && \
12
+ ln -s /usr/bin/python3.10 /usr/bin/python3
13
+
14
  RUN useradd -m -u 1000 user
15
  USER user
16
  ENV PATH="/home/user/.local/bin:$PATH"
17
 
18
  WORKDIR /app
 
19
 
20
+ # Install dependencies step by step to avoid conflicts
21
  RUN pip install --upgrade pip && \
22
+ pip install --no-cache-dir packaging ninja wheel setuptools numpy
23
+
24
+ # Install PyTorch with CUDA support
25
+ RUN pip install --no-cache-dir torch==2.2.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
26
+
27
+ # Install the other dependencies before GUI-Actor
28
+ RUN pip install --no-cache-dir \
29
+ transformers \
30
+ datasets \
31
+ Pillow \
32
+ accelerate \
33
+ scipy \
34
+ qwen-vl-utils \
35
+ fastapi \
36
+ "uvicorn[standard]"
37
+
38
+ # Install the GUI-Actor package last (includes flash-attn)
39
+ RUN pip install --no-cache-dir "git+https://github.com/microsoft/GUI-Actor.git"
40
 
41
  COPY --chown=user . .
42
 
app.py CHANGED
@@ -6,60 +6,113 @@ from io import BytesIO
6
  import base64
7
  import torch
8
 
 
 
9
  from transformers import Qwen2VLProcessor
 
10
  from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
11
  from gui_actor.inference import inference
12
 
13
  app = FastAPI()
14
 
15
- # Load model
16
- model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
17
- processor = Qwen2VLProcessor.from_pretrained(model_name)
18
- tokenizer = processor.tokenizer
 
 
 
 
 
19
  model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
20
- model_name,
21
- torch_dtype=torch.float32, # use float32 for CPU
22
- device_map=None, # don't map to cuda
23
- attn_implementation=None,
24
  ).eval()
25
 
26
-
27
  class Base64Request(BaseModel):
28
  image_base64: str
29
  instruction: str
30
 
31
-
32
  @app.post("/click/base64")
33
  async def predict_click_base64(data: Base64Request):
34
- # Decode base64 to image
35
- image_data = base64.b64decode(data.image_base64.split(",")[-1])
36
- pil_image = Image.open(BytesIO(image_data)).convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- conversation = [
39
- {
40
- "role": "system",
41
- "content": [
42
- {
43
- "type": "text",
44
- "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
45
- }
46
- ]
47
- },
48
- {
49
- "role": "user",
50
- "content": [
51
- {
52
- "type": "image",
53
- "image": pil_image,
54
- },
55
- {
56
- "type": "text",
57
- "text": data.instruction,
58
- },
59
- ],
60
- },
61
- ]
62
 
63
- pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
64
- px, py = pred["topk_points"][0]
65
- return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
 
6
  import base64
7
  import torch
8
 
9
+ # Imports as per the GUI-Actor documentation
10
+ from qwen_vl_utils import process_vision_info
11
  from transformers import Qwen2VLProcessor
12
+ from gui_actor.constants import chat_template
13
  from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
14
  from gui_actor.inference import inference
15
 
16
  app = FastAPI()
17
 
18
+ # Load the model as per the documentation
19
+ model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
20
+ data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
21
+ tokenizer = data_processor.tokenizer
22
+
23
+ # Adjust for CPU or GPU
24
+ device = "cuda" if torch.cuda.is_available() else "cpu"
25
+ torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
26
+
27
  model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
28
+ model_name_or_path,
29
+ torch_dtype=torch_dtype,
30
+ device_map=device if device == "cuda" else None,
31
+ attn_implementation="flash_attention_2" if device == "cuda" else None
32
  ).eval()
33
 
 
34
  class Base64Request(BaseModel):
35
  image_base64: str
36
  instruction: str
37
 
 
38
  @app.post("/click/base64")
39
  async def predict_click_base64(data: Base64Request):
40
+ try:
41
+ # Decode base64 to image
42
+ image_data = base64.b64decode(data.image_base64.split(",")[-1])
43
+ pil_image = Image.open(BytesIO(image_data)).convert("RGB")
44
+
45
+ conversation = [
46
+ {
47
+ "role": "system",
48
+ "content": [
49
+ {
50
+ "type": "text",
51
+ "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
52
+ }
53
+ ]
54
+ },
55
+ {
56
+ "role": "user",
57
+ "content": [
58
+ {
59
+ "type": "image",
60
+ "image": pil_image,
61
+ },
62
+ {
63
+ "type": "text",
64
+ "text": data.instruction,
65
+ },
66
+ ],
67
+ },
68
+ ]
69
+
70
+ # Run inference using the function provided by GUI-Actor
71
+ pred = inference(
72
+ conversation,
73
+ model,
74
+ tokenizer,
75
+ data_processor,
76
+ use_placeholder=True,
77
+ topk=3
78
+ )
79
+
80
+ px, py = pred["topk_points"][0]
81
+
82
+ return JSONResponse(content={
83
+ "x": round(px, 4),
84
+ "y": round(py, 4),
85
+ "all_points": [[round(x, 4), round(y, 4)] for x, y in pred["topk_points"]],
86
+ "success": True
87
+ })
88
+
89
+ except Exception as e:
90
+ return JSONResponse(
91
+ content={
92
+ "error": str(e),
93
+ "success": False
94
+ },
95
+ status_code=500
96
+ )
97
+
98
+ @app.get("/health")
99
+ async def health_check():
100
+ return {
101
+ "status": "healthy",
102
+ "model": model_name_or_path,
103
+ "device": device,
104
+ "torch_dtype": str(torch_dtype)
105
+ }
106
 
107
+ # Additional endpoint for testing with form data
108
+ @app.post("/click/form")
109
+ async def predict_click_form(
110
+ image_base64: str = Form(...),
111
+ instruction: str = Form(...)
112
+ ):
113
+ data = Base64Request(image_base64=image_base64, instruction=instruction)
114
+ return await predict_click_base64(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ if __name__ == "__main__":
117
+ import uvicorn
118
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -6,6 +6,9 @@ transformers
6
  datasets
7
  Pillow
8
  torch==2.2.2
 
 
9
  accelerate
10
  scipy
11
- git+https://github.com/microsoft/GUI-Actor.git
 
 
6
  datasets
7
  Pillow
8
  torch==2.2.2
9
+ torchvision
10
+ torchaudio
11
  accelerate
12
  scipy
13
+ qwen-vl-utils
14
+ git+https://github.com/microsoft/GUI-Actor.git