Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Files Community

abiyyufahri committed on Jul 24

Commit

e670b79

1 Parent(s): 755e79c

Install error fix attempt 6

Browse files

Files changed (3) hide show

Dockerfile +28 -7
app.py +93 -40
requirements.txt +4 -1

Dockerfile CHANGED Viewed

@@ -1,21 +1,42 @@
-FROM python:3.10-slim
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    git gcc libglib2.0-0 libsm6 libxext6 libxrender-dev && \
     rm -rf /var/lib/apt/lists/*
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
-COPY --chown=user requirements.txt ./
-# Install dependencies in stages to handle build dependencies
 RUN pip install --upgrade pip && \
-    pip install --no-cache-dir packaging ninja wheel setuptools && \
-    pip install --no-cache-dir torch==2.2.2 && \
-    pip install --no-cache-dir -r requirements.txt
 COPY --chown=user . .

+# CUDA "devel" base image (ships nvcc) so CUDA extensions such as flash-attn can be compiled.
+# Published nvidia/cuda tags use a three-part CUDA version (e.g. 12.1.1); "12.1-devel-..." cannot be pulled.
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+# Install Python 3.10 together with the compiler toolchain and the shared libraries needed at runtime
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.10 python3.10-dev python3-pip python3.10-venv \
+    git gcc g++ libglib2.0-0 libsm6 libxext6 libxrender-dev \
+    build-essential curl && \
     rm -rf /var/lib/apt/lists/*
+# Point `python` and `python3` at Python 3.10.
+# -f is required: python3-pip already pulls in /usr/bin/python3, so a plain `ln -s` fails with "File exists".
+RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
+    ln -sf /usr/bin/python3.10 /usr/bin/python3
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
+# Install dependencies step by step to avoid version conflicts
 RUN pip install --upgrade pip && \
+    pip install --no-cache-dir packaging ninja wheel setuptools numpy
+# Install PyTorch with CUDA 12.1 support
+RUN pip install --no-cache-dir torch==2.2.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+# Install the remaining dependencies before GUI-Actor
+RUN pip install --no-cache-dir \
+    transformers \
+    datasets \
+    Pillow \
+    accelerate \
+    scipy \
+    qwen-vl-utils \
+    fastapi \
+    "uvicorn[standard]"
+# Install the GUI-Actor package last (includes flash-attn)
+RUN pip install --no-cache-dir "git+https://github.com/microsoft/GUI-Actor.git"
+# NOTE(review): no CMD/ENTRYPOINT is visible in this hunk — confirm how app.py is started in the container.
 COPY --chown=user . .

app.py CHANGED Viewed

@@ -6,60 +6,113 @@ from io import BytesIO
 import base64
 import torch
 from transformers import Qwen2VLProcessor
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference
 app = FastAPI()
-# Load model
-model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
-processor = Qwen2VLProcessor.from_pretrained(model_name)
-tokenizer = processor.tokenizer
 model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
-    model_name,
-    torch_dtype=torch.float32,  # use float32 for CPU
-    device_map=None,            # don't map to cuda
-    attn_implementation=None,
 ).eval()
 class Base64Request(BaseModel):
     image_base64: str
     instruction: str
 @app.post("/click/base64")
 async def predict_click_base64(data: Base64Request):
-    # Decode base64 to image
-    image_data = base64.b64decode(data.image_base64.split(",")[-1])
-    pil_image = Image.open(BytesIO(image_data)).convert("RGB")
-    conversation = [
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
-                }
-            ]
-        },
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": pil_image,
-                },
-                {
-                    "type": "text",
-                    "text": data.instruction,
-                },
-            ],
-        },
-    ]
-    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
-    px, py = pred["topk_points"][0]
-    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})

 import base64
 import torch
+# Imports as specified in the GUI-Actor documentation
+from qwen_vl_utils import process_vision_info
 from transformers import Qwen2VLProcessor
+from gui_actor.constants import chat_template
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference
 app = FastAPI()
+# Load the processor, tokenizer and model once at import time, following the GUI-Actor documentation
+model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
+data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
+tokenizer = data_processor.tokenizer
+# Adapt to CPU or GPU: use CUDA with bfloat16 when a GPU is available, otherwise fall back to CPU with float32
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
 model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
+    model_name_or_path,
+    torch_dtype=torch_dtype,
+    # On CPU no device map and no explicit attention implementation are passed (both None);
+    # flash_attention_2 is only requested on CUDA.
+    device_map=device if device == "cuda" else None,
+    attn_implementation="flash_attention_2" if device == "cuda" else None
 ).eval()
+# Request body accepted by POST /click/base64.
 class Base64Request(BaseModel):
+    # Base64-encoded screenshot; the handler keeps only the text after the last comma,
+    # so a "data:...;base64," prefix is tolerated.
     image_base64: str
+    # Instruction text passed to the model as the user message.
     instruction: str
 @app.post("/click/base64")
 async def predict_click_base64(data: Base64Request):
+    """Predict where to click on a base64-encoded screenshot for an instruction.
+
+    Args:
+        data: Request carrying the base64 screenshot (an optional
+            ``data:...;base64,`` prefix is ignored) and the instruction text.
+
+    Returns:
+        JSONResponse with ``x``/``y`` of the first predicted point,
+        ``all_points`` (every returned top-k point, rounded to 4 decimals)
+        and ``success: True``. On failure the body is
+        ``{"error": ..., "success": False}`` with status 400 when the image
+        cannot be decoded, or 500 for any other error.
+    """
+    # Decode the screenshot separately so malformed client input is reported
+    # as 400 (bad request) rather than being lumped in with server errors.
+    try:
+        # Keep only the payload after an optional data-URL prefix.
+        image_data = base64.b64decode(data.image_base64.split(",")[-1])
+        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
+    except (ValueError, OSError) as e:
+        # binascii.Error (bad base64) is a ValueError; Pillow's
+        # UnidentifiedImageError (not an image) is an OSError.
+        return JSONResponse(
+            content={
+                "error": str(e),
+                "success": False
+            },
+            status_code=400
+        )
+    except Exception as e:
+        # Anything else during decoding keeps the original 500 JSON shape.
+        return JSONResponse(
+            content={
+                "error": str(e),
+                "success": False
+            },
+            status_code=500
+        )
+    try:
+        conversation = [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
+                    }
+                ]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": pil_image,
+                    },
+                    {
+                        "type": "text",
+                        "text": data.instruction,
+                    },
+                ],
+            },
+        ]
+        # Run inference with the helper provided by GUI-Actor
+        pred = inference(
+            conversation,
+            model,
+            tokenizer,
+            data_processor,
+            use_placeholder=True,
+            topk=3
+        )
+        # The first entry of topk_points is reported as the primary click target.
+        px, py = pred["topk_points"][0]
+        return JSONResponse(content={
+            "x": round(px, 4),
+            "y": round(py, 4),
+            "all_points": [[round(x, 4), round(y, 4)] for x, y in pred["topk_points"]],
+            "success": True
+        })
+    except Exception as e:
+        # Top-level boundary: report any inference failure as a JSON 500.
+        return JSONResponse(
+            content={
+                "error": str(e),
+                "success": False
+            },
+            status_code=500
+        )
+# Health endpoint: reports the configured model id plus the device and dtype selected at import time.
+@app.get("/health")
+async def health_check():
+    # Returns a static payload built from module-level values; no inference is run here.
+    return {
+        "status": "healthy",
+        "model": model_name_or_path,
+        "device": device,
+        "torch_dtype": str(torch_dtype)
+    }
+# Additional endpoint for testing with form data: wraps the two form fields in a
+# Base64Request and delegates to predict_click_base64, so the response is identical.
+# NOTE(review): FastAPI form parsing needs the python-multipart package, which is not in
+# the visible pip install list — confirm it is installed.
+@app.post("/click/form")
+async def predict_click_form(
+    image_base64: str = Form(...),
+    instruction: str = Form(...)
+):
+    data = Base64Request(image_base64=image_base64, instruction=instruction)
+    return await predict_click_base64(data)
+# When executed directly as a script, serve the app with uvicorn on all interfaces, port 7860.
+if __name__ == "__main__":
+    # Imported here so uvicorn is only needed when the module is run as a script.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt CHANGED Viewed

@@ -6,6 +6,9 @@ transformers
 datasets
 Pillow
 torch==2.2.2
 accelerate
 scipy
-git+https://github.com/microsoft/GUI-Actor.git

 datasets
 Pillow
 torch==2.2.2
+torchvision
+torchaudio
 accelerate
 scipy
+qwen-vl-utils
+git+https://github.com/microsoft/GUI-Actor.git