Stremly committed
Commit e995be0 · verified · 1 Parent(s): 4244b33

Update app.py

Files changed (1)
  1. app.py +89 -30
app.py CHANGED
@@ -5,71 +5,130 @@ import torch
from PIL import Image, ImageDraw
import gradio as gr

- from transformers import Qwen2_5_VLForConditionalGeneration
- from transformers import AutoProcessor
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable

# ---- model & processor loaded on CPU ----
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "ByteDance-Seed/UI-TARS-1.5-7B",
    device_map="auto",
-     torch_dtype=torch.float32,  # CPU-friendly
+     torch_dtype=torch.float32,  # CPU‑friendly
)
processor = AutoProcessor.from_pretrained(
    "ByteDance-Seed/UI-TARS-1.5-7B",
    size={"shortest_edge": 256 * 28 * 28, "longest_edge": 1344 * 28 * 28},
    use_fast=True,
-
)

+
- def draw_point(image: Image.Image, point=None, radius=5):
+ def draw_point(image: Image.Image, point=None, radius: int = 5):
+     """Overlay a red dot on the screenshot where the model clicked."""
    img = image.copy()
    if point:
        x, y = point[0] * img.width, point[1] * img.height
        ImageDraw.Draw(img).ellipse(
-             (x - radius, y - radius, x + radius, y + radius), fill='red'
+             (x - radius, y - radius, x + radius, y + radius), fill="red"
        )
    return img

+
@spaces.GPU
- def navigate(image, task, platform):
-     messages = [
-         {"role": "user", "content": [{"type": "text", "text": f"You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='"},
-             {"type": "image_url", "image_url": image}
-         ]}
+ def navigate(screenshot, task: str, platform: str, history):
+     """Run one inference step on the GUI‑reasoning model.
+
+     Args:
+         screenshot (PIL.Image): Latest UI screenshot.
+         task (str): Natural‑language task description.
+         platform (str): Either "web" or "phone" for prompt conditioning.
+         history (list | str | None): Previous messages list. Accepts either an
+             actual Python list (via gr.JSON) or a JSON/Python‑literal string.
+     """
+
+     # ───────────────────── normalise history input ──────────────────────────
+     if history in (None, ""):
+         history_list = []
+     else:
+         if isinstance(history, str):
+             try:
+                 history_list = ast.literal_eval(history)
+             except Exception as exc:
+                 raise ValueError("`history` must be a JSON/Python list: " + str(exc))
+         else:
+             history_list = history
+
+     if not isinstance(history_list, list):
+         raise ValueError("`history` must be a list of messages.")
+
+     # ─────────────────── construct current user message ─────────────────────
+     prompt_header = (
+         "You are a GUI agent. You are given a task and your action history, "
+         "with screenshots. You need to perform the next action to complete "
+         "the task.\n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
+         "## Action Space\nclick(start_box='...') / type(...)\n\n"
+         f"### Task\n{task}"
+     )
+
+     current_content = [
+         {"type": "text", "text": prompt_header},
+         {"type": "image_url", "image_url": screenshot},
    ]
-     # prepare inputs
-     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+     messages = history_list + [{"role": "user", "content": current_content}]
+
+     # ─────────────────────────── model forward ─────────────────────────────
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
    images, videos = process_vision_info(messages)
-     inputs = processor(text=[text], images=images, videos=videos, padding=True, return_tensors="pt")
-     inputs = inputs.to("cuda")
+     inputs = processor(
+         text=[text],
+         images=images,
+         videos=videos,
+         padding=True,
+         return_tensors="pt",
+     ).to("cuda")

-     # generate
    generated = model.generate(**inputs, max_new_tokens=128)
-     trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)]
-     out = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+     trimmed = [
+         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
+     ]
+     raw_out = processor.batch_decode(
+         trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )[0]

-     # optionally parse JSON and draw point
+     # ─────── draw predicted click for quick visual verification (optional) ──────
    try:
-         actions = ast.literal_eval(out)
+         actions = ast.literal_eval(raw_out)
        for act in actions if isinstance(actions, list) else [actions]:
-             pos = act.get('position')
-             if pos and isinstance(pos, list) and len(pos)==2:
-                 image = draw_point(image, pos)
-         return image, out
-     except:
-         return image, out
+             pos = act.get("position")
+             if pos and isinstance(pos, list) and len(pos) == 2:
+                 screenshot = draw_point(screenshot, pos)
+     except Exception:
+         # decoding failed → just return original screenshot
+         pass
+
+     return screenshot, raw_out


+ # ────────────────────────── Gradio interface ───────────────────────────────
+
demo = gr.Interface(
    fn=navigate,
    inputs=[
        gr.Image(type="pil", label="Screenshot"),
-         gr.Textbox(lines=1, placeholder="e.g. Search the weather for New York", label="Task"),
+         gr.Textbox(
+             lines=1,
+             placeholder="e.g. Search the weather for New York",
+             label="Task",
+         ),
        gr.Dropdown(choices=["web", "phone"], value="web", label="Platform"),
+         gr.JSON(label="Conversation History (list)", value=[]),
+     ],
+     outputs=[
+         gr.Image(label="With Click Point"),
+         gr.Textbox(label="Raw Action JSON"),
    ],
-     outputs=[gr.Image(label="With Click Point"), gr.Textbox(label="Raw Action JSON")],
-     title="ShowUI-2B Navigation Demo",
+     title="ShowUI‑2B Navigation Demo",
)

demo.launch(
@@ -77,4 +136,4 @@ demo.launch(
    server_port=7860,
    share=False,  # or True if you need a public link
    ssr_mode=False,  # turn off experimental SSR so the process blocks
- )
+ )
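
For quick manual testing, a minimal client-side sketch of how the updated navigate endpoint could be called once the Space is running. The Space id "Stremly/ui-tars-demo" is a hypothetical placeholder, and the "/predict" API name assumes the default endpoint that gr.Interface exposes; the empty list mirrors the new gr.JSON history input, and the image output comes back as a local file path.

# Hedged sketch: "Stremly/ui-tars-demo" is hypothetical; point this at wherever
# the app is actually hosted.
from gradio_client import Client, handle_file

client = Client("Stremly/ui-tars-demo")  # hypothetical Space id

history = []  # first turn: no prior messages

annotated_path, raw_action = client.predict(
    handle_file("screenshot.png"),        # Screenshot (gr.Image input)
    "Search the weather for New York",    # Task
    "web",                                # Platform
    history,                              # Conversation History (gr.JSON input)
    api_name="/predict",
)
print(raw_action)  # expected shape: "Thought: ...\nAction: click(start_box='...')"

To continue a multi-step episode, the caller would append the previous turn's messages to history in the same role/content format that apply_chat_template expects.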