Abs6187 committed on
Commit bd4d774 · 1 Parent(s): 01c1f7b

Updated app.py

Files changed (1)
  1. app.py  +61 -82
app.py CHANGED
@@ -1,74 +1,58 @@
-# app.py
-import spaces
 import ast
-import torch
-from PIL import Image, ImageDraw
-import gradio as gr
 import base64
 from io import BytesIO
 
+import spaces
+import torch
+from PIL import Image, ImageDraw
+import gradio as gr
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
+from qwen_vl_utils import process_vision_info  # keep this file in repo
 
-_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+# ─── Model & Processor ──────────────────────────────────────────────────────
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     device_map="auto",
     torch_dtype=torch.float16
 )
-
-_PROCESSOR = AutoProcessor.from_pretrained(
+processor = AutoProcessor.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
-    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},  # sane res
+    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
     use_fast=True,
 )
 
-model = _MODEL
-processor = _PROCESSOR
-
-
-def draw_point(image: Image.Image, point=None, radius: int = 5):
-    """Overlay a red dot on the screenshot where the model clicked."""
-    img = image.copy()
-    if point:
-        x, y = point[0] * img.width, point[1] * img.height
-        ImageDraw.Draw(img).ellipse(
-            (x - radius, y - radius, x + radius, y + radius), fill="red"
-        )
+# ─── Helpers ────────────────────────────────────────────────────────────────
+def draw_point(img: Image.Image, pos=None, r: int = 5):
+    if pos:
+        x, y = pos[0] * img.width, pos[1] * img.height
+        ImageDraw.Draw(img).ellipse((x - r, y - r, x + r, y + r), fill="red")
     return img
 
-
+# ─── Core Inference ─────────────────────────────────────────────────────────
 @spaces.GPU
 def navigate(screenshot, task: str):
-    """Run one inference step on the GUI-reasoning model.
-
-    Args:
-        screenshot (PIL.Image): Latest UI screenshot.
-        task (str): Natural-language task description
-        history (list | str | None): Previous messages list. Accepts either an
-            actual Python list (via gr.JSON) or a JSON/Python-literal string.
-    """
-
-
-    # ───────────────────── normalise history input ──────────────────────────
-
-    messages=[]
-
     prompt_header = (
-        "You are a GUI agent. You are given a task and your action history, with screenshots."
-        "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
-        f"## User Instruction\n{task}"
-    )
-    current = {"role": "user", "content": [{"type": "text", "text": prompt_header}, {"type": "image_url", "image_url": screenshot}]}
-
-    messages.append(current)
+        "You are an expert GUI agent.\n"
+        "Given the task and previous screenshots, decide only the next action.\n\n"
+        "=== TASK ===\n"
+        f"{task}\n"
+        "==============\n\n"
+        "Respond strictly as JSON: {\"Thought\": \"…\", \"Action\": {…}}\n"
+        "Valid actions: click, left_double, right_single, drag, hotkey, type, scroll, wait, finished.\n"
+        "Use English for Thought. Summarize your plan in one sentence."
+    )
 
-    # New Comment 1
-    # ─────────────────────────── model forward ─────────────────────────────
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": prompt_header},
+            {"type": "image_url", "image_url": screenshot}
+        ]
+    }]
 
     images, videos = process_vision_info(messages)
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
     inputs = processor(
         text=[text],
         images=images,
@@ -77,50 +61,45 @@ def navigate(screenshot, task: str):
         return_tensors="pt",
     ).to("cuda")
 
-    generated = model.generate(**inputs, max_new_tokens=128)
-    trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)
-    ]
+    gen_ids = model.generate(**inputs, max_new_tokens=128)
     raw_out = processor.batch_decode(
-        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        [out[len(inp):] for inp, out in zip(inputs.input_ids, gen_ids)],
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
     )[0]
 
-    # ─────── draw predicted click for quick visual verification (optional) ──────
    try:
-        actions = ast.literal_eval(raw_out)
-        for act in actions if isinstance(actions, list) else [actions]:
-            pos = act.get("position")
-            if pos and isinstance(pos, list) and len(pos) == 2:
-                screenshot = draw_point(screenshot, pos)
+        for act in ast.literal_eval(raw_out) if isinstance(ast.literal_eval(raw_out), list) else [ast.literal_eval(raw_out)]:
+            p = act.get("position")
+            if p and len(p) == 2:
+                screenshot = draw_point(screenshot, p)
     except Exception:
-        # decoding failed → just return original screenshot
        pass
 
     return screenshot, raw_out, messages
 
-# ────────────────────────── Gradio interface ───────────────────────────────
-
-demo = gr.Interface(
-    fn=navigate,
-    inputs=[
-        gr.Image(type="pil", label="Screenshot"),
-        gr.Textbox(
-            lines=1,
-            placeholder="e.g. Search the weather for New York",
-            label="Task",
-        )
-    ],
-    outputs=[
-        gr.Image(label="With Click Point"),
-        gr.Textbox(label="Raw Action JSON"),
-        gr.JSON(label="Updated Conversation History")
-    ],
-    title="UI-Tars Navigation Demo",
-)
+# ─── UI ─────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="UI-Tars Navigation Demo", theme=gr.themes.Soft()) as demo:
+    with gr.Row():
+        with gr.Column(scale=3):
+            in_img = gr.Image(type="pil", height=380, show_label=False)
+            in_task = gr.Textbox(
+                placeholder="Describe what you want to do (e.g. “Open Gmail and compose a message”)",
+                lines=1,
+                label="Task"
+            )
+            run_btn = gr.Button("Run", variant="primary")
+        with gr.Column(scale=4):
+            out_img = gr.Image(label="With Click Point", height=380)
+            out_raw = gr.Textbox(label="Raw Action JSON", interactive=False)
+            with gr.Accordion("Conversation History", open=False):
+                out_hist = gr.JSON()
+
+    run_btn.click(navigate, inputs=[in_img, in_task], outputs=[out_img, out_raw, out_hist])
 
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
-    share=False,  # or True if you need a public link
-    ssr_mode=False,  # turn off experimental SSR so the process blocks
+    share=False,
+    ssr_mode=False,
 )
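
For quick local testing of the updated handler, the sketch below shows one way a caller might interpret the raw model output returned by navigate. This is an illustrative assumption, not part of the commit: the new prompt asks for a JSON object of the form {"Thought": ..., "Action": {...}}, while draw_point expects a normalized (x, y) pair in [0, 1]. The helper names parse_action and to_pixels are hypothetical; the parser first tries json.loads (matching the prompt's JSON instruction) and falls back to ast.literal_eval (matching the parsing already done in app.py).

# Illustrative sketch only, not part of app.py. Assumes the model follows the new
# prompt and returns e.g. {"Thought": "...", "Action": {"type": "click", "position": [x, y]}}.
import ast
import json

def parse_action(raw_out: str) -> dict:
    """Best-effort parse of the model's raw text into a dict."""
    try:
        return json.loads(raw_out)        # strict JSON, as the new prompt requests
    except json.JSONDecodeError:
        return ast.literal_eval(raw_out)  # fallback: Python-literal style output

def to_pixels(position, width: int, height: int) -> tuple[int, int]:
    """Map a normalized (x, y) in [0, 1] to pixel coordinates, mirroring draw_point."""
    x, y = position
    return int(x * width), int(y * height)

if __name__ == "__main__":
    raw = '{"Thought": "Click the search box.", "Action": {"type": "click", "position": [0.42, 0.11]}}'
    action = parse_action(raw)["Action"]
    print(to_pixels(action["position"], 1920, 1080))  # -> (806, 118)

If the model instead emits the older "Thought: ... / Action: click(start_box=...)" string format, this sketch would need a different parser; it only covers the JSON shape requested by the updated prompt.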