Spaces:

Stremly
/

uitars

Running

App Files Community

Stremly committed on Jun 27

Commit

65f9291

verified ·

1 Parent(s): bb332d3

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -72

app.py CHANGED Viewed

@@ -11,16 +11,10 @@ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
 # ---- model & processor loaded on CPU ----
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "ByteDance-Seed/UI-TARS-1.5-7B",
-    device_map="auto",
-    torch_dtype=torch.float16,    # CPU‑friendly
-)
-processor = AutoProcessor.from_pretrained(
-    "ByteDance-Seed/UI-TARS-1.5-7B",
-    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
-    use_fast=True,
-)
 def draw_point(image: Image.Image, point=None, radius: int = 5):
@@ -46,72 +40,109 @@ def navigate(screenshot, task: str, platform: str, history):
             actual Python list (via gr.JSON) or a JSON/Python‑literal string.
     """
-    # ───────────────────── normalise history input ──────────────────────────
-    messages=[]
-    if isinstance(history, str):
-        try:
-            messages= ast.literal_eval(history)
-        except Exception as exc:
-            raise ValueError("`history` must be a JSON/Python list: " + str(exc))
-    else:
-        messages = history
-    prompt_header = (
-            "You are a GUI agent. You are given a task and your action history, with screenshots."
-            "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
-            f"## User Instruction\n{task}"
         )
-    current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
-    messages.append(current)
-    # ─────────────────────────── model forward ─────────────────────────────
-    images, videos = process_vision_info(messages)
-    i=0
-    for message in messages:
-        if message['role'] == 'user' and isinstance(message.get('content'), list):
-            for item in message['content']:
-                if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
-                    item['image_url'] = images[i]
-                    i+=1
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    print("\nimages\n:",images)
-    print("\ntext\n",text)
-    print("\nmessages\n",messages)
-    inputs = processor(
-        text=[text],
-        images=images,
-        videos=videos,
-        padding=True,
-        return_tensors="pt",
-    ).to("cuda")
-    generated = model.generate(**inputs, max_new_tokens=128)
-    trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
-    ]
-    raw_out = processor.batch_decode(
-        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
-    # ─────── draw predicted click for quick visual verification (optional) ──────
     try:
-        actions = ast.literal_eval(raw_out)
-        for act in actions if isinstance(actions, list) else [actions]:
-            pos = act.get("position")
-            if pos and isinstance(pos, list) and len(pos) == 2:
-                screenshot = draw_point(screenshot, pos)
-    except Exception:
-        # decoding failed → just return original screenshot
-        pass
-    return screenshot, raw_out, messages
 # ────────────────────────── Gradio interface ───────────────────────────────

 from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
 # ---- model & processor loaded on CPU ----
+# ─── lazy-load cache ──────────────────────────────────────────
+_MODEL = None                 # will hold the quantised weights
+_PROCESSOR = None             # will hold the resized processor
 def draw_point(image: Image.Image, point=None, radius: int = 5):
             actual Python list (via gr.JSON) or a JSON/Python‑literal string.
     """
+     # ------- on-demand model / processor load -------------------------
+    if _MODEL is None:
+        from transformers import BitsAndBytesConfig
+        # 4-bit quantisation (~6 GB on H200)
+        bnb_cfg = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
         )
+        _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            "ByteDance-Seed/UI-TARS-1.5-7B",
+            quantization_config=bnb_cfg,
+            device_map="auto",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+        )
+        _PROCESSOR = AutoProcessor.from_pretrained(
+            "ByteDance-Seed/UI-TARS-1.5-7B",
+            size={"shortest_edge": 512, "longest_edge": 1344},  # sane res
+            use_fast=True,
+        )
+        # use mem-efficient attention kernels
+        torch.backends.cuda.enable_flash_sdp(False)
+        torch.backends.cuda.enable_mem_efficient_sdp(True)
+        model = _MODEL
+        processor = _PROCESSOR
+    # ───────────────────── normalise history input ──────────────────────────
     try:
+        messages=[]
+        if isinstance(history, str):
+            try:
+                messages= ast.literal_eval(history)
+            except Exception as exc:
+                raise ValueError("`history` must be a JSON/Python list: " + str(exc))
+        else:
+            messages = history
+        prompt_header = (
+                "You are a GUI agent. You are given a task and your action history, with screenshots."
+                "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
+                f"## User Instruction\n{task}"
+            )
+        current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
+        messages.append(current)
+        # ─────────────────────────── model forward ─────────────────────────────
+        images, videos = process_vision_info(messages)
+        i=0
+        for message in messages:
+            if message['role'] == 'user' and isinstance(message.get('content'), list):
+                for item in message['content']:
+                    if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
+                        item['image_url'] = images[i]
+                        i+=1
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        print("\nimages\n:",images)
+        print("\ntext\n",text)
+        print("\nmessages\n",messages)
+        inputs = processor(
+            text=[text],
+            images=images,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+        ).to("cuda")
+        generated = model.generate(**inputs, max_new_tokens=128)
+        trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
+        ]
+        raw_out = processor.batch_decode(
+            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        # ─────── draw predicted click for quick visual verification (optional) ──────
+        try:
+            actions = ast.literal_eval(raw_out)
+            for act in actions if isinstance(actions, list) else [actions]:
+                pos = act.get("position")
+                if pos and isinstance(pos, list) and len(pos) == 2:
+                    screenshot = draw_point(screenshot, pos)
+        except Exception:
+            # decoding failed → just return original screenshot
+            pass
+        return screenshot, raw_out, messages
+    finally:                               # ← always executed
+        torch.cuda.empty_cache()           # free unused blocks
+        torch.cuda.ipc_collect()           # defrag for next call
 # ────────────────────────── Gradio interface ───────────────────────────────