Stremly committed on
Commit 15c1569 · verified · 1 Parent(s): 4c75728

Update app.py

Files changed (1)
app.py +24 -40
app.py CHANGED
@@ -11,10 +11,30 @@ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
 
 # ---- model & processor loaded on CPU ----
+from transformers import BitsAndBytesConfig
 
-# ─── lazy-load cache ──────────────────────────────────────────
-_MODEL = None      # will hold the quantised weights
-_PROCESSOR = None  # will hold the resized processor
+# 4-bit quantisation (~6 GB on H200)
+bnb_cfg = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+)
+
+_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "ByteDance-Seed/UI-TARS-1.5-7B",
+    quantization_config=bnb_cfg,
+    device_map="auto",
+    torch_dtype=torch.float16
+)
+
+_PROCESSOR = AutoProcessor.from_pretrained(
+    "ByteDance-Seed/UI-TARS-1.5-7B",
+    size={"shortest_edge": 512, "longest_edge": 1344},  # sane res
+    use_fast=True,
+)
+
+model = _MODEL
+processor = _PROCESSOR
 
 
 def draw_point(image: Image.Image, point=None, radius: int = 5):
@@ -39,38 +59,7 @@ def navigate(screenshot, task: str, platform: str, history):
     history (list | str | None): Previous messages list. Accepts either an
         actual Python list (via gr.JSON) or a JSON/Python‑literal string.
     """
-    global _MODEL, _PROCESSOR
-    # ------- on-demand model / processor load -------------------------
-    if _MODEL is None:
-        from transformers import BitsAndBytesConfig
-
-        # 4-bit quantisation (~6 GB on H200)
-        bnb_cfg = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-        )
-
-        _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            "ByteDance-Seed/UI-TARS-1.5-7B",
-            quantization_config=bnb_cfg,
-            device_map="auto",
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-        )
-
-        _PROCESSOR = AutoProcessor.from_pretrained(
-            "ByteDance-Seed/UI-TARS-1.5-7B",
-            size={"shortest_edge": 512, "longest_edge": 1344},  # sane res
-            use_fast=True,
-        )
-
-    # use mem-efficient attention kernels
-    torch.backends.cuda.enable_flash_sdp(False)
-    torch.backends.cuda.enable_mem_efficient_sdp(True)
-
-    model = _MODEL
-    processor = _PROCESSOR
+
 
     # ───────────────────── normalise history input ──────────────────────────
     try:
@@ -139,11 +128,6 @@ def navigate(screenshot, task: str, platform: str, history):
         pass
 
         return screenshot, raw_out, messages
-
-    finally:                      # ← always executed
-        torch.cuda.empty_cache()  # free unused blocks
-        torch.cuda.ipc_collect()  # defrag for next call
-
 
 # ────────────────────────── Gradio interface ───────────────────────────────
 
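Net effect: the 4-bit model and processor are now built once at import time, and navigate() simply uses the module-level objects; the lazy-load cache, the SDP-kernel toggles, and the per-call empty_cache()/ipc_collect() cleanup are all gone. For context, below is a minimal sketch of how such an eagerly loaded pair is typically driven for a single turn, following the standard Qwen2.5-VL inference recipe; the run_once helper and its prompt are illustrative, not part of app.py.

import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)

MODEL_ID = "ByteDance-Seed/UI-TARS-1.5-7B"

# Eager module-level load, mirroring the committed code.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_cfg,
    device_map="auto",
    torch_dtype=torch.float16,
)
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    size={"shortest_edge": 512, "longest_edge": 1344},
    use_fast=True,
)


def run_once(screenshot: Image.Image, prompt: str) -> str:
    """Run one model call on one screenshot (hypothetical helper)."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": screenshot},
            {"type": "text", "text": prompt},
        ],
    }]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        out_ids = model.generate(**inputs, max_new_tokens=128)
    # Decode only the newly generated tokens, not the echoed prompt.
    new_ids = out_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_ids, skip_special_tokens=True)[0]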
 
 
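The deleted lazy-load cache is still a reasonable shape when cold-start time matters more than first-request latency (for example, on Spaces where a GPU is attached per request). Below is a sketch of the same idea using functools.lru_cache instead of module globals, assuming the settings from the removed block; the function name is illustrative.

from functools import lru_cache

import torch
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)


@lru_cache(maxsize=1)
def get_model_and_processor():
    """Load on first call, then reuse; stands in for the removed _MODEL/_PROCESSOR globals."""
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        "ByteDance-Seed/UI-TARS-1.5-7B",
        quantization_config=bnb_cfg,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,  # kept from the removed code; dropped by this commit
    )
    processor = AutoProcessor.from_pretrained(
        "ByteDance-Seed/UI-TARS-1.5-7B",
        size={"shortest_edge": 512, "longest_edge": 1344},
        use_fast=True,
    )
    return model, processor

Whether the dropped low_cpu_mem_usage=True flag and the finally-block empty_cache()/ipc_collect() calls will be missed depends on how fragmented GPU memory gets across calls; both are cheap to reinstate if out-of-memory errors reappear.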