Harshit0414 committed
Commit cf3d408 · 1 Parent(s): 0863035

code added

Files changed (2)
  1. app.py +103 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,103 @@
+ """
+ Gradio demo for UI-TARS 1.5-7B (image-text-to-text) on Hugging Face Spaces.
+ Save this file as **app.py** and add a *requirements.txt* with the packages
+ listed below. Then create a new **Gradio** Space, upload both files and
+ commit; the Space will build and serve the app automatically.
+
+ requirements.txt (suggested versions)
+ -------------------------------------
+ transformers>=4.49   # Qwen2.5-VL support, required for UI-TARS 1.5
+ accelerate>=0.29.0
+ torch>=2.2
+ sentencepiece        # needed for many multilingual models
+ bitsandbytes         # optional: enables 4-bit quantization if Space has GPU
+ pillow
+ gradio>=4.33
+ """
+
+ from __future__ import annotations
+
+ from typing import List, Dict, Any
+
+ import gradio as gr
+ from PIL import Image
+ from transformers import pipeline
+ import base64
+ import io
+
+
+ def load_model():
+     """Load the UI-TARS multimodal pipeline once at startup."""
+     print("Loading UI-TARS 1.5-7B… this may take a while the first time.")
+     return pipeline(
+         "image-text-to-text",
+         model="ByteDance-Seed/UI-TARS-1.5-7B",
+         device_map="auto",  # automatically use GPU if available
+     )
+
+
+ pipe = load_model()
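+ # Note: UI-TARS 1.5-7B is a 7B-parameter vision-language model; in fp16 the
+ # weights alone take roughly 15 GB, so a GPU Space is effectively required.
+ # 4-bit loading via bitsandbytes is one option for smaller GPUs.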
+
+
+ def answer_question(image: Image.Image, question: str) -> str:
+     """Run the model on the provided image & question and return its answer."""
+     if image is None or not question.strip():
+         return "Please supply **both** an image and a question."
+
+     # Encode the screenshot as a base64 PNG (raw `image.tobytes()` pixel data
+     # is not a decodable image file).
+     buffer = io.BytesIO()
+     image.save(buffer, format="PNG")
+     base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+     # Compose a messages list in the expected multimodal chat format.
+     messages: List[Dict[str, Any]] = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": f"You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use Chinese in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n## User Instruction\n{question.strip()}"},
+             ],
+         },
+         {
+             "role": "user",
+             "content": [
+                 # The screenshot is passed as a base64 data URI.
+                 {"type": "image", "url": f"data:image/png;base64,{base64_image}"},
+             ],
+         },
+     ]
+
+     # The pipeline returns a list with one dict when `messages` is passed via
+     # the `text` keyword. We extract the generated text robustly.
+     outputs = pipe(text=messages, max_new_tokens=512)
+
+     if isinstance(outputs, list) and outputs:
+         first = outputs[0]
+         if isinstance(first, dict) and "generated_text" in first:
+             generated = first["generated_text"]
+             # With chat-style input the whole conversation may be returned;
+             # the assistant's reply is then the last message.
+             if isinstance(generated, list) and generated:
+                 return str(generated[-1].get("content", "")).strip()
+             return str(generated).strip()
+         return str(first)
+
+     return str(outputs)
+
+
+ demo = gr.Interface(
+     fn=answer_question,
+     inputs=[
+         gr.Image(type="pil", label="Upload image"),
+         gr.Textbox(label="Ask a question about the image", placeholder="e.g. What animal is on the candy?"),
+     ],
+     outputs=gr.Textbox(label="UI-TARS answer"),
+     title="UI-TARS 1.5-7B – Visual Q&A",
+     description=(
+         "Upload an image and ask a question. The **UI-TARS 1.5-7B** model will "
+         "answer based on the visual content. Inference runs entirely inside this Space."
+     ),
+     examples=[
+         [
+             "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
+             "What animal is on the candy?",
+         ]
+     ],
+     cache_examples=True,  # runs the model once per example at startup to cache outputs
+     allow_flagging="never",
+ )
+
+
+ if __name__ == "__main__":
+     # Spaces automatically call `demo.launch()`, but running locally this
+     # guard lets you execute `python app.py` for quick tests.
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ accelerate
+ torch
+ sentencepiece
+ bitsandbytes
+ pillow
+ gradio