mrfakename committed on
Commit
d65703f
·
verified ·
1 Parent(s): c7d7380

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -40
app.py CHANGED
@@ -1,49 +1,60 @@
1
"""Original (pre-commit) OS-Atlas inference script: one hard-coded example."""
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


# Default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "OS-Copilot/OS-Atlas-Base-7B", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("OS-Copilot/OS-Atlas-Base-7B")

# NOTE(review): "exmaples" looks like a typo for "examples" — confirm against
# the actual on-disk directory before changing; the upstream OS-Atlas sample
# code uses this exact spelling, so it is preserved here.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "./exmaples/images/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png",
            },
            {"type": "text", "text": "In this UI screenshot, what is the position of the element corresponding to the command \"switch language of current page\" (with bbox)?"},
        ],
    }
]


# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the prompt tokens so only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# skip_special_tokens=False keeps the <|box_start|>/<|box_end|> grounding
# markers, which carry the bounding-box answer.
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
)
print(output_text)
# <|object_ref_start|>language switch<|object_ref_end|><|box_start|>(576,12),(592,42)<|box_end|><|im_end|>
 
1
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import spaces

# Default: Load the model on the available device(s).
# Loaded once at import time so every Gradio request reuses the same weights.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "OS-Copilot/OS-Atlas-Base-7B", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("OS-Copilot/OS-Atlas-Base-7B")
10
@spaces.GPU
def run(image, message):
    """Run OS-Atlas grounding on one UI screenshot.

    Parameters
    ----------
    image : the screenshot, passed straight into the chat message
        (presumably a PIL image or file path — qwen_vl_utils accepts
        both; TODO confirm against the Gradio `gr.Image` output type).
    message : str
        The textual instruction / prompt.

    Returns
    -------
    list[str]
        The decoded model output. Special tokens are kept because the
        <|box_start|>/<|box_end|> markers carry the grounding answer.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    # BUG FIX: the previous text read `"image": "image,` —
                    # an unterminated string literal. The actual image
                    # argument must be passed here, not a string.
                    "image": image,
                },
                {"type": "text", "text": message},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )
    return output_text
    # example output:
    # <|object_ref_start|>language switch<|object_ref_end|><|box_start|>(576,12),(592,42)<|box_end|><|im_end|>
52
 
53
# BUG FIX: `gr` is used below but `gradio` was never imported anywhere in
# the file — the app would crash with NameError at startup.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("# Unofficial OS-Atlas demo")
    image = gr.Image(label="Image")
    text = gr.Textbox(label="Prompt")
    btn = gr.Button("Generate", variant="primary")
    # Read-only output box: the raw decoded model string (with box tokens).
    output = gr.Textbox(interactive=False)
    btn.click(run, inputs=[image, text], outputs=output)
demo.queue().launch()