mrfakename committed on
Commit c7d7380 · verified · 1 Parent(s): 9e80389

Update app.py

Files changed (1)
  1. app.py +42 -47
app.py CHANGED
@@ -1,54 +1,49 @@
- import subprocess
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

- import requests
- import torch
- import gradio as gr
- from PIL import Image
- from transformers import AutoModelForCausalLM, AutoProcessor

- model_id_or_path = "rhymes-ai/Aria"

- model = AutoModelForCausalLM.from_pretrained(model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)

- processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)

- @spaces.GPU
- def run():
-
-     image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
-
-     image = Image.open(requests.get(image_path, stream=True).raw)
-
-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {"text": None, "type": "image"},
-                 {"text": "what is the image?", "type": "text"},
-             ],
-         }
-     ]
-
-     text = processor.apply_chat_template(messages, add_generation_prompt=True)
-     inputs = processor(text=text, images=image, return_tensors="pt")
-     inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
-     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-     with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
-         output = model.generate(
-             **inputs,
-             max_new_tokens=500,
-             stop_strings=["<|im_end|>"],
-             tokenizer=processor.tokenizer,
-             do_sample=True,
-             temperature=0.9,
-         )
-     output_ids = output[0][inputs["input_ids"].shape[1]:]
-     result = processor.decode(output_ids, skip_special_tokens=True)

- with gr.Blocks() as demo:
-     btn = gr.Button("Run")
-     out = gr.Markdown()
-     btn.click(run, outputs=out)

- demo.queue().launch()
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info

+ # Default: Load the model on the available device(s)
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "OS-Copilot/OS-Atlas-Base-7B", torch_dtype="auto", device_map="auto"
+ )
+ processor = AutoProcessor.from_pretrained("OS-Copilot/OS-Atlas-Base-7B")

+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "image",
+                 "image": "./exmaples/images/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png",
+             },
+             {"type": "text", "text": "In this UI screenshot, what is the position of the element corresponding to the command \"switch language of current page\" (with bbox)?"},
+         ],
+     }
+ ]

+ # Preparation for inference
+ text = processor.apply_chat_template(
+     messages, tokenize=False, add_generation_prompt=True
+ )
+ image_inputs, video_inputs = process_vision_info(messages)
+ inputs = processor(
+     text=[text],
+     images=image_inputs,
+     videos=video_inputs,
+     padding=True,
+     return_tensors="pt",
+ )
+ inputs = inputs.to("cuda")

+ # Inference: Generation of the output
+ generated_ids = model.generate(**inputs, max_new_tokens=128)

+ generated_ids_trimmed = [
+     out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]

+ output_text = processor.batch_decode(
+     generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
+ )
+ print(output_text)
+ # <|object_ref_start|>language switch<|object_ref_end|><|box_start|>(576,12),(592,42)<|box_end|><|im_end|>
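
For reference, a minimal sketch (not part of this commit) of how the grounding string printed above could be parsed into an element label and a pixel bounding box. The special tokens and the (x1,y1),(x2,y2) layout are taken from the example output shown in the diff; the helper name parse_grounding and the regular expressions are assumptions for illustration only.

import re

def parse_grounding(decoded: str):
    # Pull the referenced element label and the two box corners out of an
    # OS-Atlas-style decoded string (special tokens as shown in the example above).
    ref = re.search(r"<\|object_ref_start\|>(.*?)<\|object_ref_end\|>", decoded)
    box = re.search(r"<\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>", decoded)
    label = ref.group(1) if ref else None
    bbox = tuple(int(v) for v in box.groups()) if box else None  # (x1, y1, x2, y2)
    return label, bbox

print(parse_grounding("<|object_ref_start|>language switch<|object_ref_end|><|box_start|>(576,12),(592,42)<|box_end|><|im_end|>"))
# ('language switch', (576, 12, 592, 42))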