# Hugging Face Space demo for rhymes-ai/Aria, running on ZeroGPU ("Running on Zero").
import os
import subprocess

# Install flash-attn at startup: ZeroGPU Spaces have no GPU at build time,
# so the CUDA build step is skipped and a prebuilt wheel is used instead.
# os.environ is preserved so pip stays on PATH.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import requests
import torch
import gradio as gr
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
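import spaces  # ZeroGPU helper package; provides the @spaces.GPU decorator used below
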
model_id_or_path = "rhymes-ai/Aria"

# Load the model and processor once at startup.
model = AutoModelForCausalLM.from_pretrained(
    model_id_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)
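
# ZeroGPU attaches a GPU only while a @spaces.GPU-decorated function is
# running, so all inference happens inside one.
@spaces.GPU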
def run():
    # Fetch a sample image (a cat picture from the Hugging Face docs).
    image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
    image = Image.open(requests.get(image_path, stream=True).raw)

    # One user turn: an image slot followed by a text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"text": None, "type": "image"},
                {"text": "what is the image?", "type": "text"},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=text, images=image, return_tensors="pt")
    # Match the model's bfloat16 dtype and move all tensors to its device.
    inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        output = model.generate(
            **inputs,
            max_new_tokens=500,
            stop_strings=["<|im_end|>"],
            tokenizer=processor.tokenizer,
            do_sample=True,
            temperature=0.9,
        )
    # Decode only the newly generated tokens, skipping the prompt.
    output_ids = output[0][inputs["input_ids"].shape[1]:]
    result = processor.decode(output_ids, skip_special_tokens=True)
    # Return the decoded text so Gradio can display it.
    return result

# Minimal UI: a button that triggers generation and a Markdown output.
with gr.Blocks() as demo:
    btn = gr.Button("Run")
    out = gr.Markdown()
    btn.click(run, outputs=out)

demo.queue().launch()
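
# Assumed Space setup, not shown in the original: this file is app.py, with
# torch, transformers, accelerate, pillow and requests declared in
# requirements.txt; gradio and the spaces package are provided by the SDK.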