# falcon-vision / app.py
# Gradio Space: generate a descriptive paragraph for an image URL
# using the tiiuae/falcon-11B-vlm vision-language model.
# (Last update: commit 01a426b, "Update app.py" by Tonic.)
import gradio as gr
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
from PIL import Image
import requests
import torch
import spaces
# Load the Falcon-11B VLM processor and weights once at import time, so every
# request served by the Space reuses the same resident model.
MODEL_ID = "tiiuae/falcon-11B-vlm"
processor = LlavaNextProcessor.from_pretrained(
    MODEL_ID, tokenizer_class="PreTrainedTokenizerFast"
)
# bfloat16 halves memory versus fp32; the model is pinned to the first GPU.
model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16
).to("cuda:0")
@spaces.GPU
def generate_paragraph(image_url):
    """Download the image at *image_url* and return a generated paragraph.

    Args:
        image_url: HTTP(S) URL of an image to describe.

    Returns:
        The model-generated descriptive paragraph as a stripped string.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server returns an HTTP error status.
        PIL.UnidentifiedImageError: if the response body is not a decodable image.
    """
    # Bound the download so a dead URL cannot hang the Space indefinitely, and
    # surface HTTP errors (404, 403, ...) instead of feeding PIL an error page.
    response = requests.get(image_url, stream=True, timeout=30)
    response.raise_for_status()
    # Normalize to RGB: palette, grayscale, or CMYK inputs would otherwise
    # fail or mis-feed the 3-channel vision tower.
    image = Image.open(response.raw).convert("RGB")

    instruction = 'Write a long paragraph about this picture.'
    # Falcon-VLM chat template: the <image> placeholder is expanded by the processor.
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(prompt, images=image, return_tensors="pt", padding=True).to('cuda:0')
    output = model.generate(**inputs, max_new_tokens=256)
    # Decode the full sequence and drop special tokens / surrounding whitespace.
    return processor.decode(output[0], skip_special_tokens=True).strip()
# Wire the generation function into a simple one-in, one-out Gradio UI
# and start serving it.
demo_title = "Image to Paragraph Generation"
demo_description = (
    "Enter the URL of an image, and the model will generate a descriptive "
    "paragraph about the image."
)
interface = gr.Interface(
    fn=generate_paragraph,
    inputs=gr.Textbox(label="Image URL"),
    outputs=gr.Textbox(label="Generated Paragraph"),
    title=demo_title,
    description=demo_description,
)
interface.launch()