Build

Paused

File size: 2,008 Bytes

7ec133b
 
 
 
 
cb78e99
7ec133b
 
a4115fd
 
 
 
7ec133b
 
702cb53
89e1517
 
a58a8ea
7ec133b
cb78e99
89e1517
7ec133b
89e1517
 
 
702cb53
89e1517
 
7ec133b
89e1517
7ec133b
a4001c8
702cb53
7ec133b
 
 
 
 
702cb53
 
6ca8deb
7ec133b
9da530e
1c6e643
7ec133b

import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Make CUDA the default device for newly created tensors.
torch.set_default_device("cuda")

# Load the Sparrow checkpoint and its tokenizer from the same Hub repository.
# Both loaders are called with trust_remote_code=True.
_SPARROW_REPO = "ManishThota/Sparrow"
model = AutoModelForCausalLM.from_pretrained(
    _SPARROW_REPO,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(_SPARROW_REPO, trust_remote_code=True)

def predict_answer(image, question, max_tokens):
    """Answer a question about an image using the module-level model.

    Args:
        image: PIL image to ask about; it is converted to RGB before being
            handed to the model's image preprocessor.
        question: Question text inserted into the chat prompt. A "?" is
            always appended after it.
        max_tokens: Upper bound on the number of newly generated tokens.
            Coerced to ``int`` because UI sliders may deliver a float.

    Returns:
        The text decoded from the tokens generated after the prompt, with
        special tokens skipped and surrounding whitespace stripped.
    """
    # Set inputs
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    image = image.convert("RGB")

    # Move the token ids to the GPU but keep their integer dtype: casting
    # them to float16 rounds ids above 2048 and embedding lookups need
    # integer indices.
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to("cuda:0")
    image_tensor = model.image_preprocess(image)

    # Generate the answer
    output_ids = model.generate(
        input_ids,
        max_new_tokens=int(max_tokens),
        images=image_tensor,
        use_cache=True)[0]

    # Drop the prompt tokens so only the newly generated answer is decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

def gradio_predict(image, question, max_tokens):
    """Gradio callback that validates the upload and runs ``predict_answer``.

    Args:
        image: PIL image from the image input, or ``None`` when the user
            submitted without uploading one.
        question: Question text from the textbox.
        max_tokens: Token budget from the slider.

    Returns:
        The answer string returned by ``predict_answer``.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable
            message instead of an AttributeError from ``image.convert``.
    """
    if image is None:
        raise gr.Error("Please upload an image before asking a question.")
    return predict_answer(image, question, max_tokens)

# Define the Gradio interface: an image, a free-text question and a token
# budget go in; the model's answer comes out as text.
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"), 
            gr.Textbox(label="Question", placeholder="e.g. What are the colors of the bus in the image?", scale=4),
            # step=1 keeps the token budget a whole number; without an
            # explicit step Gradio derives one from the range, which may be
            # fractional.
            gr.Slider(2, 100, value=25, step=1, label="Count", info="Choose between 2 and 100")],
    outputs=gr.TextArea(label="Answer"),
    title="Sparrow - Tiny 3B | Visual Question Answering",
    description="An interactive chat model that can answer questions about images in Academic contest.",
)

# Launch the app with request queuing enabled.
iface.queue().launch(debug=True)