import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Use the GPU when it is available; otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

# Initialize the model and tokenizer; device_map="auto" already places the
# weights, so no extra .to(device) call is needed
model = AutoModelForCausalLM.from_pretrained("ManishThota/Sparrow",
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ManishThota/Sparrow", trust_remote_code=True)
def predict_answer(image, question, max_tokens):
    # Build the prompt in the chat format the model expects
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question} ASSISTANT:"
    # Convert the PIL image to RGB in case it is not already
    image = image.convert("RGB")
    # Token IDs must stay integer tensors, so only move them to the model's device
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(model.device)
    # Preprocess the image for the model
    image_tensor = model.image_preprocess(image)
    # Generate the answer
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        images=image_tensor,
        use_cache=True)[0]
    # Decode only the newly generated tokens
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
def gradio_predict(image, question, max_tokens):
answer = predict_answer(image, question, max_tokens)
return answer
# Define the Gradio interface
iface = gr.Interface(
fn=gradio_predict,
inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
gr.Textbox(label="Question", placeholder="e.g. What are the colors of the bus in the image?", scale=4),
            gr.Slider(2, 100, value=25, label="Max Tokens", info="Choose between 2 and 100")],
outputs=gr.TextArea(label="Answer"),
title="Sparrow - Tiny 3B | Visual Question Answering",
    description="An interactive chat model that can answer questions about images in an academic context.",
)
# Launch the app
iface.queue().launch(debug=True)
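
# A minimal sketch (not part of the app itself) of how the running Space could be
# queried programmatically with the gradio_client package. The Space id below is
# assumed to match the deployed repo; older gradio_client versions accept a plain
# filepath for the image input, newer ones wrap it in handle_file().
#
# from gradio_client import Client
#
# client = Client("ManishThota/Sparrow")
# result = client.predict(
#     "path/to/image.jpg",                              # image input
#     "What are the colors of the bus in the image?",   # question
#     25,                                               # max tokens
#     api_name="/predict",
# )
# print(result)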