import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use the GPU when available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

# Initialize the model and tokenizer.
# trust_remote_code=True is required because the Sparrow repo ships custom modeling code;
# device_map="auto" lets accelerate place the weights, so no extra .to(device) call is needed.
model = AutoModelForCausalLM.from_pretrained(
    "ManishThota/Sparrow",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("ManishThota/Sparrow", trust_remote_code=True)


def predict_answer(image, question, max_tokens):
    # Build the chat-style prompt expected by the model
    text = (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions. "
        f"USER: <image>\n{question} ASSISTANT:"
    )
    image = image.convert("RGB")

    # Tokenize the prompt; token ids must stay integer, so only move them to the device
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
    # Preprocess the image with the model's bundled image processor
    image_tensor = model.image_preprocess(image)

    # Generate the answer
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        images=image_tensor,
        use_cache=True)[0]

    # Decode only the newly generated tokens (skip the prompt)
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
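# Quick local sanity check (hypothetical image path, shown for illustration only):
#   img = Image.open("example.jpg")
#   print(predict_answer(img, "What is in this image?", 25))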

def gradio_predict(image, question, max_tokens):
    answer = predict_answer(image, question, max_tokens)
    return answer

# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"), 
            gr.Textbox(label="Question", placeholder="e.g. What are the colors of the bus in the image?", scale=4),
            gr.Slider(2, 100, value=25, label="Max Tokens", info="Choose between 2 and 100")],
    outputs=gr.TextArea(label="Answer"),
    title="Sparrow - Tiny 3B | Visual Question Answering",
    description="An interactive chat model that can answer questions about images in academic contexts.",
)

# Launch the app
iface.queue().launch(debug=True)
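# Note: passing share=True to launch(), e.g. iface.queue().launch(debug=True, share=True),
# creates a temporary public URL when running outside Hugging Face Spaces.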