File size: 1,513 Bytes
171793b
 
 
b396086
 
828b75a
b396086
171793b
02caab5
2572bcd
58ed5ef
171793b
 
99fb211
39fbf51
 
 
 
 
2b389ac
71bd0a4
 
2572bcd
42e3288
58ed5ef
8b7f407
063aeac
58ed5ef
 
 
171793b
 
 
99fb211
 
171793b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import gradio as gr
import spaces

import torch
def _skip_torchscript(f):
    """Return ``f`` unchanged; used as a stand-in for ``torch.jit.script``."""
    return f


# Replace TorchScript compilation with a pass-through, so anything handed to
# ``torch.jit.script`` after this point is returned as ordinary Python.
# NOTE(review): presumably a workaround for scripting failures in a dependency
# loaded later -- confirm before removing.
torch.jit.script = _skip_torchscript
# torch.autocast = lambda device_type, dtype: torch.autocast(device_type, torch.float)

# Initialize the model only once
# if torch.cuda.is_available(): 
# model_pipe = VQAScore(model="clip-flant5-xl", device="cpu")  # our recommended scoring model
# print("Model initialized!")

@spaces.GPU
def generate(model_name, image, text):
    """Score how well ``text`` matches ``image`` using a VQAScore model.

    Args:
        model_name: Name of the VQAScore model to load, as selected in the UI
            dropdown (e.g. ``"clip-flant5-xl"``). A falsy value (nothing
            selected) falls back to ``"clip-flant5-xl"``.
        image: The image to score. The Gradio interface in this file passes
            it as a file path.
        text: The prompt to compare against the image.

    Returns:
        The score for the single (image, text) pair as a Python ``float``,
        suitable for the Gradio ``"number"`` output.
    """
    # Imported inside the function so the package is loaded only when a
    # request actually runs.
    from t2v_metrics import VQAScore, list_all_vqascore_models

    print(list_all_vqascore_models())

    # Honor the dropdown selection; previously the argument was ignored and
    # "clip-flant5-xl" was always loaded.
    selected_model = model_name or "clip-flant5-xl"

    print("Model_name:", selected_model)
    print("Image:", image)
    print("Text:", text)

    model_pipe = VQAScore(model=selected_model)
    model_pipe.to("cuda")

    print("Generating!")
    result = model_pipe(images=[image], texts=[text])

    # One image and one text are passed, so a single score is expected.
    # NOTE(review): assumes the model returns a one-element tensor/array --
    # float() raises if it holds more than one value.
    return float(result)

# Widgets are listed in the positional order of `generate`'s parameters:
# model_name, image, text.
_input_widgets = [
    gr.Dropdown(["clip-flant5-xl", "clip-flant5-xxl"], label="Model Name"),
    gr.Image(type="filepath"),
    gr.Textbox(label="Prompt"),
]

# Build the single-function web UI and start serving it immediately.
# NOTE: `iface` ends up bound to whatever `launch()` returns, not to the
# `gr.Interface` object itself.
iface = gr.Interface(
    fn=generate,
    inputs=_input_widgets,
    outputs="number",
    title="VQAScore",
    description="This model evaluates the similarity between an image and a text prompt.",
).launch()