import gradio as gr
import spaces

import torch
# No-op torch.jit.script so that model code decorated with @torch.jit.script is
# not compiled (a workaround for scripting failures when loading the model).
torch.jit.script = lambda f: f

from t2v_metrics import VQAScore, list_all_vqascore_models

print(list_all_vqascore_models())

# Initialize the scoring model only once, at startup, on CPU; @spaces.GPU below
# requests a GPU for the scoring function when running on Hugging Face Spaces.
model_pipe = VQAScore(model="clip-flant5-xl", device="cpu")  # our recommended scoring model
print("Model initialized!")

@spaces.GPU
def generate(model_name, image, text):
    """Score how well the image matches the text prompt with VQAScore."""
    print("Image:", image)
    print("Text:", text)
    print("Generating!")
    # Note: the model selected in the dropdown is not used here; the
    # clip-flant5-xl pipe loaded at startup serves every request.
    # model_pipe returns a (num_images x num_texts) score tensor; with a single
    # image/text pair the score is its only entry.
    result = model_pipe(images=[image], texts=[text])
    return float(result[0][0])

iface = gr.Interface(
    fn=generate,  # function called on each request
    inputs=[
        gr.Dropdown(["clip-flant5-xl", "clip-flant5-xxl"], label="Model Name"),
        gr.Image(type="filepath"),
        gr.Textbox(label="Prompt"),
    ],
    outputs="number",  # the VQAScore for the image/prompt pair
    title="VQAScore",  # title of the app
    description="VQAScore measures the alignment between an image and a text prompt."
).launch()