Spaces:

zhiqiulin
/

VQAScore

Running on Zero

File size: 1,513 Bytes

171793b
 
 
b396086
 
828b75a
b396086
171793b
02caab5
2572bcd
58ed5ef
171793b
 
99fb211
39fbf51
 
 
 
 
2b389ac
71bd0a4
 
2572bcd
42e3288
58ed5ef
8b7f407
063aeac
58ed5ef
 
 
171793b
 
 
99fb211
 
171793b

import gradio as gr
import spaces

import torch
def _skip_torchscript(f):
    """Return ``f`` unchanged; stands in for ``torch.jit.script``."""
    return f


# Replace TorchScript compilation with a no-op so anything that later calls
# torch.jit.script(...) simply gets its argument back.
torch.jit.script = _skip_torchscript
# torch.autocast = lambda device_type, dtype: torch.autocast(device_type, torch.float)

# Initialize the model only once
# if torch.cuda.is_available(): 
# model_pipe = VQAScore(model="clip-flant5-xl", device="cpu")  # our recommended scoring model
# print("Model initialized!")

@spaces.GPU
def generate(model_name, image, text):
    """Score a single image/text pair with a VQAScore model.

    Args:
        model_name: Name of the VQAScore model to load, as selected in the UI
            dropdown (e.g. "clip-flant5-xl"). When empty or None, the
            previously hard-coded default "clip-flant5-xl" is used.
        image: The image to score; forwarded to the model unchanged inside a
            one-element list.
        text: The prompt to score the image against; forwarded unchanged
            inside a one-element list.

    Returns:
        Whatever the VQAScore pipeline returns for ``images=[image]`` and
        ``texts=[text]``. NOTE(review): presumably a 1x1 score tensor --
        confirm against t2v_metrics.
    """
    # Imported inside the function, as in the original code.
    from t2v_metrics import VQAScore, list_all_vqascore_models

    print(list_all_vqascore_models())

    # Honour the caller's model choice instead of silently ignoring it; keep
    # the old behaviour as the fallback when no model was selected.
    selected_model = model_name or "clip-flant5-xl"

    print("Model_name:", selected_model)
    print("Image:", image)
    print("Text:", text)

    # The model is built on every call and then moved to the GPU.
    model_pipe = VQAScore(model=selected_model)
    model_pipe.to("cuda")

    print("Generating!")
    result = model_pipe(images=[image], texts=[text])
    return result

# Input widgets, in the positional order of generate's parameters:
# model name, image (handed over as a file path), prompt text.
_input_widgets = [
    gr.Dropdown(["clip-flant5-xl", "clip-flant5-xxl"], label="Model Name"),
    gr.Image(type="filepath"),
    gr.Textbox(label="Prompt"),
]

# Build the single-function UI and launch it immediately. Note that `iface`
# is bound to the return value of launch(), not to the Interface object.
iface = gr.Interface(
    fn=generate,
    inputs=_input_widgets,
    outputs="number",
    title="VQAScore",
    description="This model evaluates the similarity between an image and a text prompt.",
).launch()