import gradio as gr
import spaces
import torch

# Replace torch.jit.script and torch.autocast with no-op pass-throughs so
# model code that uses them runs unmodified in this environment.
torch.jit.script = lambda f: f
torch.autocast = lambda device_type, dtype: lambda f: f

from t2v_metrics import VQAScore, list_all_vqascore_models

print(list_all_vqascore_models())


@spaces.GPU
def generate(model_name, image, text):
    """Score how well `image` matches `text` with the selected VQAScore model."""
    print("Model name:", model_name)
    print("Image:", image)
    print("Text:", text)
    # The model is instantiated inside the @spaces.GPU-decorated function,
    # since Spaces ZeroGPU only grants CUDA access while such a function runs.
    model_pipe = VQAScore(model=model_name)
    print("Generating!")
    scores = model_pipe(images=[image], texts=[text])  # tensor of shape (1, 1)
    return scores[0][0].item()


iface = gr.Interface(
    fn=generate,
    inputs=[
        # "clip-flant5-xl" is the recommended scoring model
        gr.Dropdown(["clip-flant5-xl", "clip-flant5-xxl"], label="Model Name"),
        gr.Image(type="filepath"),
        gr.Textbox(label="Prompt"),
    ],
    outputs="number",
    title="VQAScore",
    description="This app evaluates the similarity between an image and a text prompt.",
).launch()
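
# --- Example client usage (a minimal sketch, kept commented out) ---
# Assumes the app above is running locally on Gradio's default port and that
# "example.jpg" exists; both are placeholders, not part of the app itself.
#
# from gradio_client import Client, handle_file
#
# client = Client("http://127.0.0.1:7860/")
# score = client.predict(
#     "clip-flant5-xl",            # model_name dropdown value
#     handle_file("example.jpg"),  # image filepath
#     "a photo of a dog",          # text prompt
#     api_name="/predict",
# )
# print(score)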