Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -59,12 +59,12 @@ def audio_text_zeroshot(audio, text_list):
|
|
59 |
return score_dict
|
60 |
|
61 |
|
62 |
-
def video_text_zeroshot(video, text_list):
|
63 |
-
|
64 |
labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
|
65 |
inputs = {
|
66 |
ModalityType.TEXT: data.load_and_transform_text(labels, device),
|
67 |
-
ModalityType.VISION: data.
|
68 |
}
|
69 |
|
70 |
with torch.no_grad():
|
@@ -82,20 +82,21 @@ def video_text_zeroshot(video, text_list):
|
|
82 |
|
83 |
return score_dict
|
84 |
|
|
|
85 |
|
86 |
def inference(
|
87 |
task,
|
88 |
text_list=None,
|
89 |
image=None,
|
90 |
audio=None,
|
91 |
-
|
92 |
):
|
93 |
if task == "image-text":
|
94 |
result = image_text_zeroshot(image, text_list)
|
95 |
elif task == "audio-text":
|
96 |
result = audio_text_zeroshot(audio, text_list)
|
97 |
elif task == "video-text":
|
98 |
-
result = video_text_zeroshot(video, text_list)
|
99 |
else:
|
100 |
raise NotImplementedError
|
101 |
return result
|
@@ -116,7 +117,7 @@ def main():
|
|
116 |
gr.inputs.Textbox(lines=1, label="Candidate texts"),
|
117 |
gr.inputs.Image(type="filepath", label="Input image"),
|
118 |
gr.inputs.Audio(type="filepath", label="Input audio"),
|
119 |
-
|
120 |
]
|
121 |
|
122 |
iface = gr.Interface(
|
|
|
59 |
return score_dict
|
60 |
|
61 |
|
62 |
+
def video_text_zeroshot(image, text_list):
|
63 |
+
image_paths = [image]
|
64 |
labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
|
65 |
inputs = {
|
66 |
ModalityType.TEXT: data.load_and_transform_text(labels, device),
|
67 |
+
ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
|
68 |
}
|
69 |
|
70 |
with torch.no_grad():
|
|
|
82 |
|
83 |
return score_dict
|
84 |
|
85 |
+
|
86 |
|
87 |
def inference(
|
88 |
task,
|
89 |
text_list=None,
|
90 |
image=None,
|
91 |
audio=None,
|
92 |
+
image2=None,
|
93 |
):
|
94 |
if task == "image-text":
|
95 |
result = image_text_zeroshot(image, text_list)
|
96 |
elif task == "audio-text":
|
97 |
result = audio_text_zeroshot(audio, text_list)
|
98 |
elif task == "video-text":
|
99 |
+
result = video_text_zeroshot(image2, text_list)
|
100 |
else:
|
101 |
raise NotImplementedError
|
102 |
return result
|
|
|
117 |
gr.inputs.Textbox(lines=1, label="Candidate texts"),
|
118 |
gr.inputs.Image(type="filepath", label="Input image"),
|
119 |
gr.inputs.Audio(type="filepath", label="Input audio"),
|
120 |
+
gr.inputs.Image(type="filepath", label="Input image"),
|
121 |
]
|
122 |
|
123 |
iface = gr.Interface(
|