Rajagopal committed on
Commit
9a9792d
·
1 Parent(s): cd10cf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -59,12 +59,12 @@ def audio_text_zeroshot(audio, text_list):
59
  return score_dict
60
 
61
 
62
- def video_text_zeroshot(video, text_list):
63
- video_paths = [video]
64
  labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
65
  inputs = {
66
  ModalityType.TEXT: data.load_and_transform_text(labels, device),
67
- ModalityType.VISION: data.load_and_transform_video_data(video_paths, device),
68
  }
69
 
70
  with torch.no_grad():
@@ -82,20 +82,21 @@ def video_text_zeroshot(video, text_list):
82
 
83
  return score_dict
84
 
 
85
 
86
  def inference(
87
  task,
88
  text_list=None,
89
  image=None,
90
  audio=None,
91
- video=None,
92
  ):
93
  if task == "image-text":
94
  result = image_text_zeroshot(image, text_list)
95
  elif task == "audio-text":
96
  result = audio_text_zeroshot(audio, text_list)
97
  elif task == "video-text":
98
- result = video_text_zeroshot(video, text_list)
99
  else:
100
  raise NotImplementedError
101
  return result
@@ -116,7 +117,7 @@ def main():
116
  gr.inputs.Textbox(lines=1, label="Candidate texts"),
117
  gr.inputs.Image(type="filepath", label="Input image"),
118
  gr.inputs.Audio(type="filepath", label="Input audio"),
119
- gr.inputs.Video(type=None, label="Input video"),
120
  ]
121
 
122
  iface = gr.Interface(
 
59
  return score_dict
60
 
61
 
62
+ def video_text_zeroshot(image, text_list):
63
+ image_paths = [image]
64
  labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
65
  inputs = {
66
  ModalityType.TEXT: data.load_and_transform_text(labels, device),
67
+ ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
68
  }
69
 
70
  with torch.no_grad():
 
82
 
83
  return score_dict
84
 
85
+
86
 
87
  def inference(
88
  task,
89
  text_list=None,
90
  image=None,
91
  audio=None,
92
+ image2=None,
93
  ):
94
  if task == "image-text":
95
  result = image_text_zeroshot(image, text_list)
96
  elif task == "audio-text":
97
  result = audio_text_zeroshot(audio, text_list)
98
  elif task == "video-text":
99
+ result = video_text_zeroshot(image2, text_list)
100
  else:
101
  raise NotImplementedError
102
  return result
 
117
  gr.inputs.Textbox(lines=1, label="Candidate texts"),
118
  gr.inputs.Image(type="filepath", label="Input image"),
119
  gr.inputs.Audio(type="filepath", label="Input audio"),
120
+ gr.inputs.Image(type="filepath", label="Input image"),
121
  ]
122
 
123
  iface = gr.Interface(