Shivdutta committed
Commit 7ac8e01 · verified · 1 Parent(s): 5dfce3a

Update app.py

Files changed (1): app.py (+12, -5)
app.py CHANGED
@@ -7,8 +7,6 @@ from peft import PeftModel
 import torch.nn as nn
 import whisperx
 import os
-
-
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name = "microsoft/phi-2"
 tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
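
Note: the next hunk adds compute_type = "float32" and audio_batch_size = 16, but their call sites are outside these hunks. A minimal sketch of the usual whisperx transcription path that would consume them — the model size ("small") and variable names here are assumptions, not part of this diff:

    import whisperx

    # Assumed usage of the new config values; the actual call site is not shown.
    audio_model = whisperx.load_model("small", device, compute_type=compute_type)
    audio = whisperx.load_audio(img_audio)  # img_audio is the Gradio filepath input
    result = audio_model.transcribe(audio, batch_size=audio_batch_size)
    val_q = " ".join(seg["text"] for seg in result["segments"]).strip()
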
@@ -18,10 +16,18 @@ IMAGE_TOKEN_ID = 23893 # token for word comment
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_embed = 768
 phi_embed = 2560
+compute_type = "float32"
+audio_batch_size = 16
+
+class SimpleResBlock(nn.Module):
+    def __init__(self, phi_embed):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(phi_embed)
+        self.proj = nn.Sequential(
+            nn.Linear(phi_embed, phi_embed),
             nn.GELU(),
             nn.Linear(phi_embed, phi_embed)
         )
-
     def forward(self, x):
         x = self.pre_norm(x)
         return x + self.proj(x)
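
Note: the new SimpleResBlock is a pre-norm residual MLP over the Phi-2 embedding width, applied after the CLIP-to-Phi projection. A quick shape check — the projection layer that feeds it is not shown in this diff, so a plain Linear from clip_embed to phi_embed is assumed here:

    import torch
    import torch.nn as nn

    projection = nn.Linear(clip_embed, phi_embed)  # assumed; defined elsewhere in app.py
    resblock = SimpleResBlock(phi_embed)

    dummy_clip = torch.randn(1, 49, clip_embed)    # dummy patch embeddings from CLIP ViT-B/32
    out = resblock(projection(dummy_clip))
    print(out.shape)                               # torch.Size([1, 49, 2560])
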
@@ -54,6 +60,9 @@ def model_generate_ans(img=None,img_audio=None,val_q=None):
         val_image_embeds = projection(clip_val_outputs)
         val_image_embeds = resblock(val_image_embeds).to(torch.float16)
 
+        img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)
+        img_token_embeds = merged_model.model.embed_tokens(img_token_tensor).unsqueeze(0).unsqueeze(0)
+
         val_combined_embeds.append(val_image_embeds)
         val_combined_embeds.append(img_token_embeds)
 
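
Note: the two added lines look up the embedding for IMAGE_TOKEN_ID (23893, the token for the word "comment" per the header comment) and reshape it so it can be appended alongside the batched image embeddings. A shape walk-through, assuming Phi-2's embed_tokens and a merged_model defined elsewhere in app.py:

    img_token_tensor = torch.tensor(IMAGE_TOKEN_ID).to(device)            # scalar tensor, shape ()
    img_token_embeds = merged_model.model.embed_tokens(img_token_tensor)  # (phi_embed,) == (2560,)
    img_token_embeds = img_token_embeds.unsqueeze(0).unsqueeze(0)         # (1, 1, 2560): (batch, seq, embed)

    # The list is later concatenated along the sequence axis, presumably:
    # val_combined_embeds = torch.cat(val_combined_embeds, dim=1)
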
@@ -92,7 +101,6 @@ def model_generate_ans(img=None,img_audio=None,val_q=None):
     return predicted_captions_decoded
 
 
-
 with gr.Blocks() as demo:
 
     gr.Markdown(
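
Note: these hunks only show the function's return value; how predicted_captions_decoded is produced is outside the diff. One common pattern for decoder-only models fed with inputs_embeds, shown purely as an assumed sketch:

    out_ids = merged_model.generate(
        inputs_embeds=val_combined_embeds,   # (1, seq_len, 2560) float16 embeddings
        max_new_tokens=100,                  # assumed budget; not visible in this diff
        pad_token_id=tokenizer.eos_token_id,
    )
    predicted_captions_decoded = tokenizer.batch_decode(out_ids, skip_special_tokens=True)[0]
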
@@ -108,7 +116,6 @@ with gr.Blocks() as demo:
             img_input = gr.Image(label='Image',type="pil")
             img_audio = gr.Audio(label="Audio Query", sources=['microphone', 'upload'], type='filepath')
             img_question = gr.Text(label ='Text Query')
-
         with gr.Column():
             img_answer = gr.Text(label ='Answer')
 
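
Note: the final hunk only tightens whitespace in the UI. For reference, the inputs and output above are typically wired to model_generate_ans roughly as below; the gr.Row wrapper and the Submit button are assumptions, since they are not visible in these hunks:

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                img_input = gr.Image(label='Image', type="pil")
                img_audio = gr.Audio(label="Audio Query", sources=['microphone', 'upload'], type='filepath')
                img_question = gr.Text(label='Text Query')
            with gr.Column():
                img_answer = gr.Text(label='Answer')
        submit = gr.Button("Submit")  # assumed control name
        submit.click(model_generate_ans, inputs=[img_input, img_audio, img_question], outputs=[img_answer])

    demo.launch()
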