Shivdutta committed on
Commit fa8b743 · verified · 1 Parent(s): 3de9321

Update app.py

Files changed (1):
  1. app.py +18 -2
app.py CHANGED
@@ -9,6 +9,10 @@ import whisperx
 import os
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name = "microsoft/phi-2"
+# Tokenizers and Processors: The tokenizer tokenizes text, and the processor handles preprocessing for images.
+# Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
+# Device: It selects CUDA if a GPU is available, otherwise, it uses the CPU.
+# IMAGE_TOKEN_ID: Token ID reserved for images.
 tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(clip_model_name)
 tokenizer.pad_token = tokenizer.eos_token
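
The new comments reference clip_embed, phi_embed, device, and IMAGE_TOKEN_ID, which are defined elsewhere in app.py (the next hunk header shows phi_embed = 2560). A minimal sketch of that setup, consistent with the comments; the IMAGE_TOKEN_ID value below is a placeholder, not necessarily the one the app uses:

import torch

clip_embed = 768   # CLIP ViT-B/32 hidden size, per the comment above
phi_embed = 2560   # Phi-2 hidden size, per the comment and the next hunk header
device = "cuda" if torch.cuda.is_available() else "cpu"  # CUDA if available, else CPU
IMAGE_TOKEN_ID = 50293  # placeholder value only; a token ID reserved for images
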
@@ -19,6 +23,8 @@ phi_embed = 2560
 compute_type = "float32"
 audio_batch_size = 16
 
+# This defines a simple residual block that uses a layer normalization (LayerNorm) followed by two linear layers with a GELU activation function in between.
+# The block is used to add learned transformations to the embeddings, which helps in stabilizing learning and improving generalization.
 class SimpleResBlock(nn.Module):
     def __init__(self, phi_embed):
         super().__init__()
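
The hunk truncates SimpleResBlock after __init__. From the comment (LayerNorm, then two linear layers with a GELU between them) and the return x + self.proj(x) line in the next hunk, a consistent reconstruction looks like the sketch below; only proj is confirmed by the diff, the other attribute name is a guess:

import torch.nn as nn

class SimpleResBlock(nn.Module):
    def __init__(self, phi_embed):
        super().__init__()
        # Normalize, then transform: Linear -> GELU -> Linear.
        self.pre_norm = nn.LayerNorm(phi_embed)
        self.proj = nn.Sequential(
            nn.Linear(phi_embed, phi_embed),
            nn.GELU(),
            nn.Linear(phi_embed, phi_embed),
        )

    def forward(self, x):
        # Residual connection around the projection, as shown in the next hunk.
        x = self.pre_norm(x)
        return x + self.proj(x)
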
@@ -33,19 +39,29 @@ class SimpleResBlock(nn.Module):
         return x + self.proj(x)
 
 # models
+# CLIP Vision Model: Pretrained on visual tasks, outputs image embeddings.
+# Projection Layer: Projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding sizes for downstream tasks.
+# Residual Block: Uses the custom SimpleResBlock to process the embeddings further.
+# Phi-2 Model: The language model handles text generation tasks.
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 resblock = SimpleResBlock(phi_embed).to(device)
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
-# Assuming you have defined 'device' and 'compute_type' elsewhere
 audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25})
 
 # load weights
+# LoRA Weights: The LoRA-adapted model merges with the Phi-2 model for fine-tuning.
+# Loading Finetuned Layers: The pre-trained weights for the projection layer and residual block are loaded for further use.
 model_to_merge = PeftModel.from_pretrained(phi_model,os.path.join(os.getcwd(), 'model_chkpt/lora_adaptor'))
 merged_model = model_to_merge.merge_and_unload()
 projection.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetunned_projection.pth'),map_location=torch.device(device)))
 resblock.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetuned_resblock.pth'),map_location=torch.device(device)))
 
+
+# Image Handling: Extracts image embeddings, passes through CLIP and a projection layer.
+# Audio Handling: Transcribes audio with WhisperX, tokenizes it, and embeds the tokens.
+# Text Handling: Tokenizes the text query and embeds it.
+# Generating Response: The model generates tokens sequentially, combining inputs from images, audio, and text, and predicting the next token until it generates a full response.
 def model_generate_ans(img=None,img_audio=None,val_q=None):
 
     max_generate_length = 100
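
The four comments added above model_generate_ans summarize a pipeline whose body this hunk cuts off at max_generate_length = 100. A condensed sketch of that flow, reusing the module-level objects loaded above (processor, clip_model, projection, resblock, audio_model, tokenizer, merged_model, device, audio_batch_size); the greedy decoding loop and the exact shapes are assumptions, not the app's verbatim code:

import torch
import whisperx

def generate_answer_sketch(img=None, img_audio=None, val_q=None):
    max_generate_length = 100
    parts = []

    # Image handling: CLIP embeddings -> projection -> residual block.
    if img is not None:
        pixels = processor(images=img, return_tensors="pt").pixel_values.to(device)
        with torch.no_grad():
            img_feat = clip_model(pixels).last_hidden_state   # (1, 50, 768) for ViT-B/32
        parts.append(resblock(projection(img_feat)))          # (1, 50, 2560)

    # Audio handling: WhisperX transcription -> tokens -> embeddings.
    if img_audio is not None:
        audio = whisperx.load_audio(img_audio)
        result = audio_model.transcribe(audio, batch_size=audio_batch_size)
        text = " ".join(seg["text"] for seg in result["segments"])
        ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
        parts.append(merged_model.get_input_embeddings()(ids))

    # Text handling: tokenize and embed the question.
    if val_q is not None:
        ids = tokenizer(val_q, return_tensors="pt").input_ids.to(device)
        parts.append(merged_model.get_input_embeddings()(ids))

    # Response generation: greedy decoding, one token at a time.
    inputs_embeds = torch.cat(parts, dim=1)
    generated = []
    for _ in range(max_generate_length):
        logits = merged_model(inputs_embeds=inputs_embeds).logits
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # (1, 1)
        if next_id.item() == tokenizer.eos_token_id:
            break
        generated.append(next_id.item())
        next_embed = merged_model.get_input_embeddings()(next_id)
        inputs_embeds = torch.cat([inputs_embeds, next_embed], dim=1)
    return tokenizer.decode(generated)
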
@@ -110,7 +126,7 @@ with gr.Blocks() as demo:
 /* General Layout */
 body {
     font-family: 'Arial', sans-serif;
-    background-color: #ffe4e1; /* Soft pastel pink */
+    background-color: #ffe4e1;
     margin: 0;
     padding: 0;
 }
 
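
For orientation, the final hunk's header shows the UI is built with gr.Blocks() as demo:. A hypothetical wiring of model_generate_ans into such an interface (the component names and layout here are invented, not taken from app.py):

import gradio as gr

with gr.Blocks() as demo:
    img_in = gr.Image(type="pil", label="Image")
    audio_in = gr.Audio(type="filepath", label="Audio question")
    text_in = gr.Textbox(label="Text question")
    answer = gr.Textbox(label="Answer")
    # model_generate_ans(img, img_audio, val_q) matches this input order.
    gr.Button("Ask").click(
        model_generate_ans,
        inputs=[img_in, audio_in, text_in],
        outputs=answer,
    )

demo.launch()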