Shivdutta committed on
Commit fa8b743 · verified · 1 Parent(s): 3de9321

Update app.py

Files changed (1):
  1. app.py +18 -2
app.py CHANGED
@@ -9,6 +9,10 @@ import whisperx
 import os
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name = "microsoft/phi-2"
+# Tokenizers and Processors: The tokenizer tokenizes text, and the processor handles preprocessing for images.
+# Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
+# Device: It selects CUDA if a GPU is available, otherwise, it uses the CPU.
+# IMAGE_TOKEN_ID: Token ID reserved for images.
 tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(clip_model_name)
 tokenizer.pad_token = tokenizer.eos_token
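
The new comments reference clip_embed, phi_embed, device, and IMAGE_TOKEN_ID, which are defined elsewhere in app.py (the next hunk header shows phi_embed = 2560). A minimal sketch of that setup, consistent with the comments; the IMAGE_TOKEN_ID value below is a placeholder, not necessarily the one the app uses:

import torch

clip_embed = 768   # CLIP ViT-B/32 hidden size, per the comment above
phi_embed = 2560   # Phi-2 hidden size, per the comment and the next hunk header
device = "cuda" if torch.cuda.is_available() else "cpu"  # CUDA if available, else CPU
IMAGE_TOKEN_ID = 50293  # placeholder value only; a token ID reserved for images
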
@@ -19,6 +23,8 @@ phi_embed = 2560
 compute_type = "float32"
 audio_batch_size = 16
 
+# This defines a simple residual block that uses a layer normalization (LayerNorm) followed by two linear layers with a GELU activation function in between.
+# The block is used to add learned transformations to the embeddings, which helps in stabilizing learning and improving generalization.
 class SimpleResBlock(nn.Module):
     def __init__(self, phi_embed):
         super().__init__()
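
The hunk truncates SimpleResBlock after __init__. From the comment (LayerNorm, then two linear layers with a GELU between them) and the return x + self.proj(x) line in the next hunk, a consistent reconstruction looks like the sketch below; only proj is confirmed by the diff, the other attribute name is a guess:

import torch.nn as nn

class SimpleResBlock(nn.Module):
    def __init__(self, phi_embed):
        super().__init__()
        # Normalize, then transform: Linear -> GELU -> Linear.
        self.pre_norm = nn.LayerNorm(phi_embed)
        self.proj = nn.Sequential(
            nn.Linear(phi_embed, phi_embed),
            nn.GELU(),
            nn.Linear(phi_embed, phi_embed),
        )

    def forward(self, x):
        # Residual connection around the projection, as shown in the next hunk.
        x = self.pre_norm(x)
        return x + self.proj(x)
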
@@ -33,19 +39,29 @@ class SimpleResBlock(nn.Module):
         return x + self.proj(x)
 
 # models
+# CLIP Vision Model: Pretrained on visual tasks, outputs image embeddings.
+# Projection Layer: Projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding sizes for downstream tasks.
+# Residual Block: Uses the custom SimpleResBlock to process the embeddings further.
+# Phi-2 Model: The language model handles text generation tasks.
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 resblock = SimpleResBlock(phi_embed).to(device)
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
-# Assuming you have defined 'device' and 'compute_type' elsewhere
 audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25})
 
 # load weights
+# LoRA Weights: The LoRA-adapted model merges with the Phi-2 model for fine-tuning.
+# Loading Finetuned Layers: The pre-trained weights for the projection layer and residual block are loaded for further use.
 model_to_merge = PeftModel.from_pretrained(phi_model,os.path.join(os.getcwd(), 'model_chkpt/lora_adaptor'))
 merged_model = model_to_merge.merge_and_unload()
 projection.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetunned_projection.pth'),map_location=torch.device(device)))
 resblock.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetuned_resblock.pth'),map_location=torch.device(device)))
 
+
+# Image Handling: Extracts image embeddings, passes through CLIP and a projection layer.
+# Audio Handling: Transcribes audio with WhisperX, tokenizes it, and embeds the tokens.
+# Text Handling: Tokenizes the text query and embeds it.
+# Generating Response: The model generates tokens sequentially, combining inputs from images, audio, and text, and predicting the next token until it generates a full response.
 def model_generate_ans(img=None,img_audio=None,val_q=None):
 
     max_generate_length = 100
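
The four comments added above model_generate_ans summarize a pipeline whose body this hunk cuts off at max_generate_length = 100. A condensed sketch of that flow, reusing the module-level objects loaded above (processor, clip_model, projection, resblock, audio_model, tokenizer, merged_model, device, audio_batch_size); the greedy decoding loop and the exact shapes are assumptions, not the app's verbatim code:

import torch
import whisperx

def generate_answer_sketch(img=None, img_audio=None, val_q=None):
    max_generate_length = 100
    parts = []

    # Image handling: CLIP embeddings -> projection -> residual block.
    if img is not None:
        pixels = processor(images=img, return_tensors="pt").pixel_values.to(device)
        with torch.no_grad():
            img_feat = clip_model(pixels).last_hidden_state   # (1, 50, 768) for ViT-B/32
        parts.append(resblock(projection(img_feat)))          # (1, 50, 2560)

    # Audio handling: WhisperX transcription -> tokens -> embeddings.
    if img_audio is not None:
        audio = whisperx.load_audio(img_audio)
        result = audio_model.transcribe(audio, batch_size=audio_batch_size)
        text = " ".join(seg["text"] for seg in result["segments"])
        ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
        parts.append(merged_model.get_input_embeddings()(ids))

    # Text handling: tokenize and embed the question.
    if val_q is not None:
        ids = tokenizer(val_q, return_tensors="pt").input_ids.to(device)
        parts.append(merged_model.get_input_embeddings()(ids))

    # Response generation: greedy decoding, one token at a time.
    inputs_embeds = torch.cat(parts, dim=1)
    generated = []
    for _ in range(max_generate_length):
        logits = merged_model(inputs_embeds=inputs_embeds).logits
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # (1, 1)
        if next_id.item() == tokenizer.eos_token_id:
            break
        generated.append(next_id.item())
        next_embed = merged_model.get_input_embeddings()(next_id)
        inputs_embeds = torch.cat([inputs_embeds, next_embed], dim=1)
    return tokenizer.decode(generated)
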
@@ -110,7 +126,7 @@ with gr.Blocks() as demo:
 /* General Layout */
 body {
     font-family: 'Arial', sans-serif;
-    background-color: #ffe4e1; /* Soft pastel pink */
+    background-color: #ffe4e1;
     margin: 0;
     padding: 0;
 }
 
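
For orientation, the final hunk's header shows the UI is built with gr.Blocks() as demo:. A hypothetical wiring of model_generate_ans into such an interface (the component names and layout here are invented, not taken from app.py):

import gradio as gr

with gr.Blocks() as demo:
    img_in = gr.Image(type="pil", label="Image")
    audio_in = gr.Audio(type="filepath", label="Audio question")
    text_in = gr.Textbox(label="Text question")
    answer = gr.Textbox(label="Answer")
    # model_generate_ans(img, img_audio, val_q) matches this input order.
    gr.Button("Ask").click(
        model_generate_ans,
        inputs=[img_in, audio_in, text_in],
        outputs=answer,
    )

demo.launch()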