Update app.py
app.py
CHANGED
@@ -9,6 +9,10 @@ import whisperx
 import os
 clip_model_name = "openai/clip-vit-base-patch32"
 phi_model_name = "microsoft/phi-2"
+# Tokenizer and processor: the tokenizer tokenizes text, and the processor handles preprocessing for images.
+# Embedding sizes: clip_embed (768) is for the CLIP model, and phi_embed (2560) is for the Phi-2 model.
+# Device: CUDA is selected if a GPU is available; otherwise the CPU is used.
+# IMAGE_TOKEN_ID: token ID reserved for images.
 tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(clip_model_name)
 tokenizer.pad_token = tokenizer.eos_token
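The new comments describe configuration that sits just outside this hunk. A minimal sketch of those settings, assuming the standard CUDA-else-CPU selection; clip_embed and phi_embed match the values named in the comments (phi_embed = 2560 is also visible in the next hunk header), and the reserved IMAGE_TOKEN_ID is omitted because its value is not shown in this diff.

import torch

clip_embed = 768                                           # CLIP ViT-B/32 hidden size, per the comment above
phi_embed = 2560                                           # Phi-2 hidden size, per the comment and the next hunk header
device = "cuda" if torch.cuda.is_available() else "cpu"    # use the GPU when available, otherwise the CPU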
@@ -19,6 +23,8 @@ phi_embed = 2560
 compute_type = "float32"
 audio_batch_size = 16
 
+# A simple residual block: layer normalization (LayerNorm) followed by two linear layers with a GELU activation in between.
+# The block adds a learned transformation to the embeddings, which helps stabilize learning and improve generalization.
 class SimpleResBlock(nn.Module):
     def __init__(self, phi_embed):
         super().__init__()
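The hunk shows only the head of SimpleResBlock, and a later hunk shows its final return statement. Read together with the new comments, the full block is presumably along these lines; the pre_norm attribute name and the exact layer ordering are assumptions, while self.proj and the residual return match the visible code.

import torch.nn as nn

class SimpleResBlock(nn.Module):
    """Residual block: LayerNorm, then Linear -> GELU -> Linear, added back to the input."""
    def __init__(self, phi_embed):
        super().__init__()
        self.pre_norm = nn.LayerNorm(phi_embed)          # assumed name; normalizes before the projection
        self.proj = nn.Sequential(                       # 'proj' matches the forward() shown in the diff
            nn.Linear(phi_embed, phi_embed),
            nn.GELU(),
            nn.Linear(phi_embed, phi_embed),
        )

    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)                          # residual connection, as in the diff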
@@ -33,19 +39,29 @@ class SimpleResBlock(nn.Module):
         return x + self.proj(x)
 
 # models
+# CLIP Vision Model: pretrained on visual tasks; outputs image embeddings.
+# Projection Layer: projects the clip_embed (768) dimensions to phi_embed (2560) to match the embedding size expected downstream.
+# Residual Block: the custom SimpleResBlock processes the projected embeddings further.
+# Phi-2 Model: the language model that handles text generation.
 clip_model = CLIPVisionModel.from_pretrained(clip_model_name).to(device)
 projection = torch.nn.Linear(clip_embed, phi_embed).to(device)
 resblock = SimpleResBlock(phi_embed).to(device)
 phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,trust_remote_code=True).to(device)
-# Assuming you have defined 'device' and 'compute_type' elsewhere
 audio_model = whisperx.load_model("tiny", device, compute_type=compute_type, asr_options={'max_new_tokens': 2048, 'clip_timestamps': True, 'hallucination_silence_threshold': 0.25})
 
 # load weights
+# LoRA Weights: the LoRA adapter produced by fine-tuning is merged into the Phi-2 model.
+# Finetuned Layers: the fine-tuned weights for the projection layer and residual block are loaded.
 model_to_merge = PeftModel.from_pretrained(phi_model,os.path.join(os.getcwd(), 'model_chkpt/lora_adaptor'))
 merged_model = model_to_merge.merge_and_unload()
 projection.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetunned_projection.pth'),map_location=torch.device(device)))
 resblock.load_state_dict(torch.load(os.path.join(os.getcwd(),'model_chkpt/finetuned_resblock.pth'),map_location=torch.device(device)))
 
+
+# Image Handling: extracts image embeddings with CLIP and passes them through the projection layer.
+# Audio Handling: transcribes the audio with WhisperX, tokenizes the transcript, and embeds the tokens.
+# Text Handling: tokenizes the text query and embeds it.
+# Generating Response: the model generates tokens sequentially, combining the image, audio, and text inputs and predicting the next token until a full response is produced.
 def model_generate_ans(img=None,img_audio=None,val_q=None):
 
     max_generate_length = 100
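The four comments ahead of model_generate_ans summarize the generation flow, but the function body is outside this diff. A rough sketch of that flow under the stated assumptions: it reuses the module-level objects defined above (processor, clip_model, projection, resblock, tokenizer, merged_model, audio_model, audio_batch_size, device); the name model_generate_ans_sketch, the internal variable names, and details such as dropping the CLS token or pure greedy decoding are illustrative, not taken from the actual app.py.

import torch
import whisperx

def model_generate_ans_sketch(img=None, img_audio=None, val_q=None):
    max_generate_length = 100
    parts = []

    # Image handling: CLIP patch embeddings -> projection (768 -> 2560) -> residual block.
    if img is not None:
        image_inputs = processor(images=img, return_tensors="pt").to(device)
        clip_out = clip_model(**image_inputs).last_hidden_state[:, 1:, :]   # drop the CLS token (assumed)
        parts.append(resblock(projection(clip_out)))

    # Audio handling: transcribe with WhisperX, then tokenize and embed the transcript.
    if img_audio is not None:
        audio = whisperx.load_audio(img_audio)
        result = audio_model.transcribe(audio, batch_size=audio_batch_size)
        transcript = " ".join(segment["text"].strip() for segment in result["segments"])
        audio_ids = tokenizer(transcript, return_tensors="pt").input_ids.to(device)
        parts.append(merged_model.get_input_embeddings()(audio_ids))

    # Text handling: tokenize the question and embed it.
    if val_q is not None:
        q_ids = tokenizer(val_q, return_tensors="pt").input_ids.to(device)
        parts.append(merged_model.get_input_embeddings()(q_ids))

    # Response generation: greedy next-token loop over the concatenated embeddings.
    inputs_embeds = torch.cat(parts, dim=1)
    generated_ids = []
    with torch.no_grad():
        for _ in range(max_generate_length):
            logits = merged_model(inputs_embeds=inputs_embeds).logits
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            if next_id.item() == tokenizer.eos_token_id:
                break
            generated_ids.append(next_id.item())
            next_embed = merged_model.get_input_embeddings()(next_id)
            inputs_embeds = torch.cat([inputs_embeds, next_embed], dim=1)

    return tokenizer.decode(generated_ids, skip_special_tokens=True)

Greedy argmax decoding is the simplest choice that matches "predicting the next token until it generates a full response"; the real function may instead use sampling or the reserved IMAGE_TOKEN_ID to mark the image span.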
@@ -110,7 +126,7 @@ with gr.Blocks() as demo:
 /* General Layout */
 body {
     font-family: 'Arial', sans-serif;
-    background-color: #ffe4e1;
+    background-color: #ffe4e1;
     margin: 0;
     padding: 0;
 }