Spaces:

AkashDataScience
/

Phi-3_multimodel_assistant

Sleeping

App Files Files Community

AkashDataScience committed on Oct 13, 2024

Commit

b52bed7

1 Parent(s): 2d191f6

Added inference

Browse files

Files changed (2) hide show

app.py +54 -1
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import torch.nn as nn
 from model import Projections
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 projections = Projections(512, 3072)
@@ -47,7 +48,59 @@ whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
 whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
 def infer(message, history):
-    return message.keys()
 examples=[{'text':"I am planning to buy a dog and a cat. Suggest some breeds that get along with each other"},
           {'text':"Explain biased coin flip"},

 from model import Projections
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
+import librosa
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 projections = Projections(512, 3072)
 whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
 def infer(message, history):
+    max_generate_length = 100
+    combined_embeds = []
+    with torch.no_grad():
+        if message['file']:
+            projected_image_embeds = None
+            audio_text_embeds = None
+            for path in message['file']:
+                if path.endswith(('.jpg', '.png', '.jpeg')):
+                    image = clip_preprocess(Image.open(path)).unsqueeze(0).to(device)
+                    image_features = clip_model.encode_image(image)
+                    projected_image_embeds = projections(image_features.to(torch.bfloat16)).unsqueeze(0)
+                elif path.endswith(('.mp3', '.wav')):
+                    # Load and preprocess the audio
+                    speech, rate = librosa.load(path, sr=16000)
+                    input_features = whisper_processor(speech, return_tensors="pt", sampling_rate=16000).input_features
+                    predicted_ids = whisper_model.generate(input_features)
+                    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
+                    prompt = tokenizer.apply_chat_template([{"from": "human", "value": transcription}], tokenize=False, add_generation_prompt=True)
+                    prompt_tokens = tokenizer(prompt, padding=True, truncation=True, max_length=2048, return_tensors="pt")['input_ids']
+                    audio_text_embeds = model.get_input_embeddings()(prompt_tokens)
+            if projected_image_embeds:
+                combined_embeds.append(projected_image_embeds)
+            if audio_text_embeds:
+                combined_embeds.append(audio_text_embeds)
+        if  message['text']:
+            prompt = tokenizer.apply_chat_template([{"from": "human", "value": transcription}], tokenize=False, add_generation_prompt=True)
+            prompt_tokens = tokenizer(prompt, padding=True, truncation=True, max_length=2048, return_tensors="pt")['input_ids']
+            text_embeds = model.get_input_embeddings()(prompt_tokens)
+            combined_embeds.append(text_embeds)
+        combined_embeds = torch.cat(combined_embeds,dim=1)
+        #val_combined_embeds = torch.cat([val_image_embeds, img_token_embeds, val_q_embeds], dim=1) # 4, 69, 2560
+        predicted_caption = torch.full((1,max_generate_length),50256).to(device)
+        for g in range(max_generate_length):
+            phi_output_logits = model(inputs_embeds=combined_embeds)['logits'] # 4, 69, 51200
+            predicted_word_token_logits = phi_output_logits[:, -1, :].unsqueeze(1) # 4,1,51200
+            predicted_word_token = torch.argmax(predicted_word_token_logits, dim = -1) # 4,1
+            predicted_caption[:,g] = predicted_word_token.view(1,-1)
+            next_token_embeds = model.get_input_embeddings()(prompt_tokens) # 4,1,2560
+            combined_embeds = torch.cat([combined_embeds, next_token_embeds], dim=1)
+        predicted_captions_decoded = tokenizer.batch_decode(predicted_caption,ignore_index = 50256)[0]
+    return predicted_captions_decoded
 examples=[{'text':"I am planning to buy a dog and a cat. Suggest some breeds that get along with each other"},
           {'text':"Explain biased coin flip"},

requirements.txt CHANGED Viewed

@@ -3,6 +3,8 @@ clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8
 colorama==0.4.6
 datasets==3.0.0
 dill==0.3.8
 multiprocess==0.70.16
 numpy==1.26.4
 pandas==2.2.2

 colorama==0.4.6
 datasets==3.0.0
 dill==0.3.8
+gradio==5.0.2
+librosa==0.10.2
 multiprocess==0.70.16
 numpy==1.26.4
 pandas==2.2.2