whisper-vs-distil-whisper

Runtime error

App Files Files Community

sanchit-gandhi committed on Nov 3, 2023

Commit

172ec24

1 Parent(s): 9e35e59

tidy

Browse files

Files changed (1) hide show

app.py +26 -6

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from transformers.utils import is_flash_attn_2_available
 import torch
 import gradio as gr
 import time
-import os
 BATCH_SIZE = 16
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@@ -15,10 +16,11 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
     "openai/whisper-large-v2", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
 )
 distilled_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    "distil-whisper/distil-large-v2", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2, token=TOKEN
 )
 if not use_flash_attention_2:
     model = model.to_bettertransformer()
     distilled_model = distilled_model.to_bettertransformer()
@@ -49,6 +51,7 @@ distil_pipe = pipeline(
     chunk_length_s=15,
     torch_dtype=torch_dtype,
     device=device,
 )
 distil_pipe_forward = distil_pipe._forward
@@ -56,6 +59,20 @@ def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
     def _forward_distil_time(*args, **kwargs):
         global distil_runtime
         start_time = time.time()
@@ -92,7 +109,7 @@ if __name__ == "__main__":
                     "
                   >
                     <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                      Distil-Whisper VS Whisper
                     </h1>
                   </div>
                 </div>
@@ -100,8 +117,11 @@ if __name__ == "__main__":
         )
         gr.HTML(
             f"""
-            This demo evaluates the <a href="https://huggingface.co/distil-whisper/distil-large-v2"> Distil-Whisper </a> model
-            against the <a href="https://huggingface.co/openai/whisper-large-v2"> Whisper </a> model.
             """
         )
         audio = gr.components.Audio(type="filepath", label="Audio input")
@@ -117,4 +137,4 @@ if __name__ == "__main__":
             inputs=audio,
             outputs=[distil_transcription, distil_runtime, transcription, runtime],
         )
-    demo.queue().launch()

 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from transformers.utils import is_flash_attn_2_available
+from transformers.pipelines.audio_utils import ffmpeg_read
 import torch
 import gradio as gr
 import time
 BATCH_SIZE = 16
+MAX_AUDIO_MINS = 30  # maximum audio input in minutes
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
     "openai/whisper-large-v2", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
 )
 distilled_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    "distil-whisper/distil-large-v2", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, use_flash_attention_2=use_flash_attention_2
 )
 if not use_flash_attention_2:
+    # use flash attention from pytorch sdpa
     model = model.to_bettertransformer()
     distilled_model = distilled_model.to_bettertransformer()
     chunk_length_s=15,
     torch_dtype=torch_dtype,
     device=device,
+    generate_kwargs={"language": "en", "task": "transcribe"},
 )
 distil_pipe_forward = distil_pipe._forward
     if inputs is None:
         raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
+    with open(inputs, "rb") as f:
+        inputs = f.read()
+    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+    audio_length_mins = len(inputs) / pipe.feature_extractor.sampling_rate / 60
+    if audio_length_mins > MAX_AUDIO_MINS:
+        raise gr.Error(
+            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes."
+            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
+        )
+    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
     def _forward_distil_time(*args, **kwargs):
         global distil_runtime
         start_time = time.time()
                     "
                   >
                     <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+                      Whisper vs Distil-Whisper
                     </h1>
                   </div>
                 </div>
         )
         gr.HTML(
             f"""
+            This demo shows a speed comparison between <a href="https://huggingface.co/openai/whisper-large-v2"> Whisper </a>
+            and <a href="https://huggingface.co/distil-whisper/distil-large-v2"> Distil-Whisper </a> for the same audio
+            file input. Both models use the <a href="https://huggingface.co/distil-whisper/distil-large-v2#long-form-transcription"> chunked long-form transcription algorithm </a>
+            in 🤗 Transformers with Flash Attention support. To ensure fair usage of the Space, we ask that audio
+            file inputs are kept to < 30 mins.
             """
         )
         audio = gr.components.Audio(type="filepath", label="Audio input")
             inputs=audio,
             outputs=[distil_transcription, distil_runtime, transcription, runtime],
         )
+    demo.queue(max_size=10).launch()