sanchit-gandhi committed
Commit 1923ff8 · 1 Parent(s): c220da3

back to batched (b2b)

Files changed (1)
app.py +23 -9
app.py CHANGED
@@ -24,7 +24,6 @@ language_names = sorted(TO_LANGUAGE_CODE.keys())
 CHUNK_LENGTH_S = 30
 BATCH_SIZE = 16
 NUM_PROC = 8
-SAMPLING_RATE = 16000
 FILE_LIMIT_MB = 1000
 
 
@@ -71,7 +70,10 @@ def forward(batch, task=None, return_timestamps=False):
 
 
 if __name__ == "__main__":
-    def transcribe_audio(microphone, file_upload, task, return_timestamps):
+    processor = WhisperPrePostProcessor.from_pretrained("openai/whisper-large-v2")
+    pool = Pool(NUM_PROC)
+
+    def transcribe_chunked_audio(microphone, file_upload, task, return_timestamps):
         warn_output = ""
         if (microphone is not None) and (file_upload is not None):
             warn_output = (
@@ -80,19 +82,31 @@ if __name__ == "__main__":
             )
 
         elif (microphone is None) and (file_upload is None):
-            return "ERROR: You have to either use the microphone or upload an audio file"
+            return "ERROR: You have to either use the microphone or upload an audio file", None
 
         inputs = microphone if microphone is not None else file_upload
 
+        file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
+        if file_size_mb > FILE_LIMIT_MB:
+            return f"ERROR: File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB.", None
+
         with open(inputs, "rb") as f:
             inputs = f.read()
 
-        inputs = ffmpeg_read(inputs, SAMPLING_RATE)
-        inputs = {"array": base64.b64encode(inputs.tobytes()).decode(), "sampling_rate": SAMPLING_RATE}
+        inputs = ffmpeg_read(inputs, processor.feature_extractor.sampling_rate)
+        inputs = {"array": inputs, "sampling_rate": processor.feature_extractor.sampling_rate}
+
+        dataloader = processor.preprocess_batch(inputs, chunk_length_s=CHUNK_LENGTH_S, batch_size=BATCH_SIZE)
 
-        text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
+        try:
+            model_outputs = pool.map(partial(forward, task=task, return_timestamps=return_timestamps), dataloader)
+        except ValueError as err:
+            # pre-processor does all the necessary compatibility checks for our audio inputs
+            return err, None
 
-        return warn_output + text, timestamps
+        post_processed = processor.postprocess(model_outputs, return_timestamps=return_timestamps)
+        timestamps = post_processed.get("chunks")
+        return warn_output + post_processed["text"], timestamps
 
     def _return_yt_html_embed(yt_url):
         video_id = yt_url.split("?v=")[-1]
@@ -110,7 +124,7 @@ if __name__ == "__main__":
         return html_embed_str, text, timestamps
 
     audio_chunked = gr.Interface(
-        fn=transcribe_audio,
+        fn=transcribe_chunked_audio,
        inputs=[
            gr.inputs.Audio(source="microphone", optional=True, type="filepath"),
            gr.inputs.Audio(source="upload", optional=True, type="filepath"),
@@ -152,5 +166,5 @@ if __name__ == "__main__":
     with demo:
         gr.TabbedInterface([audio_chunked, youtube], ["Transcribe Audio", "Transcribe YouTube"])
 
-    demo.queue(concurrency_count=5, max_size=10)
+    demo.queue(concurrency_count=3, max_size=10)
     demo.launch()
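
Note on the change: instead of sending the whole file through a single inference call, the new transcribe_chunked_audio cuts the audio into CHUNK_LENGTH_S-second chunks, groups the chunks into batches of BATCH_SIZE, maps forward over the batches in parallel via the Pool, and lets processor.postprocess stitch the per-batch outputs back into one transcription with optional timestamp chunks. The snippet below is only a toy sketch of the chunk-and-batch step on a raw NumPy array; it is not the WhisperPrePostProcessor API (which also performs feature extraction and uses overlapping strides between chunks), and SAMPLING_RATE here merely stands in for processor.feature_extractor.sampling_rate.

import numpy as np

# Toy sketch of the chunk-and-batch strategy behind transcribe_chunked_audio.
# NOT the WhisperPrePostProcessor API: it simply cuts a waveform into
# CHUNK_LENGTH_S-second pieces and groups them into batches of BATCH_SIZE,
# with no overlapping stride between chunks.
CHUNK_LENGTH_S = 30
BATCH_SIZE = 16
SAMPLING_RATE = 16000  # stand-in for processor.feature_extractor.sampling_rate


def chunk_and_batch(array, sampling_rate=SAMPLING_RATE):
    chunk_len = CHUNK_LENGTH_S * sampling_rate
    chunks = [array[i : i + chunk_len] for i in range(0, len(array), chunk_len)]
    for start in range(0, len(chunks), BATCH_SIZE):
        yield chunks[start : start + BATCH_SIZE]


audio = np.zeros(10 * 60 * SAMPLING_RATE, dtype=np.float32)  # 10 minutes of audio
batches = list(chunk_and_batch(audio))
print([len(batch) for batch in batches])  # [16, 4] -> two forward passes

In the app itself this step is delegated to processor.preprocess_batch, and the dataloader it returns is what pool.map iterates over.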
 
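Pool.map hands each worker exactly one positional argument (a batch from the dataloader), which is why the fixed keyword arguments task and return_timestamps are bound up front with functools.partial. Below is a minimal, self-contained sketch of that pattern, assuming a standard multiprocessing.Pool (as suggested by Pool(NUM_PROC) in the diff) and using a stand-in for the app's forward function.

from functools import partial
from multiprocessing import Pool

NUM_PROC = 8


def forward(batch, task=None, return_timestamps=False):
    # Stand-in for the app's forward(): just echo what each worker received.
    return {"batch": batch, "task": task, "return_timestamps": return_timestamps}


if __name__ == "__main__":
    dataloader = [[1, 2], [3, 4], [5, 6]]  # pretend batches from preprocess_batch
    with Pool(NUM_PROC) as pool:
        # partial() fixes task/return_timestamps so pool.map only supplies the batch
        model_outputs = pool.map(
            partial(forward, task="transcribe", return_timestamps=True), dataloader
        )
    print(model_outputs)

In the app the pool is created once at start-up and reused across requests, so the Gradio queue settings (concurrency_count=3, max_size=10) are what effectively limit how many requests contend for it at a time.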