Spaces:

NealCaren
/

TranscribeX

Runtime error

App Files Files Community

NealCaren committed on Jun 22, 2023

Commit

159d2d1

1 Parent(s): 4e5530a

Upload 2 files

Browse files

Files changed (2) hide show

app.py +71 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import whisperx
+import streamlit as st
+import torch
+import tempfile
+import subprocess
+def transcribe(audio_file):
+    if torch.cuda.is_available():
+        device = "gpu"
+    else:
+        device = "cpu"
+    batch_size = 16 # reduce if low on GPU mem
+    compute_type = "int8" # change to "float16" if high on GPU mem (may reduce accuracy)
+    YOUR_HF_TOKEN = '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'
+    # load audio file
+    audio_bytes = uploaded_file.getvalue()
+    with open(temp_file, 'wb') as f:
+        f.write(audio_bytes)
+    # 1. Transcribe with original whisper (batched)
+    model = whisperx.load_model("tiny", device = device, compute_type=compute_type)
+    audio = whisperx.load_audio(temp_file)
+    result = model.transcribe(audio, batch_size=batch_size)
+    st.write("Transcribed! Here's what we have so far:")
+    st.write(result["segments"]) # before alignment
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+    st.write("Aligned! Here's what we have so far:")
+    st.write(result["segments"]) # after alignment
+    # delete model if low on GPU resources
+    # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
+    # add min/max number of speakers if known
+    diarize_segments = diarize_model(audio_file)
+    # diarize_model(audio_file, min_speakers=min_speakers, max_speakers=max_speakers)
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+    st.write(diarize_segments)
+    st.write(result["segments"]) # segments are now assigned speaker IDs
+st.title("Automated Transcription")
+form = st.form(key='my_form')
+uploaded_file = form.file_uploader("Choose a file")
+submit = form.form_submit_button("Transcribe!")
+if submit:
+    #temporary file to store audio_file
+    tmp_dir = tempfile.TemporaryDirectory()
+    temp_file = tmp_dir.name + '/mono.wav'
+    cmd = f"ffmpeg -y -i {uploaded_file} -acodec pcm_s16le -ar 16000 -ac 1 {temp_file}"
+    subprocess.Popen(cmd, shell=True).wait()
+    transcribe(temp_file)

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+git+https://github.com/m-bain/whisperx.git
+streamlit
+pandas