Huong committed
Commit 265ea18 · 1 Parent(s): 704a4fe

Add application file

Files changed (1)
  1. app.py +428 -0
app.py ADDED
@@ -0,0 +1,428 @@
+ import gradio as gr
+ import re
+ import librosa
+ import torch
+ import numpy as np
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ from whisper.normalizers import EnglishTextNormalizer
+ from olmoasr import load_model
+ from bs4 import BeautifulSoup
+
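+ # Model setup: a Hugging Face Whisper-style seq2seq checkpoint driven
+ # through the transformers pipeline, plus a native OLMoASR checkpoint
+ # for sequential decoding.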
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ hf_model_path = "checkpoints/medium_hf_demo"
+ olmoasr_ckpt = (
+     "checkpoints/eval_latesttrain_00524288_medium_fsdp-train_grad-acc_bfloat16_inf.pt"
+ )
+
+ hf_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     hf_model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ hf_model.to(device).eval()
+ processor = AutoProcessor.from_pretrained(hf_model_path)
+
+ olmoasr_model = load_model(
+     name=olmoasr_ckpt, device=device, inference=True, in_memory=True
+ )
+ olmoasr_model.to(device).eval()
+
+ normalizer = EnglishTextNormalizer()
+
+
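+ # Downmix multi-channel audio to a single channel before resampling and
+ # feature extraction.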
+ def stereo_to_mono(waveform):
+     # librosa returns shape (channels, samples) for multi-channel audio
+     if waveform.ndim == 2:
+         # Average the channels to convert to mono
+         return np.mean(waveform, axis=0)
+     # Already mono: return as is
+     return waveform
+
+
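+ # Chunked long-form inference: the HF pipeline splits the audio into
+ # 30 s windows, transcribes each window, and merges the timestamped
+ # chunks.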
+ def hf_chunk_transcribe(audio_file, timestamp_text, transcription_text):
+     hf_transcriber = pipeline(
+         "automatic-speech-recognition",
+         model=hf_model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         torch_dtype=torch_dtype,
+         device=device,
+         chunk_length_s=30,
+     )
+
+     waveform, sample_rate = librosa.load(audio_file, sr=None, mono=False)
+     waveform = stereo_to_mono(waveform)
+     print(waveform.shape)
+
+     if sample_rate != 16000:
+         waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
+
+     result = hf_transcriber(waveform, return_timestamps=True)
+     print(f"{result['text']=}\n")
+     print(f"{result['chunks']=}\n")
+
+     # text = result["text"].strip().replace("\n", " ")
+     # text = re.sub(r"(foreign|foreign you|you)\s*$", "", text)
+
+     chunks, text = hf_process_chunks(result["chunks"])
+     print(f"{chunks=}\n")
+     print(f"{text=}\n")
+
+     # Edit components
+     transSoup = BeautifulSoup(transcription_text, "html.parser")
+     transText = transSoup.find(id="transcriptionText")
+     if transText:
+         transText.clear()
+         transText.append(BeautifulSoup(text, "html.parser"))
+
+     timeSoup = BeautifulSoup(timestamp_text, "html.parser")
+     timeText = timeSoup.find(id="timestampText")
+     if timeText:
+         timeText.clear()
+         timeText.append(BeautifulSoup(chunks, "html.parser"))
+
+     return str(timeSoup), str(transSoup)
+
+
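+ # Sequential inference with the native OLMoASR decoder: Whisper-style
+ # 30 s windows decoded in order with beam search.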
+ def olmoasr_seq_transcribe(audio_file, timestamp_text, transcription_text):
+     waveform, sample_rate = librosa.load(audio_file, sr=None, mono=False)
+     waveform = stereo_to_mono(waveform)
+     print(waveform.shape)
+
+     if sample_rate != 16000:
+         waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
+
+     options = dict(
+         task="transcribe",
+         language="en",
+         without_timestamps=False,
+         beam_size=5,
+         best_of=5,
+     )
+     result = olmoasr_model.transcribe(waveform, verbose=False, **options)
+     print(f"{result['text']=}\n")
+     print(f"{result['segments']=}\n")
+
+     # text = result["text"].strip().replace("\n", " ")
+     # text = re.sub(r"(foreign|foreign you|Thank you for watching!|. you)\s*$", "", text)
+
+     chunks, text = olmoasr_process_chunks(result["segments"])
+     print(f"{chunks=}\n")
+     print(f"{text=}\n")
+
+     # Edit components
+     transSoup = BeautifulSoup(transcription_text, "html.parser")
+     transText = transSoup.find(id="transcriptionText")
+     if transText:
+         transText.clear()
+         transText.append(BeautifulSoup(text, "html.parser"))
+
+     timeSoup = BeautifulSoup(timestamp_text, "html.parser")
+     timeText = timeSoup.find(id="timestampText")
+     if timeText:
+         timeText.clear()
+         timeText.append(BeautifulSoup(chunks, "html.parser"))
+
+     return str(timeSoup), str(transSoup)
+
+
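+ # Sequential long-form inference through the HF pipeline: no
+ # chunk_length_s is set, so the model decodes the audio
+ # window-by-window and returns timestamped chunks.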
+ def hf_seq_transcribe(audio_file, timestamp_text, transcription_text):
+     hf_transcriber = pipeline(
+         "automatic-speech-recognition",
+         model=hf_model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+     waveform, sample_rate = librosa.load(audio_file, sr=None, mono=False)
+     waveform = stereo_to_mono(waveform)
+     print(waveform.shape)
+
+     if sample_rate != 16000:
+         waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
+
+     result = hf_transcriber(
+         waveform,
+         return_timestamps=True,
+     )
+     print(f"{result['text']=}\n")
+     print(f"{result['chunks']=}\n")
+
+     # text = result["text"].strip().replace("\n", " ")
+     # text = re.sub(r"(foreign|foreign you|you)\s*$", "", text)
+
+     chunks, text = hf_seq_process_chunks(result["chunks"])
+     print(f"{text=}\n")
+     print(f"{chunks=}\n")
+
+     # Edit components
+     transSoup = BeautifulSoup(transcription_text, "html.parser")
+     transText = transSoup.find(id="transcriptionText")
+     if transText:
+         transText.clear()
+         transText.append(BeautifulSoup(text, "html.parser"))
+
+     timeSoup = BeautifulSoup(timestamp_text, "html.parser")
+     timeText = timeSoup.find(id="timestampText")
+     if timeText:
+         timeText.clear()
+         timeText.append(BeautifulSoup(chunks, "html.parser"))
+
+     return str(timeSoup), str(transSoup)
+
+
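+ # Route the request to the strategy selected in the dropdown.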
+ def main_transcribe(inference_strategy, audio_file, timestamp_text, transcription_text):
+     if inference_strategy == "HuggingFace Chunking":
+         return hf_chunk_transcribe(audio_file, timestamp_text, transcription_text)
+     elif inference_strategy == "OLMoASR Sequential":
+         return olmoasr_seq_transcribe(audio_file, timestamp_text, transcription_text)
+     elif inference_strategy == "HuggingFace Sequential":
+         return hf_seq_transcribe(audio_file, timestamp_text, transcription_text)
+
+
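+ # Segment post-processing: stop at trailing hallucinated filler
+ # ("foreign", "you", "Thank you for watching!", ...) that the models
+ # tend to emit on silence, and render each kept segment as
+ # "start --> end: text <br>".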
+ def olmoasr_process_chunks(chunks):
+     processed_chunks = []
+     processed_chunks_text = []
+     for chunk in chunks:
+         text = chunk["text"].strip()
+         if not re.match(
+             r"\s*(foreign you|foreign|Thank you for watching!|you there|you)\s*$", text
+         ):
+             if text.strip() == "":
+                 continue
+             start = chunk["start"]
+             end = chunk["end"]
+             pattern = r"\n(?!\d+\.\d+\s*-->)"
+             text = re.sub(pattern, "", text)
+             processed_chunks_text.append(text.strip())
+             processed_chunks.append(f"{start:.2f} --> {end:.2f}: {text} <br>")
+         else:
+             break
+     print(f"{processed_chunks=}\n")
+     print(f"{processed_chunks_text=}\n")
+     print(
+         re.search(r"\s*foreign\s*$", processed_chunks_text[-1])
+         if processed_chunks_text
+         else None
+     )
+
+     if processed_chunks_text and re.search(
+         r"\s*foreign\s*$", processed_chunks_text[-1]
+     ):
+         processed_chunks_text[-1] = re.sub(
+             r"\s*foreign\s*$", "", processed_chunks_text[-1]
+         )
+         processed_chunks[-1] = re.sub(r"foreign\s*<br>", "<br>", processed_chunks[-1])
+     return "\n".join(processed_chunks), " ".join(processed_chunks_text)
+
+
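+ # Same filtering for HF pipeline chunks, which carry a (start, end)
+ # "timestamp" tuple; the final chunk's end timestamp can come back as
+ # None, so it is guarded below.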
+ def hf_process_chunks(chunks):
+     processed_chunks = []
+     processed_chunks_text = []
+     for chunk in chunks:
+         text = chunk["text"].strip()
+         if not re.match(r"(foreign you|foreign|you there|you)\s*$", text):
+             if text.strip() == "":
+                 continue
+             start = chunk["timestamp"][0]
+             end = chunk["timestamp"][1]
+             if end is None:
+                 # The pipeline may leave the last end timestamp unset
+                 end = start
+             pattern = r"\n(?!\d+\.\d+\s*-->)"
+             text = re.sub(pattern, "", text)
+             processed_chunks_text.append(text.strip())
+             processed_chunks.append(f"{start:.2f} --> {end:.2f}: {text.strip()} <br>")
+         else:
+             break
+     print(f"{processed_chunks=}\n")
+     print(f"{processed_chunks_text=}\n")
+     print(
+         re.search(r"\s*foreign\s*$", processed_chunks_text[-1])
+         if processed_chunks_text
+         else None
+     )
+
+     if processed_chunks_text and re.search(
+         r"\s*foreign\s*$", processed_chunks_text[-1]
+     ):
+         processed_chunks_text[-1] = re.sub(
+             r"\s*foreign\s*$", "", processed_chunks_text[-1]
+         )
+         processed_chunks[-1] = re.sub(r"foreign\s*<br>", "<br>", processed_chunks[-1])
+     return "\n".join(processed_chunks), " ".join(processed_chunks_text)
+
+
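+ # Sequential HF decoding restarts timestamps at each 30 s window, so
+ # this variant detects when a start time moves backwards and adds the
+ # accumulated offset to keep the timeline monotonic.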
+ def hf_seq_process_chunks(chunks):
+     processed_chunks = []
+     processed_chunks_text = []
+     delta_time = 0.0
+     global_start = chunks[0]["timestamp"][0]
+     prev_end = -1.0
+     prev_dur = 0.0
+     accumulate_ts = False
+     for chunk in chunks:
+         text = chunk["text"].strip()
+         if not re.match(r"\s*(foreign you|foreign|you there|you)\s*$", text):
+             if text.strip() == "":
+                 continue
+             start = chunk["timestamp"][0]
+             end = chunk["timestamp"][1]
+             # A backwards jump means a new window started: begin offsetting
+             if start < prev_end:
+                 accumulate_ts = True
+                 prev_dur += delta_time
+                 # print(f"{prev_dur=}")
+
+             delta_time = end - global_start
+             # print(f"{delta_time=}")
+
+             prev_end = end
+             # print(f"{prev_end=}")
+             if accumulate_ts:
+                 start += prev_dur
+                 end += prev_dur
+             # print(f"{start=}, {end=}, {prev_dur=}")
+
+             pattern = r"\n(?!\d+\.\d+\s*-->)"
+             text = re.sub(pattern, "", text)
+             processed_chunks_text.append(text.strip())
+             processed_chunks.append(f"{start:.2f} --> {end:.2f}: {text.strip()} <br>")
+         else:
+             break
+     print(f"{processed_chunks=}\n")
+     print(f"{processed_chunks_text=}\n")
+     print(
+         re.search(r"\s*foreign\s*$", processed_chunks_text[-1])
+         if processed_chunks_text
+         else None
+     )
+
+     if processed_chunks_text and re.search(
+         r"\s*foreign\s*$", processed_chunks_text[-1]
+     ):
+         processed_chunks_text[-1] = re.sub(
+             r"\s*foreign\s*$", "", processed_chunks_text[-1]
+         )
+         processed_chunks[-1] = re.sub(r"foreign\s*<br>", "<br>", processed_chunks[-1])
+     return "\n".join(processed_chunks), " ".join(processed_chunks_text)
+
+
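+ # Static HTML shells for the two output panels; transcriptions are
+ # injected into the inner divs by id.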
+ original_timestamp_html = """
+ <div style="background: white; border: 1px solid #d1d5db; border-radius: 8px; padding: 16px; width: 100%; box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); flex: 1; margin-right: 10px;">
+ <div style="color: #374151; font-size: 14px; font-weight: 500; margin-bottom: 8px;">Timestamp Text</div>
+ <div id="timestampText" style="color: #6b7280; font-size: 14px; line-height: 1.5; min-height: 100px; font-family: system-ui, sans-serif;"></div>
+ </div>
+ """
+
+ original_transcription_html = """
+ <div style="background: white; border: 1px solid #d1d5db; border-radius: 8px; padding: 16px; width: 100%; box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); flex: 1; margin-right: 10px;">
+ <div style="color: #374151; font-size: 14px; font-weight: 500; margin-bottom: 8px;">Transcription Text</div>
+ <div id="transcriptionText" style="color: #6b7280; font-size: 14px; line-height: 1.5; min-height: 100px; font-family: system-ui, sans-serif;"></div>
+ </div>
+ """
+
+
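+ # Clear both panels back to their empty shells.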
+ def reset():
+     return original_timestamp_html, original_transcription_html
+
+
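+ # Client-side highlighting: poll the player's elapsed-time readout
+ # (this assumes Gradio renders it in an element with id "time") every
+ # 50 ms, find the segment whose start time most recently passed, and
+ # wrap that line in a highlight span.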
+ event_process_js = """
+ <script>
+ function getTime() {
+     lastIndex = -1;
+     setInterval(() => {
+         time = document.getElementById('time');
+         timestampText = document.getElementById('timestampText');
+         if (timestampText && timestampText.innerText != '') {
+             if (time == null) {
+                 // Player was removed: clear both panels
+                 timestampText.innerText = '';
+                 transcriptionText = document.getElementById('transcriptionText');
+                 if (transcriptionText) {
+                     transcriptionText.innerText = '';
+                 }
+                 lastIndex = -1;
+                 return;
+             }
+             timeContent = time.textContent;
+             const parts = timeContent.split(":").map(Number);
+             currTime = parts[0] * 60 + parts[1];
+             currText = timestampText.innerText;
+             const matches = [...currText.matchAll(/([\d.]+)\s*-->/g)];
+             const startTimestamps = matches.map(m => parseFloat(m[1]));
+
+             if (startTimestamps.length != 0) {
+                 correctIndex = 0;
+                 for (let i = 0; i < startTimestamps.length; i++) {
+                     if (startTimestamps[i] <= currTime) {
+                         correctIndex = i;
+                     } else {
+                         break;
+                     }
+                 }
+                 if (lastIndex != correctIndex) {
+                     lastIndex = correctIndex;
+                     lines = currText.split('\\n');
+                     lines[correctIndex] = '<span style="background-color: #ff69b4; padding: 3px 8px; font-weight: 500; border-radius: 4px; color: white; box-shadow: 0 0 10px rgba(255, 105, 180, 0.5);">' + lines[correctIndex] + '</span>';
+                     try {
+                         timestampText.innerHTML = lines.join('<br>');
+                     } catch (e) {
+                         console.log('Not Updating!');
+                     }
+                 }
+             }
+         } else {
+             lastIndex = -1;
+         }
+     }, 50);
+ }
+ setTimeout(getTime, 1000);
+ </script>
+ """
+ demo = gr.Blocks(
+     head=event_process_js,
+     theme=gr.themes.Default(primary_hue="emerald", secondary_hue="green"),
+ )
+ with demo:
+     audio = gr.Audio(sources=["upload", "microphone"], type="filepath")
+     inf_strategy = gr.Dropdown(
+         label="Inference Strategy",
+         choices=[
+             "HuggingFace Chunking",
+             "HuggingFace Sequential",
+             "OLMoASR Sequential",
+         ],
+         value="HuggingFace Chunking",
+         multiselect=False,
+         info="Select the inference strategy for transcription.",
+         elem_id="inf_strategy",
+     )
+     main_transcribe_button = gr.Button(
+         "Transcribe",
+         variant="primary",
+     )
+     with gr.Row():
+         timestampText = gr.HTML(original_timestamp_html)
+         transcriptionText = gr.HTML(original_transcription_html)
+     inf_strategy.change(
+         fn=reset,
+         inputs=[],
+         outputs=[timestampText, transcriptionText],
+     )
+     main_transcribe_button.click(
+         fn=main_transcribe,
+         inputs=[inf_strategy, audio, timestampText, transcriptionText],
+         outputs=[timestampText, transcriptionText],
+     )
+ demo.launch(share=True)