Spaces:

botario
/

wave2vec_luxembourgish

Sleeping

App Files Files Community

botarioAcc committed on Jun 16

Commit

6a91da6

verified ·

1 Parent(s): 40e63a3

Initial Commit

Browse files

Files changed (4) hide show

app.py +91 -0
config.py +24 -0
requirements.txt +9 -0
utils.py +67 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import time
+import torch
+import gradio as gr
+from transformers.pipelines import pipeline
+import utils
+from config import (
+    MODEL_PATHS,
+    SUPPORTED_LANGUAGES,
+    CUSTOM_CSS,
+)
+# set language here: available are en, de and lb
+LANGUAGE = "lb"
+if LANGUAGE not in SUPPORTED_LANGUAGES:
+    print(f"language ({LANGUAGE}) not supported. Use one of {SUPPORTED_LANGUAGES}")
+    exit()
+else:
+    MODEL_PATH = MODEL_PATHS[LANGUAGE]
+wave2vec_pipeline = pipeline(
+    "automatic-speech-recognition",
+    model=MODEL_PATH,
+    device=0 if torch.cuda.is_available() else -1,
+    chunk_length_s=30,
+    stride_length_s=(4, 2),
+    batch_size=8,
+)
+def transcribe_gradio(audio_path: str | None) -> str:
+    """
+    Transcribe an uploaded or recorded audio file and report inference time.
+    Args:
+        audio_path: Local filesystem path to the audio file provided by Gradio;
+                    None or empty if the user hasn't recorded/uploaded anything.
+    Returns:
+        A string containing either:
+          - A warning if no file was provided,
+          - An error message if loading/decoding failed,
+          - Or the transcript followed by the elapsed inference time.
+    """
+    if not audio_path:
+        return "⚠️  Please record something or choose a file first."
+    start = time.time()
+    try:
+        transcript = utils.transcribe_file(audio_path, wave2vec_pipeline)
+    except ValueError as err:
+        return f"❌ {err}"
+    runtime = time.time() - start
+    return f"{transcript}\n\n⌛ Inference time: {runtime:.2f} s"
+# Gradio interface: a Blocks layout using the "soft" theme plus the project's
+# CUSTOM_CSS. Components appear in the order they are created below.
+with gr.Blocks(title="Wave2Vec (Luxembourgish) ", theme="soft", css=CUSTOM_CSS) as demo:
+    # Page header and short usage instructions.
+    gr.Markdown("""
+    # 🎙️ Speech-to-Text Demo — Wave2Vec (Luxembourgish)
+    Use **Record** to capture speech live or **Upload** to select an audio file (.wav, .mp3, .flac).
+    Hit **Transcribe** to convert your recording into text, and **Clear** to reset both fields.
+    """)
+    # First row: audio input next to the transcript output.
+    with gr.Row():
+        # Accepts a microphone recording or an uploaded file; type="filepath"
+        # means the click handler receives a path on disk.
+        audio_input = gr.Audio(
+            sources=["microphone", "upload"],
+            type="filepath",
+            label="Input audio",
+            autoplay=False,
+        )
+        output_text = gr.Textbox(
+            label="Transcript",
+            placeholder="Your transcript will appear here …",
+            show_copy_button=True,
+            lines=10,
+        )
+    # Second row: action buttons. "centered-row" and "clear-btn" are the
+    # classes styled in CUSTOM_CSS.
+    with gr.Row(equal_height=True, elem_classes="centered-row") as row:
+        transcribe_btn = gr.Button("Transcribe ✨", scale=0)
+        # Resets both the audio input and the transcript box.
+        clear_btn = gr.ClearButton(
+            [audio_input, output_text], scale=0, elem_classes="clear-btn"
+        )
+    # On click: pass the audio path to transcribe_gradio and show its return
+    # value in the transcript box.
+    transcribe_btn.click(transcribe_gradio, inputs=audio_input, outputs=output_text)
+# Launch the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from typing import Final
+TARGET_SR: Final[int] = 16_000
+SUPPORTED_LANGUAGES: Final[set[str]] = {"de", "en", "lb"}
+MODEL_PATHS: dict[str, str] = {
+    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german",
+    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "lb": "Lemswasabi/wav2vec2-large-xlsr-53-842h-luxembourgish-14h",
+}
+SUPPORTED_EXTS: Final[set[str]] = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}
+CUSTOM_CSS: Final[str] = """
+.centered-row {
+    display: flex;
+    justify-content: center;
+}
+.clear-btn {
+    background-color: #ffcccc !important;
+}
+"""

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+accelerate>=1.7.0
+gradio>=5.34.0
+kenlm>=0.3.0
+protobuf>=6.31.1
+pyctcdecode>=0.5.0
+soundfile>=0.13.1
+torch>=2.7.1
+torchaudio>=2.7.1
+transformers>=4.52.4

utils.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import torch
+import torchaudio
+from pathlib import Path
+import soundfile as sf
+from typing import Any
+from config import TARGET_SR, SUPPORTED_EXTS
+def transcribe_file(path: str | Path, pipe: Any) -> str:
+    """
+    Transcribe an audio file to text using a given ASR pipeline.
+    Args:
+        path: Path or string pointing to an audio file.
+        asr_pipeline: A Hugging Face transformers pipeline object for
+                      automatic-speech-recognition. Should accept a numpy
+                      array and return a dict with key 'text'.
+    Returns:
+        The transcribed text as returned by the pipeline.
+    Raises:
+        ValueError: If loading or decoding the audio fails.
+    """
+    speech = load_resample(path)
+    return pipe(speech.numpy())["text"]  # type: ignore[index]
+def load_resample(path: str | Path, target_sr: int = TARGET_SR) -> torch.Tensor:
+    """
+    Load an audio file and resample it to the target sample rate, returning
+    a mono torch.Tensor.
+    Args:
+        path: Path or string pointing to an audio file.
+        target_sr: Desired sample rate (in Hz). Defaults to TARGET_SR from config.
+    Returns:
+        A 1-D torch.Tensor of dtype float32 sampled at target_sr.
+    Raises:
+        ValueError: If the file extension is not in SUPPORTED_EXTS.
+        ValueError: If the audio file cannot be decoded.
+    """
+    ext = Path(path).suffix.lower()
+    if ext not in SUPPORTED_EXTS:
+        raise ValueError(
+            f"Unsupported file-type “{ext or 'unknown'}”. Please upload WAV, FLAC, MP3, OGG/Opus or M4A."
+        )
+    try:
+        speech, sr = sf.read(str(path))
+    except RuntimeError as exc:
+        raise ValueError(
+            "Couldn't decode the audio file - maybe it's corrupted or in an uncommon codec."
+        ) from exc
+    speech = torch.tensor(speech).float()
+    if speech.ndim == 2:  # stereo to mono
+        speech = speech.mean(dim=1)
+    if sr != target_sr:
+        speech = torchaudio.functional.resample(
+            speech, orig_freq=sr, new_freq=target_sr
+        )
+    return speech