File size: 2,735 Bytes
6a91da6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import time

import torch
import gradio as gr
from transformers.pipelines import pipeline
import utils

from config import (
    MODEL_PATHS,
    SUPPORTED_LANGUAGES,
    CUSTOM_CSS,
)

# set language here: available are en, de and lb
LANGUAGE = "lb"
if LANGUAGE not in SUPPORTED_LANGUAGES:
    # raise SystemExit instead of the interactive-only `exit()` helper
    # (injected by the site module; not guaranteed in all run contexts).
    raise SystemExit(
        f"language ({LANGUAGE}) not supported. Use one of {SUPPORTED_LANGUAGES}"
    )
# Resolve the checkpoint path for the selected language.
MODEL_PATH = MODEL_PATHS[LANGUAGE]


# Build the ASR pipeline once at import time (downloads/loads the model).
# Long audio is split into 30 s chunks with (4 s, 2 s) overlap strides so
# chunk boundaries don't cut words; chunks are decoded in batches of 8.
wave2vec_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_PATH,
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
    chunk_length_s=30,
    stride_length_s=(4, 2),
    batch_size=8,
)


def transcribe_gradio(audio_path: str | None) -> str:
    """
    Transcribe an uploaded or recorded audio file and report inference time.

    Args:
        audio_path: Local filesystem path to the audio file provided by Gradio;
                    None or empty if the user hasn't recorded/uploaded anything.

    Returns:
        A string containing either:
          - A warning if no file was provided,
          - An error message if loading/decoding failed,
          - Or the transcript followed by the elapsed inference time.
    """
    if not audio_path:
        return "⚠️  Please record something or choose a file first."

    start = time.time()
    try:
        transcript = utils.transcribe_file(audio_path, wave2vec_pipeline)
    except ValueError as err:
        return f"❌ {err}"
    runtime = time.time() - start
    return f"{transcript}\n\nβŒ› Inference time: {runtime:.2f} s"


# gradio interface — declarative UI: component creation order defines layout.
with gr.Blocks(title="Wave2Vec (Luxembourgish) ", theme="soft", css=CUSTOM_CSS) as demo:
    gr.Markdown("""
    # πŸŽ™οΈ Speech-to-Text Demo β€” Wave2Vec (Luxembourgish) 
    Use **Record** to capture speech live or **Upload** to select an audio file (.wav, .mp3, .flac).  
    Hit **Transcribe** to convert your recording into text, and **Clear** to reset both fields.
    """)

    # Left: audio source (mic or file); right: read-only transcript box.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            # "filepath" makes Gradio hand transcribe_gradio a local path string.
            type="filepath",
            label="Input audio",
            autoplay=False,
        )
        output_text = gr.Textbox(
            label="Transcript",
            placeholder="Your transcript will appear here …",
            show_copy_button=True,
            lines=10,
        )

    # Button row; ClearButton resets both the audio input and the transcript.
    with gr.Row(equal_height=True, elem_classes="centered-row") as row:
        transcribe_btn = gr.Button("Transcribe ✨", scale=0)
        clear_btn = gr.ClearButton(
            [audio_input, output_text], scale=0, elem_classes="clear-btn"
        )

    # Wire the button to the transcription callback.
    transcribe_btn.click(transcribe_gradio, inputs=audio_input, outputs=output_text)


# Launch the local Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()