botarioAcc committed on
Commit
6a91da6
·
verified ·
1 Parent(s): 40e63a3

Initial Commit

Browse files
Files changed (4) hide show
  1. app.py +91 -0
  2. config.py +24 -0
  3. requirements.txt +9 -0
  4. utils.py +67 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import torch
4
+ import gradio as gr
5
+ from transformers.pipelines import pipeline
6
+ import utils
7
+
8
+ from config import (
9
+ MODEL_PATHS,
10
+ SUPPORTED_LANGUAGES,
11
+ CUSTOM_CSS,
12
+ )
13
+
14
# Set the language here: it must be one of the codes in SUPPORTED_LANGUAGES
# (en, de or lb).
LANGUAGE = "lb"
if LANGUAGE not in SUPPORTED_LANGUAGES:
    # SystemExit with a message writes it to stderr and exits with a non-zero
    # status, so a misconfiguration is reported as a failure rather than as a
    # clean exit.
    raise SystemExit(
        f"language ({LANGUAGE}) not supported. Use one of {SUPPORTED_LANGUAGES}"
    )
MODEL_PATH = MODEL_PATHS[LANGUAGE]


# Build the speech-recognition pipeline once at import time so that every
# transcription request reuses the same loaded model.
wave2vec_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_PATH,
    # Use the first CUDA device when one is available, otherwise the CPU.
    device=0 if torch.cuda.is_available() else -1,
    chunk_length_s=30,
    stride_length_s=(4, 2),
    batch_size=8,
)
31
+
32
+
33
+ def transcribe_gradio(audio_path: str | None) -> str:
34
+ """
35
+ Transcribe an uploaded or recorded audio file and report inference time.
36
+
37
+ Args:
38
+ audio_path: Local filesystem path to the audio file provided by Gradio;
39
+ None or empty if the user hasn't recorded/uploaded anything.
40
+
41
+ Returns:
42
+ A string containing either:
43
+ - A warning if no file was provided,
44
+ - An error message if loading/decoding failed,
45
+ - Or the transcript followed by the elapsed inference time.
46
+ """
47
+ if not audio_path:
48
+ return "⚠️ Please record something or choose a file first."
49
+
50
+ start = time.time()
51
+ try:
52
+ transcript = utils.transcribe_file(audio_path, wave2vec_pipeline)
53
+ except ValueError as err:
54
+ return f"❌ {err}"
55
+ runtime = time.time() - start
56
+ return f"{transcript}\n\n⌛ Inference time: {runtime:.2f} s"
57
+
58
+
59
# gradio interface
# Display names for the language codes LANGUAGE may take, so the UI text
# follows the configured language instead of always saying "Luxembourgish".
_LANGUAGE_NAMES = {"de": "German", "en": "English", "lb": "Luxembourgish"}
_language_name = _LANGUAGE_NAMES.get(LANGUAGE, LANGUAGE)

# Introductory text shown at the top of the page.
_INTRO_MARKDOWN = f"""
# 🎙️ Speech-to-Text Demo — Wave2Vec ({_language_name})
Use **Record** to capture speech live or **Upload** to select an audio file (.wav, .mp3, .flac).
Hit **Transcribe** to convert your recording into text, and **Clear** to reset both fields.
"""

with gr.Blocks(
    title=f"Wave2Vec ({_language_name}) ", theme="soft", css=CUSTOM_CSS
) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Audio input on the left, transcript output on the right.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",  # transcribe_gradio expects a path on disk
            label="Input audio",
            autoplay=False,
        )
        output_text = gr.Textbox(
            label="Transcript",
            placeholder="Your transcript will appear here …",
            show_copy_button=True,
            lines=10,
        )

    # Action buttons; "centered-row" and "clear-btn" are styled by CUSTOM_CSS.
    with gr.Row(equal_height=True, elem_classes="centered-row") as row:
        transcribe_btn = gr.Button("Transcribe ✨", scale=0)
        clear_btn = gr.ClearButton(
            [audio_input, output_text], scale=0, elem_classes="clear-btn"
        )

    transcribe_btn.click(transcribe_gradio, inputs=audio_input, outputs=output_text)


if __name__ == "__main__":
    demo.launch()
config.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Final
2
+
3
+
4
# Sample rate in Hz that utils.load_resample resamples audio to by default.
TARGET_SR: Final[int] = 16_000

# Model identifier handed to the transformers pipeline for each language code.
MODEL_PATHS: Final[dict[str, str]] = {
    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german",
    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
    "lb": "Lemswasabi/wav2vec2-large-xlsr-53-842h-luxembourgish-14h",
}

# Derived from MODEL_PATHS so the two can never drift apart: a language is
# supported exactly when a model is configured for it.
SUPPORTED_LANGUAGES: Final[set[str]] = set(MODEL_PATHS)

# Lower-case file extensions (including the dot) accepted for audio input.
SUPPORTED_EXTS: Final[set[str]] = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}

# Extra CSS for the Gradio app: centres the button row and tints the clear button.
CUSTOM_CSS: Final[str] = """
.centered-row {
    display: flex;
    justify-content: center;
}
.clear-btn {
    background-color: #ffcccc !important;
}
"""
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ accelerate>=1.7.0
2
+ gradio>=5.34.0
3
+ kenlm>=0.3.0
4
+ protobuf>=6.31.1
5
+ pyctcdecode>=0.5.0
6
+ soundfile>=0.13.1
7
+ torch>=2.7.1
8
+ torchaudio>=2.7.1
9
+ transformers>=4.52.4
utils.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ from pathlib import Path
4
+ import soundfile as sf
5
+ from typing import Any
6
+
7
+
8
+ from config import TARGET_SR, SUPPORTED_EXTS
9
+
10
+
11
def transcribe_file(path: str | Path, pipe: Any) -> str:
    """
    Run speech recognition on a single audio file.

    The file is loaded and resampled with load_resample, converted to a numpy
    array, and handed to the pipeline.

    Args:
        path: Location of the audio file, as a string or Path.
        pipe: Callable ASR pipeline (e.g. a Hugging Face transformers
            automatic-speech-recognition pipeline). It is called with a numpy
            array and its result is indexed with the key 'text'.

    Returns:
        The value stored under 'text' in the pipeline's result.

    Raises:
        ValueError: Propagated from load_resample when the file type is
            unsupported or the audio cannot be decoded.
    """
    waveform = load_resample(path)
    result = pipe(waveform.numpy())
    return result["text"]  # type: ignore[index]
29
+
30
+
31
def load_resample(path: str | Path, target_sr: int = TARGET_SR) -> torch.Tensor:
    """
    Load an audio file and resample it to the target sample rate, returning
    a mono torch.Tensor.

    Args:
        path: Path or string pointing to an audio file.
        target_sr: Desired sample rate (in Hz). Defaults to TARGET_SR from config.

    Returns:
        A 1-D torch.Tensor of dtype float32 sampled at target_sr.

    Raises:
        ValueError: If the file extension is not in SUPPORTED_EXTS.
        ValueError: If the audio file cannot be decoded.
        ValueError: If the decoded audio contains no samples.
    """
    ext = Path(path).suffix.lower()
    if ext not in SUPPORTED_EXTS:
        raise ValueError(
            f"Unsupported file-type “{ext or 'unknown'}”. Please upload WAV, FLAC, MP3, OGG/Opus or M4A."
        )

    try:
        # Decode straight to float32: avoids materialising a float64 array
        # that would immediately be copied and down-cast.
        samples, sr = sf.read(str(path), dtype="float32")
    except RuntimeError as exc:
        raise ValueError(
            "Couldn't decode the audio file - maybe it's corrupted or in an uncommon codec."
        ) from exc

    # An empty recording cannot be transcribed; fail with the same exception
    # type callers already handle instead of erroring later in the model.
    if samples.size == 0:
        raise ValueError("The audio file contains no samples.")

    speech = torch.from_numpy(samples)
    if speech.ndim == 2:  # (frames, channels): average channels to get mono
        speech = speech.mean(dim=1)
    if sr != target_sr:
        speech = torchaudio.functional.resample(
            speech, orig_freq=sr, new_freq=target_sr
        )
    return speech