Spaces:
Sleeping
Sleeping
Initial Commit
Browse files
app.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import gradio as gr
|
5 |
+
from transformers.pipelines import pipeline
|
6 |
+
import utils
|
7 |
+
|
8 |
+
from config import (
|
9 |
+
MODEL_PATHS,
|
10 |
+
SUPPORTED_LANGUAGES,
|
11 |
+
CUSTOM_CSS,
|
12 |
+
)
|
13 |
+
|
14 |
+
# Select the transcription language here. Valid codes are the members of
# SUPPORTED_LANGUAGES (en, de and lb).
LANGUAGE = "lb"
if LANGUAGE not in SUPPORTED_LANGUAGES:
    # Abort start-up with a non-zero exit status and the message on stderr.
    # SystemExit is raised directly because the bare `exit()` helper is
    # injected by the `site` module (so it is not guaranteed to exist) and,
    # called without an argument, it reports success to the calling process.
    raise SystemExit(
        f"language ({LANGUAGE}) not supported. Use one of {SUPPORTED_LANGUAGES}"
    )
MODEL_PATH = MODEL_PATHS[LANGUAGE]


# Module-level ASR pipeline, built once at import time and reused by
# transcribe_gradio for every request.
wave2vec_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_PATH,
    # Use the first CUDA device when one is available, otherwise the CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
    # Chunking/stride/batch settings are passed through to the transformers
    # pipeline unchanged; see its documentation for their exact semantics.
    chunk_length_s=30,
    stride_length_s=(4, 2),
    batch_size=8,
)
|
31 |
+
|
32 |
+
|
33 |
+
def transcribe_gradio(audio_path: str | None) -> str:
|
34 |
+
"""
|
35 |
+
Transcribe an uploaded or recorded audio file and report inference time.
|
36 |
+
|
37 |
+
Args:
|
38 |
+
audio_path: Local filesystem path to the audio file provided by Gradio;
|
39 |
+
None or empty if the user hasn't recorded/uploaded anything.
|
40 |
+
|
41 |
+
Returns:
|
42 |
+
A string containing either:
|
43 |
+
- A warning if no file was provided,
|
44 |
+
- An error message if loading/decoding failed,
|
45 |
+
- Or the transcript followed by the elapsed inference time.
|
46 |
+
"""
|
47 |
+
if not audio_path:
|
48 |
+
return "⚠️ Please record something or choose a file first."
|
49 |
+
|
50 |
+
start = time.time()
|
51 |
+
try:
|
52 |
+
transcript = utils.transcribe_file(audio_path, wave2vec_pipeline)
|
53 |
+
except ValueError as err:
|
54 |
+
return f"❌ {err}"
|
55 |
+
runtime = time.time() - start
|
56 |
+
return f"{transcript}\n\n⌛ Inference time: {runtime:.2f} s"
|
57 |
+
|
58 |
+
|
59 |
+
# Gradio interface: an audio input next to a transcript box, with a row of
# action buttons underneath.

# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# 🎙️ Speech-to-Text Demo — Wave2Vec (Luxembourgish)
Use **Record** to capture speech live or **Upload** to select an audio file (.wav, .mp3, .flac).
Hit **Transcribe** to convert your recording into text, and **Clear** to reset both fields.
"""

with gr.Blocks(title="Wave2Vec (Luxembourgish) ", theme="soft", css=CUSTOM_CSS) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Input and output side by side.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",  # the handler receives a path string, not raw samples
            label="Input audio",
            autoplay=False,
        )
        output_text = gr.Textbox(
            label="Transcript",
            placeholder="Your transcript will appear here …",
            show_copy_button=True,
            lines=10,
        )

    # Action buttons; the CSS classes are defined in CUSTOM_CSS.
    with gr.Row(equal_height=True, elem_classes="centered-row") as row:
        transcribe_btn = gr.Button("Transcribe ✨", scale=0)
        clear_btn = gr.ClearButton(
            [audio_input, output_text], scale=0, elem_classes="clear-btn"
        )

    # Clicking "Transcribe" sends the audio file path to transcribe_gradio and
    # shows the returned string in the transcript box.
    transcribe_btn.click(transcribe_gradio, inputs=audio_input, outputs=output_text)


if __name__ == "__main__":
    demo.launch()
|
config.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Final
|
2 |
+
|
3 |
+
|
4 |
+
# Sample rate in Hz used as the default resampling target for loaded audio.
TARGET_SR: Final[int] = 16_000

# Hugging Face model identifier for each supported language code.
MODEL_PATHS: Final[dict[str, str]] = {
    "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german",
    "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
    "lb": "Lemswasabi/wav2vec2-large-xlsr-53-842h-luxembourgish-14h",
}

# Language codes the application accepts. Derived from MODEL_PATHS so the two
# can never drift apart when a model is added or removed.
SUPPORTED_LANGUAGES: Final[set[str]] = set(MODEL_PATHS)

# Lower-case file extensions (including the leading dot) accepted for upload.
SUPPORTED_EXTS: Final[set[str]] = {".wav", ".flac", ".mp3", ".ogg", ".m4a"}

# Extra CSS for the Gradio UI: centres the button row and tints the clear button.
CUSTOM_CSS: Final[str] = """
.centered-row {
display: flex;
justify-content: center;
}
.clear-btn {
background-color: #ffcccc !important;
}
"""
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accelerate>=1.7.0
|
2 |
+
gradio>=5.34.0
|
3 |
+
kenlm>=0.3.0
|
4 |
+
protobuf>=6.31.1
|
5 |
+
pyctcdecode>=0.5.0
|
6 |
+
soundfile>=0.13.1
|
7 |
+
torch>=2.7.1
|
8 |
+
torchaudio>=2.7.1
|
9 |
+
transformers>=4.52.4
|
utils.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torchaudio
|
3 |
+
from pathlib import Path
|
4 |
+
import soundfile as sf
|
5 |
+
from typing import Any
|
6 |
+
|
7 |
+
|
8 |
+
from config import TARGET_SR, SUPPORTED_EXTS
|
9 |
+
|
10 |
+
|
11 |
+
def transcribe_file(path: str | Path, pipe: Any) -> str:
    """
    Run speech recognition on a single audio file.

    Args:
        path: Location of the audio file, given as a string or Path.
        pipe: The ASR pipeline to use, e.g. a Hugging Face transformers
              automatic-speech-recognition pipeline. It is called with a
              numpy array of samples and must return a mapping that has a
              'text' entry.

    Returns:
        The value stored under 'text' in the pipeline's result.

    Raises:
        ValueError: Propagated from load_resample when the file cannot be
                    loaded.
    """
    waveform = load_resample(path)
    samples = waveform.numpy()
    result = pipe(samples)
    return result["text"]  # type: ignore[index]
|
29 |
+
|
30 |
+
|
31 |
+
def load_resample(path: str | Path, target_sr: int = TARGET_SR) -> torch.Tensor:
    """
    Load an audio file and resample it to the target sample rate, returning
    a mono torch.Tensor.

    Args:
        path: Path or string pointing to an audio file.
        target_sr: Desired sample rate (in Hz). Defaults to TARGET_SR from config.

    Returns:
        A 1-D torch.Tensor of dtype float32 sampled at target_sr.

    Raises:
        ValueError: If the file extension is not in SUPPORTED_EXTS.
        ValueError: If the audio file cannot be decoded.
        ValueError: If the decoded audio contains no samples.
    """
    ext = Path(path).suffix.lower()
    if ext not in SUPPORTED_EXTS:
        raise ValueError(
            f"Unsupported file-type “{ext or 'unknown'}”. Please upload WAV, FLAC, MP3, OGG/Opus or M4A."
        )

    try:
        # Decode straight to float32 so no float64 intermediate array has to
        # be allocated and then converted.
        samples, sr = sf.read(str(path), dtype="float32")
    except RuntimeError as exc:
        # Re-raise as ValueError so callers only need to handle one exception
        # type for every "bad input file" condition.
        raise ValueError(
            "Couldn't decode the audio file - maybe it's corrupted or in an uncommon codec."
        ) from exc

    speech = torch.from_numpy(samples)
    if speech.ndim == 2:  # multi-channel (frames, channels): average down to mono
        speech = speech.mean(dim=1)
    if speech.numel() == 0:
        # Fail early with a clear message instead of handing an empty tensor
        # to the resampler / ASR pipeline.
        raise ValueError("The audio file doesn't contain any audio samples.")
    if sr != target_sr:
        speech = torchaudio.functional.resample(
            speech, orig_freq=sr, new_freq=target_sr
        )
    return speech
|