Spaces:
Sleeping
Sleeping
import torch | |
import torchaudio | |
from pathlib import Path | |
import soundfile as sf | |
from typing import Any | |
from config import TARGET_SR, SUPPORTED_EXTS | |
def transcribe_file(path: str | Path, pipe: Any) -> str:
    """
    Run speech recognition on a single audio file and return its text.

    Args:
        path: Path or string pointing to an audio file.
        pipe: The ASR pipeline to call (e.g. a Hugging Face transformers
            automatic-speech-recognition pipeline). It is invoked with the
            waveform as a numpy array and its result is indexed with the
            key 'text'.

    Returns:
        The value stored under 'text' in the pipeline's result.

    Raises:
        ValueError: Propagated from load_resample when the file type is
            unsupported or the audio cannot be decoded.
    """
    # Loading, mono down-mix and resampling are all delegated to load_resample.
    waveform = load_resample(path)
    samples = waveform.numpy()
    result = pipe(samples)
    return result["text"]  # type: ignore[index]
def load_resample(path: str | Path, target_sr: int = TARGET_SR) -> torch.Tensor:
    """
    Load an audio file, down-mix it to mono and resample it to target_sr.

    Args:
        path: Path or string pointing to an audio file.
        target_sr: Desired sample rate (in Hz). Defaults to TARGET_SR from config.

    Returns:
        A 1-D torch.Tensor of dtype float32 sampled at target_sr.

    Raises:
        ValueError: If the file extension is not in SUPPORTED_EXTS.
        ValueError: If soundfile raises a RuntimeError while reading the file
            (e.g. corrupted data or a codec it cannot decode).
    """
    ext = Path(path).suffix.lower()
    if ext not in SUPPORTED_EXTS:
        shown = ext or "unknown"
        raise ValueError(
            f"Unsupported file-type '{shown}'. "
            "Please upload WAV, FLAC, MP3, OGG/Opus or M4A."
        )

    try:
        # Decode straight to float32: avoids materialising a float64 array
        # that would then be copied and down-cast.
        samples, sr = sf.read(str(path), dtype="float32")
    except RuntimeError as exc:
        raise ValueError(
            "Couldn't decode the audio file - maybe it's corrupted or in an uncommon codec."
        ) from exc

    # Zero-copy wrap; the numpy array is local, so sharing memory is safe.
    speech = torch.from_numpy(samples)

    if speech.ndim == 2:
        # Multi-channel audio arrives as (frames, channels); average the
        # channels to obtain a mono signal.
        speech = speech.mean(dim=1)

    if sr != target_sr:
        speech = torchaudio.functional.resample(
            speech, orig_freq=sr, new_freq=target_sr
        )
    return speech