Spaces:

ridgerun-ai
/

parakeet-tdt-0.6b-v2

Running

File size: 3,211 Bytes

import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
from scipy import signal

# Sample rate (Hz) that incoming audio is resampled to before transcription.
TARGET_SR = 16_000
# Heading shown at the top of the Gradio page.
TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"
# Markdown rendered under the title. A trailing backslash joins the next
# source line WITHOUT inserting a newline or a space, so a space must precede
# the backslash wherever two words would otherwise run together.
# NOTE(review): the mailto target below looks like an obfuscation placeholder
# rather than a real address - confirm the intended contact e-mail.
DESCRIPTION = """## Description

NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for: accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.

### License

The license is commercial friendly:

> GOVERNING TERMS: Use of this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).

### Contact

Need help adding transcription to your system? [Let's talk!](mailto:[email protected]). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.

### Links of Interest
* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)

## Playground

"""

# Cached ASR model instance; stays None until _invoke_model loads it.
_model = None


def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Return *x* as a float32 array, rescaling integer PCM into [-1, 1].

    Integer input is scaled using the limits ``np.iinfo`` reports for its
    dtype, so one code path serves every signed and unsigned integer width.
    Non-integer input is only cast to float32; its values are not rescaled.
    """
    is_integer_pcm = np.issubdtype(x.dtype, np.integer)
    if not is_integer_pcm:
        # Nothing to rescale; copy=False skips the copy for float32 input.
        return x.astype(np.float32, copy=False)

    limits = np.iinfo(x.dtype)
    samples = x.astype(np.float32)

    if limits.min >= 0:
        # Unsigned PCM is centred on half the range (128.0 for uint8),
        # so 0..255 maps onto -1..1.
        centre = (limits.max + 1) / 2
        return (samples - centre) / centre

    # Signed PCM: divide by the largest magnitude the dtype can hold
    # (32768 for int16), so -32768..32767 maps onto -1..1.
    samples /= max(abs(limits.min), limits.max)
    return samples


def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    """
    Resample *audio* from *rate* to *target_rate* along its first axis.

    When the two rates are equal the input is returned as-is (no copy and no
    dtype conversion). Otherwise the samples are converted with
    ``_to_float32`` and passed through ``scipy.signal.resample_poly``.
    """
    if rate != target_rate:
        # Reduce target_rate/rate to lowest terms to get the polyphase
        # up/down factors.
        common = np.gcd(rate, target_rate)
        return signal.resample_poly(
            _to_float32(audio),
            up=target_rate // common,
            down=rate // common,
            axis=0,
        )

    return audio


def _invoke_model(audio: np.ndarray):
    """
    Transcribe *audio* with the Parakeet model and return the text.

    The model is created with ``ASRModel.from_pretrained`` on the first call
    and cached in the module-level ``_model`` so later calls reuse it.

    Args:
        audio: Samples handed to the model's ``transcribe`` method unchanged.

    Returns:
        The ``text`` attribute of the first item ``transcribe`` returns.
    """
    global _model
    # Test the sentinel with `is None`: "not loaded yet" is the only state
    # that should trigger a load, and the truthiness of a model object is
    # not something to rely on.
    if _model is None:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    return _model.transcribe(audio=audio)[0].text


def transcribe(audio: tuple[np.ndarray, int] | None):
    if not audio:
        return "No audio received. Please upload or record something"

    rate, data = audio

    data = _to_float32(data)
    data = _resample(data, rate, TARGET_SR)
    return _invoke_model(data)


# Input widget: accepts an uploaded file or a microphone recording and passes
# it to `transcribe` in Gradio's "numpy" format.
_audio_input = gr.Audio(
    label="Upload or record audio",
    type="numpy",
    sources=["upload", "microphone"],
)
# Output widget: read-only text box with a copy button for the transcription.
_text_output = gr.Textbox(show_copy_button=True, label="Transcription")

# Wire the widgets and the page text into a single-function interface.
app = gr.Interface(
    title=TITLE,
    description=DESCRIPTION,
    fn=transcribe,
    inputs=_audio_input,
    outputs=_text_output,
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    app.launch()