File size: 3,211 Bytes
4704268
fa1dce6
4704268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa1dce6
4704268
 
fa1dce6
4704268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa1dce6
 
4704268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9faef73
4704268
 
 
 
 
 
 
 
 
 
 
9faef73
 
4704268
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
from scipy import signal

# Sample rate (Hz) that audio is resampled to before it is handed to the model.
TARGET_SR = 16_000
# Heading shown at the top of the Gradio page.
TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"
# Markdown body rendered below the title. A trailing backslash joins a line
# with the next one, so every continued line must end with a space to keep
# the words apart.
# NOTE(review): the mailto target below ("[email protected]") looks like
# residue of e-mail obfuscation; restore the real address before publishing.
DESCRIPTION = """## Description

NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for: accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.

### License

The license is commercial-friendly:

> GOVERNING TERMS: Use of this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).

### Contact

Need help adding transcription to your system? [Let's talk!](mailto:[email protected]). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.

### Links of Interest
* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)

## Playground

"""

# Cached ASR model instance; stays None until `_invoke_model` loads it.
_model = None


def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Return *x* as float32 samples.

    Integer PCM input is rescaled into [-1, 1] using the limits that
    ``np.iinfo`` reports for its dtype, so no per-format constants are
    needed. Any non-integer input is only cast to float32; its values are
    left as they are.
    """
    if np.issubdtype(x.dtype, np.integer):
        limits = np.iinfo(x.dtype)
        samples = x.astype(np.float32)

        if limits.min >= 0:
            # Unsigned PCM is centred on half the range (128.0 for uint8):
            # shift to zero, then scale, e.g. 0..255 -> -1..1.
            half_range = (limits.max + 1) / 2
            return (samples - half_range) / half_range

        # Signed PCM: divide by the largest magnitude the dtype can hold,
        # e.g. -32768..32767 -> -1..1 for int16.
        samples /= max(abs(limits.min), limits.max)
        return samples

    # Not an integer dtype: just make sure the result is float32.
    return x.astype(np.float32, copy=False)


def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    """
    Resample *audio* from *rate* to *target_rate* along axis 0.

    When the two rates already match, the input array is returned as-is.
    Otherwise the samples are converted with ``_to_float32`` and passed
    through SciPy's polyphase resampler.
    """
    if rate != target_rate:
        # Reduce the rate ratio to lowest terms to get the up/down factors.
        divisor = np.gcd(rate, target_rate)
        return signal.resample_poly(
            _to_float32(audio),
            up=target_rate // divisor,
            down=rate // divisor,
            axis=0,
        )

    return audio


def _invoke_model(audio: np.ndarray):
    """
    Transcribe *audio* with the module-wide ASR model, loading it on first use.

    The model is fetched with ``ASRModel.from_pretrained`` the first time
    this function runs and kept in the module-level ``_model`` so that later
    calls reuse it.

    Args:
        audio: Samples handed unchanged to the model's ``transcribe`` call.
            The caller in this file converts them to float32 and resamples
            them to ``TARGET_SR`` beforehand.

    Returns:
        The ``text`` attribute of the first result returned by ``transcribe``.
    """
    global _model
    # Compare against None explicitly: an object's truthiness is not a
    # reliable "already loaded" signal and could trigger a needless reload.
    if _model is None:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    return _model.transcribe(audio=audio)[0].text


def transcribe(audio: tuple[np.ndarray, int] | None):
    if not audio:
        return "No audio received. Please upload or record something"

    rate, data = audio

    data = _to_float32(data)
    data = _resample(data, rate, TARGET_SR)
    return _invoke_model(data)


# Input widget: accepts an uploaded file or a microphone recording and is
# configured with type="numpy".
_audio_input = gr.Audio(
    label="Upload or record audio",
    type="numpy",
    sources=["upload", "microphone"],
)

# Output widget: text box for the transcription, with a copy button.
_transcription_output = gr.Textbox(
    show_copy_button=True,
    label="Transcription",
)

# Wire the callback and both widgets into a single-page interface.
app = gr.Interface(
    title=TITLE,
    description=DESCRIPTION,
    fn=transcribe,
    inputs=_audio_input,
    outputs=_transcription_output,
)

# Only start the web server when this file is executed as a script.
if __name__ == "__main__":
    app.launch()