|
import gradio as gr |
|
import nemo.collections.asr as nemo_asr |
|
import numpy as np |
|
import pandas as pd |
|
from scipy import signal |
|
|
|
# Sample rate (Hz) every clip is resampled to before it is handed to the model.
TARGET_SR = 16_000

# Page title shown by the Gradio interface.
TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"

# Markdown rendered above the playground. A trailing backslash joins a line
# with the next one, so each continued line must end with " \" (space first)
# to keep the words separated in the rendered paragraph.
# NOTE(review): the mailto target below reads "[email protected]", which looks
# like a redacted placeholder rather than a real address — restore the
# intended contact e-mail before publishing.
DESCRIPTION = """## Description

NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for: accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.

### License

The license is commercial-friendly:

> GOVERNING TERMS: Use of this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).

### Contact

Need help adding transcription to your system? [Let's talk!](mailto:[email protected]). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.

### Links of Interest
* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)

## Playground

"""

# Process-wide cache for the ASR model; stays None until the first load.
_model = None
|
|
|
|
|
def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Normalise PCM samples to float32 within [-1, 1].

    Integer input is scaled using the limits ``np.iinfo`` reports for its
    dtype, so no per-width constants are needed: signed samples are divided
    by their largest representable magnitude, while unsigned samples are
    first re-centred on the midpoint of their range. Input that is not an
    integer type is only cast to float32, without rescaling.
    """
    is_integer_pcm = np.issubdtype(x.dtype, np.integer)
    if not is_integer_pcm:
        # copy=False: an array that is already float32 is returned as-is.
        return x.astype(np.float32, copy=False)

    limits = np.iinfo(x.dtype)
    samples = x.astype(np.float32)

    if limits.min >= 0:
        # Unsigned PCM: shift the midpoint of the range to zero, then scale
        # by that same midpoint so the minimum maps to -1.
        centre = (limits.max + 1) / 2
        return (samples - centre) / centre

    # Signed PCM: divide by the larger of |min| and max so the most
    # negative value maps exactly to -1.
    samples /= max(abs(limits.min), limits.max)
    return samples
|
|
|
|
|
def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    """
    Resample ``audio`` from ``rate`` to ``target_rate`` along axis 0.

    When both rates already match, the input is returned untouched (no
    dtype conversion). Otherwise the samples are converted with
    ``_to_float32`` and passed through a polyphase filter whose up/down
    factors are the two rates reduced by their greatest common divisor.
    """
    if rate != target_rate:
        divisor = np.gcd(rate, target_rate)
        return signal.resample_poly(
            _to_float32(audio),
            up=target_rate // divisor,
            down=rate // divisor,
            axis=0,
        )

    return audio
|
|
|
|
|
def _load_model():
    """
    Return the shared ASR model, loading it on first use.

    The model is fetched with ``ASRModel.from_pretrained`` the first time
    this function runs and stored in the module-level ``_model`` so later
    calls reuse the same instance.
    """
    global _model

    # Compare against None explicitly: a truthiness test would reload the
    # model whenever the cached object happens to evaluate as falsy.
    if _model is None:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    return _model
|
|
|
|
|
def _to_pandas(prediction, keyword):
    """
    Build a timestamp table for one granularity of a prediction.

    Args:
        prediction: Object whose ``timestamp`` mapping holds, under
            ``keyword``, the records to tabulate.
        keyword: Key to read from ``prediction.timestamp``; it is also the
            name of the first column kept.

    Returns:
        A DataFrame with the columns ``[keyword, "start", "end"]``. When
        there are no records the frame is empty but still has those
        columns.
    """
    columns = [keyword, "start", "end"]
    frame = pd.DataFrame(prediction.timestamp[keyword])

    if frame.empty:
        # Without records the frame has no columns at all, so selecting
        # them would raise KeyError; return an empty table instead.
        return pd.DataFrame(columns=columns)

    return frame[columns]
|
|
|
|
|
def _invoke_model(model, audio: np.ndarray):
    """
    Transcribe ``audio`` with timestamps and unpack the first result.

    Calls ``model.transcribe`` with ``timestamps=True`` and uses only the
    first returned prediction.

    Returns:
        A 4-tuple ``(text, chars, words, segments)``: the prediction's text
        followed by the character-, word- and segment-level tables built by
        ``_to_pandas``.
    """
    predictions = model.transcribe(audio=audio, timestamps=True)
    first = predictions[0]

    transcript = first.text
    char_table, word_table, segment_table = (
        _to_pandas(first, level) for level in ("char", "word", "segment")
    )

    return transcript, char_table, word_table, segment_table
|
|
|
|
|
def transcribe(audio: tuple[np.ndarray, int] | None): |
|
if not audio: |
|
return "No audio received. Please upload or record something" |
|
|
|
rate, data = audio |
|
|
|
model = _load_model() |
|
data = _to_float32(data) |
|
data = _resample(data, rate, TARGET_SR) |
|
text, chars, words, segments = _invoke_model(model, data) |
|
|
|
return text, segments, words, chars |
|
|
|
|
|
# Input widget: accepts an uploaded file or a microphone recording and hands
# it to the callback in Gradio's "numpy" audio format.
_audio_input = gr.Audio(
    label="Upload or record audio",
    type="numpy",
    sources=["upload", "microphone"],
)

# Output widgets, in the order `transcribe` returns its values: the plain
# transcription first, then one timestamp table per granularity.
_outputs = [gr.Textbox(label="Transcription", show_copy_button=True)]
_outputs += [
    gr.Dataframe(label=table_label, headers=[unit, "Start", "End"])
    for table_label, unit in (
        ("Segments", "Segment"),
        ("Words", "Word"),
        ("Characters", "Character"),
    )
]

app = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs=_outputs,
    title=TITLE,
    description=DESCRIPTION,
)

# Launch the demo only when this file is run as a script.
if __name__ == "__main__":
    app.launch()
|
|