# mgruner's picture
# Add support for timestamps as well
# c913b1a
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import pandas as pd
from scipy import signal
# Sample rate (in Hz) that incoming audio is resampled to before it is
# handed to the model.
TARGET_SR = 16_000  # Hz

# Title shown at the top of the demo page.
TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"

# Markdown rendered below the title. A trailing backslash joins a line with
# the next one, so each continued line must end with a space to keep the
# words (and sentences) separated in the rendered text.
# NOTE(review): the mailto target below reads "[email protected]", which
# looks like an obfuscation placeholder rather than a real address - confirm.
DESCRIPTION = """## Description
NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for: accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.
### License
The license is commercial friendly:
> GOVERNING TERMS: Use of this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).
### Contact
Need help adding transcription to your system? [Let's talk!](mailto:[email protected]). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.
### Links of Interest
* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMO](https://github.com/NVIDIA/NeMo)
## Playground
"""

# Process-wide cache for the ASR model; stays None until _load_model()
# creates the model on first use.
_model = None
def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Return *x* as float32 samples.

    Integer PCM input is rescaled to roughly [-1, 1] using the limits of
    its own dtype: signed types are divided by their largest magnitude,
    unsigned types are re-centred around the middle of their range first.
    Non-integer input is only cast to float32 (no rescaling, and no copy
    when it already is float32).
    """
    dtype = x.dtype
    if np.issubdtype(dtype, np.integer):
        limits = np.iinfo(dtype)
        samples = x.astype(np.float32)

        if limits.min >= 0:
            # Unsigned PCM, e.g. uint8: 0..255 -> -1..1 around 128.0
            half_range = (limits.max + 1) / 2
            return (samples - half_range) / half_range

        # Signed PCM, e.g. int16: -32768..32767 -> -1..1
        samples /= max(abs(limits.min), limits.max)
        return samples

    # Floating-point (or other non-integer) data: just normalise the dtype.
    return x.astype(np.float32, copy=False)
def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    """
    Resample *audio* from *rate* to *target_rate* along axis 0.

    When both rates are equal the input array is returned untouched.
    Otherwise the samples are converted with ``_to_float32`` and passed
    through SciPy's polyphase resampler using the reduced up/down ratio.
    """
    if rate != target_rate:
        # Reduce target_rate/rate to lowest terms for the polyphase filter.
        divisor = np.gcd(rate, target_rate)
        return signal.resample_poly(
            _to_float32(audio),
            up=target_rate // divisor,
            down=rate // divisor,
            axis=0,
        )
    return audio
def _load_model():
    """
    Return the shared ASR model, creating it on the first call.

    The instance is stored in the module-level ``_model`` so later calls
    reuse it instead of calling ``ASRModel.from_pretrained`` again.
    """
    global _model
    # Test for None explicitly: a truthiness check would consult the
    # object's __bool__/__len__, and a cached object that happens to
    # evaluate falsy would be reloaded on every call.
    if _model is None:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )
    return _model
def _to_pandas(prediction, keyword):
    """
    Build a table of timestamps for one granularity of a prediction.

    Args:
        prediction: Object whose ``timestamp`` mapping holds, under
            *keyword*, a sequence of records with at least the fields
            *keyword*, ``"start"`` and ``"end"``.
        keyword: Granularity to extract; also the name of the text column.

    Returns:
        A ``pandas.DataFrame`` with the columns ``[keyword, "start", "end"]``
        in that order. It has zero rows when there are no records.
    """
    wanted = [keyword, "start", "end"]
    # Selecting via `columns=` (instead of indexing the finished frame)
    # keeps the three headers even for an empty record list, where a
    # post-hoc column selection would raise KeyError.
    return pd.DataFrame(prediction.timestamp[keyword], columns=wanted)
def _invoke_model(model, audio: np.ndarray):
    """
    Run *model* on *audio* with timestamps enabled.

    Only the first entry of the model's result is used. Returns a tuple
    ``(text, chars, words, segments)`` where the last three are the
    timestamp tables produced by ``_to_pandas``.
    """
    results = model.transcribe(audio=audio, timestamps=True)
    first = results[0]
    transcript = first.text
    # Same conversion for every granularity, finest to coarsest.
    char_table, word_table, segment_table = (
        _to_pandas(first, level) for level in ("char", "word", "segment")
    )
    return transcript, char_table, word_table, segment_table
def transcribe(audio: tuple[np.ndarray, int] | None):
if not audio:
return "No audio received. Please upload or record something"
rate, data = audio
model = _load_model()
data = _to_float32(data)
data = _resample(data, rate, TARGET_SR)
text, chars, words, segments = _invoke_model(model, data)
return text, segments, words, chars
# Web UI: a single audio input (file upload or microphone) wired to
# `transcribe`, whose four return values fill the transcription text box
# and the segment / word / character timestamp tables, in that order.
app = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        label="Upload or record audio",
        sources=["upload", "microphone"],
        type="numpy",
    ),
    outputs=[
        gr.Textbox(label="Transcription", show_copy_button=True),
        # The three tables differ only in their label and first header.
        *(
            gr.Dataframe(label=label, headers=[unit, "Start", "End"])
            for label, unit in (
                ("Segments", "Segment"),
                ("Words", "Word"),
                ("Characters", "Character"),
            )
        ),
    ],
    title=TITLE,
    description=DESCRIPTION,
)


# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    app.launch()