import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import pandas as pd
from scipy import signal

TARGET_SR = 16_000  # Hz; the model expects 16 kHz input

TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"
DESCRIPTION = """## Description

NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.

### License

The license is commercial-friendly:

> GOVERNING TERMS: Use of this model is governed by the \
[CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).

### Contact

Need help adding transcription to your system? \
[Let's talk!](mailto:support@ridgerun.ai). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.

### Links of Interest

* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)

## Playground
"""

# Lazily-initialized ASR model, cached across requests.
_model = None


def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Convert any integer PCM array to float32 in the range [-1, 1].

    Works for signed (int8/16/24/32) and unsigned (uint8) types,
    without hard-coded magic numbers.
    """
    if not np.issubdtype(x.dtype, np.integer):
        # Already float: just ensure dtype is float32
        return x.astype(np.float32, copy=False)

    info = np.iinfo(x.dtype)
    x = x.astype(np.float32)

    if info.min < 0:
        # Signed PCM (e.g. int16, int32): -32768..32767 -> -1..1
        x /= max(abs(info.min), info.max)
    else:
        # Unsigned PCM (e.g. uint8): shift to the midpoint, then scale.
        midpoint = (info.max + 1) / 2  # 128.0 for uint8
        # 0..255 -> -1..1
        x = (x - midpoint) / midpoint

    return x


def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    if rate == target_rate:
        return audio

    # Use polyphase filtering for efficient, high-quality resampling.
    gcd = np.gcd(rate, target_rate)
    up = target_rate // gcd
    down = rate // gcd
    resampled = signal.resample_poly(
        _to_float32(audio), up=up, down=down, axis=0
    )

    return resampled


def _load_model():
    """Load the Parakeet model on first use and cache it globally."""
    global _model

    if not _model:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    return _model


def _to_pandas(prediction, keyword):
    # Build a DataFrame with the requested timestamp granularity
    # ("char", "word" or "segment") plus its start/end times.
    return pd.DataFrame(prediction.timestamp[keyword])[
        [keyword, "start", "end"]
    ]


def _invoke_model(model, audio: np.ndarray):
    # Transcribe with timestamps enabled so char/word/segment times are available.
    prediction = model.transcribe(audio=audio, timestamps=True)[0]

    text = prediction.text
    chars = _to_pandas(prediction, "char")
    words = _to_pandas(prediction, "word")
    segments = _to_pandas(prediction, "segment")

    return text, chars, words, segments


def transcribe(audio: tuple[int, np.ndarray] | None):
    # Gradio delivers numpy audio as a (sample_rate, samples) tuple.
    if not audio:
        # Keep the output arity: message plus three empty tables.
        return "No audio received. Please upload or record something.", None, None, None

    rate, data = audio

    model = _load_model()

    data = _to_float32(data)
    data = _resample(data, rate, TARGET_SR)

    text, chars, words, segments = _invoke_model(model, data)

    return text, segments, words, chars


app = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="numpy",
        label="Upload or record audio",
    ),
    outputs=[
        gr.Textbox(label="Transcription", show_copy_button=True),
        gr.Dataframe(
            label="Segments",
            headers=["Segment", "Start", "End"],
        ),
        gr.Dataframe(
            label="Words",
            headers=["Word", "Start", "End"],
        ),
        gr.Dataframe(
            label="Characters",
            headers=["Character", "Start", "End"],
        ),
    ],
    title=TITLE,
    description=DESCRIPTION,
)

if __name__ == "__main__":
    app.launch()
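
# Optional: the running demo can also be queried programmatically. The commented
# sketch below is not part of the app; it assumes a recent `gradio_client`
# release (which provides `handle_file`), the default local address, the
# default gr.Interface endpoint name "/predict", and a hypothetical local
# `sample.wav` file.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   text, segments, words, chars = client.predict(
#       handle_file("sample.wav"),  # any local audio file
#       api_name="/predict",
#   )
#   print(text)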