Spaces:

ridgerun-ai
/

parakeet-tdt-0.6b-v2

Running

App Files Files Community

parakeet-tdt-0.6b-v2 / run.py

mgruner

Add support for timestamps as well

c913b1a about 2 months ago

raw

history blame contribute delete

4.08 kB

	import gradio as gr
	import nemo.collections.asr as nemo_asr
	import numpy as np
	import pandas as pd
	from scipy import signal

	# Sample rate that audio is resampled to before it is handed to the model.
	TARGET_SR = 16_000  # Hz
	TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"

	# Markdown rendered above the demo widgets. A trailing backslash joins a
	# line with the next one, so every continued line must end in a space to
	# keep the words apart.
	# NOTE(review): the mailto address in the Contact section looks like an
	# obfuscation placeholder in this copy of the file; confirm the real
	# address before publishing.
	DESCRIPTION = """## Description

	NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
	speech recognition (ASR) model designed for high-quality English \
	transcription, featuring support for punctuation, capitalization, \
	and accurate timestamp prediction. This is a state-of-the-art model \
	ideal for: accurate word-level timestamp predictions, automatic \
	punctuation and capitalization, robust performance on spoken numbers, \
	and song lyrics transcription.

	### License

	The license is commercial friendly:

	> GOVERNING TERMS: Use of this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).

	### Contact

	Need help adding transcription to your system? [Let's talk!](mailto:[email protected]). \
	At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.

	### Links of Interest
	* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
	* [NVIDIA NeMO](https://github.com/NVIDIA/NeMo)

	## Playground

	"""

	# Process-wide model instance; stays None until _load_model() fills it in.
	_model = None


	def _to_float32(x: np.ndarray) -> np.ndarray:
	"""
	Convert any integer PCM array to float32 in the range [-1, 1].
	Works for signed (int8/16/24/32) and unsigned (uint8) types,
	without hard‑coded magic numbers.
	"""
	if not np.issubdtype(x.dtype, np.integer):
	# Already float – just ensure dtype is float32
	return x.astype(np.float32, copy=False)

	info = np.iinfo(x.dtype)
	x = x.astype(np.float32)

	# signed PCM (e.g. int16, int32)
	if info.min < 0:
	# ‑32768..32767 -> ‑1..1
	x /= max(abs(info.min), info.max)

	# unsigned PCM (e.g. uint8 0..255)
	else:
	# 128.0 for uint8
	midpoint = (info.max + 1) / 2
	# 0..255 -> ‑1..1
	x = (x - midpoint) / midpoint

	return x


	def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
	if rate == target_rate:
	return audio

	# Use polyphase filtering for efficient, high‑quality resampling.
	gcd = np.gcd(rate, target_rate)
	up = target_rate // gcd
	down = rate // gcd
	resampled = signal.resample_poly(
	_to_float32(audio), up=up, down=down, axis=0
	)

	return resampled


	def _load_model():
	    """
	    Return the shared ASR model, creating it on first use.

	    The first call loads the pretrained "nvidia/parakeet-tdt-0.6b-v2"
	    model through NeMo and stores it in the module-level `_model`;
	    later calls return that stored instance.
	    """
	    global _model
	    # Test against the None sentinel explicitly instead of relying on
	    # the truthiness of the model object.
	    if _model is None:
	        _model = nemo_asr.models.ASRModel.from_pretrained(
	            model_name="nvidia/parakeet-tdt-0.6b-v2"
	        )

	    return _model


	def _to_pandas(prediction, keyword):
	return pd.DataFrame(prediction.timestamp[keyword])[
	[keyword, "start", "end"]
	]


	def _invoke_model(model, audio: np.ndarray):
	    """
	    Transcribe `audio` with `model` and unpack the first result.

	    Returns the transcript text followed by the character-, word- and
	    segment-level timestamp tables, in that order.
	    """
	    # Timestamps are requested explicitly; only the first entry of the
	    # returned sequence is used.
	    result = model.transcribe(audio=audio, timestamps=True)[0]
	    transcript = result.text

	    per_char, per_word, per_segment = (
	        _to_pandas(result, level) for level in ("char", "word", "segment")
	    )

	    return transcript, per_char, per_word, per_segment


	def transcribe(audio: tuple[np.ndarray, int] \| None):
	if not audio:
	return "No audio received. Please upload or record something"

	rate, data = audio

	model = _load_model()
	data = _to_float32(data)
	data = _resample(data, rate, TARGET_SR)
	text, chars, words, segments = _invoke_model(model, data)

	return text, segments, words, chars


	# Web UI: one audio input, plus one output component per value returned
	# by transcribe() (text, then segment, word and character tables).
	app = gr.Interface(
	    title=TITLE,
	    description=DESCRIPTION,
	    fn=transcribe,
	    inputs=gr.Audio(
	        label="Upload or record audio",
	        type="numpy",
	        sources=["upload", "microphone"],
	    ),
	    outputs=[
	        gr.Textbox(label="Transcription", show_copy_button=True),
	        # The three timestamp tables differ only in their label and in
	        # the name of the first column.
	        *(
	            gr.Dataframe(label=plural, headers=[singular, "Start", "End"])
	            for plural, singular in (
	                ("Segments", "Segment"),
	                ("Words", "Word"),
	                ("Characters", "Character"),
	            )
	        ),
	    ],
	)

	# Start the server only when run as a script, not on import.
	if __name__ == "__main__":
	    app.launch()