import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import pandas as pd
from scipy import signal

TARGET_SR = 16_000  # Hz; the model expects 16 kHz input

TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"
DESCRIPTION = """## Description

NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.

### License

The license is commercial-friendly:

> GOVERNING TERMS: Use of this model is governed by the \
[CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).

### Contact

Need help adding transcription to your system? \
[Let's talk!](mailto:support@ridgerun.ai). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.

### Links of Interest

* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)

## Playground
"""

# Lazily-initialized ASR model, cached across requests.
_model = None


def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Convert any integer PCM array to float32 in the range [-1, 1].

    Works for signed (int8/16/24/32) and unsigned (uint8) types,
    without hard-coded magic numbers.
    """
    if not np.issubdtype(x.dtype, np.integer):
        # Already float: just ensure dtype is float32
        return x.astype(np.float32, copy=False)

    info = np.iinfo(x.dtype)
    x = x.astype(np.float32)

    if info.min < 0:
        # Signed PCM (e.g. int16, int32): -32768..32767 -> -1..1
        x /= max(abs(info.min), info.max)
    else:
        # Unsigned PCM (e.g. uint8): shift to the midpoint, then scale.
        midpoint = (info.max + 1) / 2  # 128.0 for uint8
        # 0..255 -> -1..1
        x = (x - midpoint) / midpoint

    return x


def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    if rate == target_rate:
        return audio

    # Use polyphase filtering for efficient, high-quality resampling.
    gcd = np.gcd(rate, target_rate)
    up = target_rate // gcd
    down = rate // gcd
    resampled = signal.resample_poly(
        _to_float32(audio), up=up, down=down, axis=0
    )

    return resampled


def _load_model():
    """Load the Parakeet model on first use and cache it globally."""
    global _model

    if not _model:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

    return _model


def _to_pandas(prediction, keyword):
    # Build a DataFrame with the requested timestamp granularity
    # ("char", "word" or "segment") plus its start/end times.
    return pd.DataFrame(prediction.timestamp[keyword])[
        [keyword, "start", "end"]
    ]


def _invoke_model(model, audio: np.ndarray):
    # Transcribe with timestamps enabled so char/word/segment times are available.
    prediction = model.transcribe(audio=audio, timestamps=True)[0]

    text = prediction.text
    chars = _to_pandas(prediction, "char")
    words = _to_pandas(prediction, "word")
    segments = _to_pandas(prediction, "segment")

    return text, chars, words, segments


def transcribe(audio: tuple[int, np.ndarray] | None):
    # Gradio delivers numpy audio as a (sample_rate, samples) tuple.
    if not audio:
        # Keep the output arity: message plus three empty tables.
        return "No audio received. Please upload or record something.", None, None, None

    rate, data = audio

    model = _load_model()

    data = _to_float32(data)
    data = _resample(data, rate, TARGET_SR)

    text, chars, words, segments = _invoke_model(model, data)

    return text, segments, words, chars


app = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="numpy",
        label="Upload or record audio",
    ),
    outputs=[
        gr.Textbox(label="Transcription", show_copy_button=True),
        gr.Dataframe(
            label="Segments",
            headers=["Segment", "Start", "End"],
        ),
        gr.Dataframe(
            label="Words",
            headers=["Word", "Start", "End"],
        ),
        gr.Dataframe(
            label="Characters",
            headers=["Character", "Start", "End"],
        ),
    ],
    title=TITLE,
    description=DESCRIPTION,
)

if __name__ == "__main__":
    app.launch()
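
# Optional: the running demo can also be queried programmatically. The commented
# sketch below is not part of the app; it assumes a recent `gradio_client`
# release (which provides `handle_file`), the default local address, the
# default gr.Interface endpoint name "/predict", and a hypothetical local
# `sample.wav` file.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   text, segments, words, chars = client.predict(
#       handle_file("sample.wav"),  # any local audio file
#       api_name="/predict",
#   )
#   print(text)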