"""Gradio demo: streaming microphone speech-to-text with an NVIDIA NeMo model."""

import os
import time

import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import spaces
import torch
from omegaconf import OmegaConf

# Sample rate the audio is resampled to before inference.
# NOTE(review): 16 kHz mono is what the parakeet model card describes — confirm
# if the checkpoint is ever swapped.
TARGET_SAMPLE_RATE = 16000

# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
print(f"Model loaded on device: {model.device}")


def _prepare_audio(sample_rate, samples):
    """Convert one Gradio audio chunk to a mono float32 waveform at 16 kHz.

    Args:
        sample_rate: Sample rate of ``samples`` in Hz.
        samples: Array of audio samples; integer PCM or float, 1-D or
            (presumably) shaped ``(n_samples, n_channels)`` — verify against
            the Gradio version in use.

    Returns:
        A 1-D ``float32`` NumPy array sampled at ``TARGET_SAMPLE_RATE``.
    """
    samples = np.asarray(samples)

    # Integer PCM is scaled into roughly [-1, 1]; float input is left as is.
    if np.issubdtype(samples.dtype, np.integer):
        scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    # Down-mix multi-channel audio by averaging the channel axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != TARGET_SAMPLE_RATE and samples.size > 0:
        # Plain linear interpolation: no anti-alias filter, but it avoids
        # adding a resampling dependency to the app.
        n_out = int(round(samples.size * TARGET_SAMPLE_RATE / sample_rate))
        src_t = np.arange(samples.size) / sample_rate
        dst_t = np.arange(n_out) / TARGET_SAMPLE_RATE
        samples = np.interp(dst_t, src_t, samples).astype(np.float32)

    return samples


def _extract_text(results):
    """Return the transcript string of the first item in ``model.transcribe`` output.

    The result may be a list of plain strings or a list of hypothesis objects
    exposing ``.text``; some NeMo releases reportedly return a
    ``(best_hyps, all_hyps)`` tuple for RNNT models — all three are handled.
    """
    if isinstance(results, tuple) and results and isinstance(results[0], list):
        results = results[0]
    if not results:
        return ""
    first = results[0]
    text = getattr(first, "text", first)
    return text.strip() if isinstance(text, str) else str(text).strip()


@spaces.GPU(duration=120)  # Raise `duration` if a single call needs more GPU time
def transcribe(audio, state=""):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        audio: Value from the streaming ``gr.Audio`` component. With
            ``type="numpy"`` this is a ``(sample_rate, samples)`` tuple, or
            ``None`` when no audio is available. Any non-tuple value is passed
            to the model unchanged.
        state: Transcript accumulated so far.

    Returns:
        A ``(new_state, new_state)`` pair: the updated transcript for the
        ``gr.State`` and for the real-time textbox.
    """
    # Skip processing if no audio is provided
    if audio is None:
        return state, state

    if isinstance(audio, tuple):
        waveform = _prepare_audio(*audio)
        if waveform.size == 0:
            return state, state
    else:
        waveform = audio

    try:
        # Move model to GPU if available. `.cuda()` moves the module in place,
        # so the global must NOT be rebound here (rebinding would make `model`
        # a local name and raise UnboundLocalError).
        if torch.cuda.is_available():
            print(f"CUDA device: {torch.cuda.get_device_name(0)}")
            model.cuda()

        # Process the audio with the ASR model
        with torch.no_grad():
            results = model.transcribe([waveform])
    finally:
        # Always hand the model back to the CPU, even if inference failed.
        model.cpu()

    transcription = _extract_text(results)

    # Nothing recognised (e.g. silence): leave the transcript untouched rather
    # than accumulating stray spaces.
    if not transcription:
        return state, state

    # Append new transcription to the state
    new_state = transcription if not state else state + " " + transcription
    return new_state, new_state


# Define the Gradio interface
with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                streaming=True,
                label="Speak into your microphone",
            )
            clear_btn = gr.Button("Clear Transcript")

        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transcription",
                placeholder="Your speech will appear here...",
                lines=10,
            )
            streaming_text = gr.Textbox(
                label="Real-time Transcription",
                placeholder="Real-time results will appear here...",
                lines=2,
            )

    # State to store the ongoing transcription
    state = gr.State("")

    # Handle the audio stream
    audio_input.stream(
        fn=transcribe,
        inputs=[audio_input, state],
        outputs=[state, streaming_text],
    )

    # Clear the transcription
    def clear_transcription():
        """Reset both textboxes and the stored transcript to empty strings."""
        return "", "", ""

    clear_btn.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[text_output, streaming_text, state],
    )

    # Update the main text output when the state changes
    state.change(
        fn=lambda s: s,
        inputs=[state],
        outputs=[text_output],
    )

    gr.Markdown("## 📝 Instructions")
    # Explicit newlines so Markdown renders this as a numbered list.
    gr.Markdown(
        "1. Click the microphone button to start recording\n"
        "2. Speak clearly into your microphone\n"
        "3. The transcription will appear in real-time\n"
        "4. Click 'Clear Transcript' to start a new transcription\n"
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()