Spaces:

GavinHuang
/

asr-demo

Running on Zero

App Files Files Community

asr-demo / app.py

GavinHuang

fix: enhance audio processing in transcribe function to handle tuple input and resample to 16kHz

ac5f4c0 27 days ago

raw

history blame

4.06 kB

	import os
	import tempfile
	import time

	import gradio as gr
	import nemo.collections.asr as nemo_asr
	import spaces
	import torch
	from omegaconf import OmegaConf
	from scipy.signal import resample_poly

	# Startup diagnostics: report whether a CUDA device is visible and, if so,
	# the name of device 0.
	_cuda_visible = torch.cuda.is_available()
	print(f"CUDA available: {_cuda_visible}")
	if _cuda_visible:
	    _cuda_name = torch.cuda.get_device_name(0)
	    print(f"CUDA device: {_cuda_name}")

	# Load the pretrained checkpoint once, at import time, so every call to
	# transcribe() shares the same model instance.
	model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
	    "nvidia/parakeet-tdt-0.6b-v2"
	)

	print(f"Model loaded on device: {model.device}")

	import numpy as np
	import soundfile as sf

	# Module-level audio buffer; within this file it is only ever reset by
	# transcribe().
	audio_buffer = []

	# Sample rate (Hz) every chunk is converted to before it is transcribed.
	_TARGET_SAMPLE_RATE = 16000


	def _to_mono_float(audio_data):
	    """Return ``audio_data`` as a 1-D float32 array.

	    Integer PCM input is divided by the dtype's maximum value so the samples
	    land in roughly [-1.0, 1.0]; floating-point input is only cast.
	    Input with more than one dimension is assumed to be shaped
	    (samples, channels) and is averaged down to one channel -- confirm
	    against the Gradio ``Audio`` documentation if the layout ever changes.
	    """
	    samples = np.asarray(audio_data)
	    if np.issubdtype(samples.dtype, np.integer):
	        # Scale before any float processing; unscaled integer-valued floats
	        # would be far outside the [-1, 1] range a float WAV write expects.
	        peak = float(np.iinfo(samples.dtype).max)
	        samples = samples.astype(np.float32) / peak
	    else:
	        samples = samples.astype(np.float32)
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)
	    return samples


	def _resample_to_target(samples, sample_rate):
	    """Resample 1-D ``samples`` from ``sample_rate`` to the target rate."""
	    if sample_rate == _TARGET_SAMPLE_RATE:
	        return samples
	    print(f"Resampling from {sample_rate}Hz to {_TARGET_SAMPLE_RATE}Hz")
	    resampled = resample_poly(samples, _TARGET_SAMPLE_RATE, int(sample_rate))
	    return resampled.astype(np.float32)


	def _result_text(result):
	    """Return the transcript string held by one ``model.transcribe`` result.

	    The result may be a plain string or an object exposing the transcript
	    as a ``text`` attribute; both are accepted.
	    """
	    text = getattr(result, "text", result)
	    return "" if text is None else str(text).strip()


	@spaces.GPU(duration=120)
	def transcribe(audio, state=""):
	    """Transcribe one streamed audio chunk and append it to the transcript.

	    Args:
	        audio: Value delivered by the streaming audio input. Expected to be
	            a ``(sample_rate, samples)`` tuple whose second item is a NumPy
	            array; anything else (``None``, an int, a malformed tuple) is
	            skipped.
	        state: Transcript accumulated so far.

	    Returns:
	        A ``(state, text)`` pair holding the same updated transcript twice,
	        one for the state output and one for the real-time text box. When
	        the chunk is skipped, is empty, or yields no text, ``state`` is
	        returned unchanged in both positions.
	    """
	    global audio_buffer

	    is_audio_chunk = (
	        isinstance(audio, tuple)
	        and len(audio) == 2
	        and isinstance(audio[1], np.ndarray)
	    )
	    if not is_audio_chunk:
	        print(f"Skipping invalid audio input: {type(audio)}")
	        return state, state

	    print(f"Received audio input of type: {type(audio)}")
	    sample_rate, audio_data = audio
	    print(f"Audio shape: {audio_data.shape}")
	    if audio_data.size == 0:
	        return state, state

	    samples = _resample_to_target(_to_mono_float(audio_data), sample_rate)

	    # Reserve a unique path per call so overlapping requests cannot
	    # overwrite or delete each other's audio.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
	        temp_file = handle.name
	    try:
	        sf.write(temp_file, samples, samplerate=_TARGET_SAMPLE_RATE)
	        print(f"Processing temporary audio file: {temp_file}")
	        results = model.transcribe([temp_file])
	    finally:
	        # Remove the file even when writing or transcription fails.
	        os.remove(temp_file)
	        print("Temporary file removed.")

	    # Clear buffer
	    audio_buffer = []

	    transcription = _result_text(results[0]) if results else ""
	    if not transcription:
	        # Nothing recognised: leave the transcript as it was instead of
	        # appending a stray space.
	        return state, state

	    new_state = f"{state} {transcription}" if state else transcription
	    return new_state, new_state

	# Build the Gradio UI: microphone input on the left, transcript boxes on the
	# right, and the event wiring that feeds streamed audio to transcribe().
	with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
	    gr.Markdown("# 🎙️ Real-time Speech-to-Text Transcription")
	    gr.Markdown("Powered by NVIDIA NeMo and the parakeet-tdt-0.6b-v2 model")

	    with gr.Row():
	        # Left column: streaming microphone capture and the reset button.
	        with gr.Column(scale=2):
	            audio_input = gr.Audio(
	                label="Speak into your microphone",
	                sources=["microphone"],
	                type="numpy",
	                streaming=True,
	            )

	            clear_btn = gr.Button("Clear Transcript")

	        # Right column: full transcript plus the real-time box.
	        with gr.Column(scale=3):
	            text_output = gr.Textbox(
	                lines=10,
	                label="Transcription",
	                placeholder="Your speech will appear here...",
	            )
	            streaming_text = gr.Textbox(
	                lines=2,
	                label="Real-time Transcription",
	                placeholder="Real-time results will appear here...",
	            )

	    # Transcript accumulated so far, passed into and out of transcribe().
	    state = gr.State("")

	    # Each streamed chunk updates the stored transcript and the real-time box.
	    audio_input.stream(
	        transcribe, inputs=[audio_input, state], outputs=[state, streaming_text]
	    )

	    def clear_transcription():
	        """Return empty values for both text boxes and the stored state."""
	        return ("",) * 3

	    # Reset both text boxes and the stored transcript.
	    clear_btn.click(
	        clear_transcription,
	        inputs=[],
	        outputs=[text_output, streaming_text, state],
	    )

	    # Mirror the stored transcript into the main text box whenever it changes.
	    state.change(lambda s: s, inputs=[state], outputs=[text_output])

	    gr.Markdown("## 📝 Instructions")
	    gr.Markdown("""
	1. Click the microphone button to start recording
	2. Speak clearly into your microphone
	3. The transcription will appear in real-time
	4. Click 'Clear Transcript' to start a new transcription
	""")

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()