import queue

import gradio as gr
import numpy as np
import sounddevice as sd
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load ASR model
model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

# Streaming state
q = queue.Queue()
streaming = True

SAMPLE_RATE = 16000  # The model expects 16 kHz mono audio


def callback(indata, frames, time, status):
    """Push each recorded microphone chunk onto the queue."""
    if status:
        print(status)
    q.put(indata.copy())


def transcribe_stream():
    """Continuously read audio chunks from the queue and yield transcriptions."""
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=callback):
        while streaming:
            audio_data = []
            try:
                # Collect a few chunks per update; adjust the range to control update frequency
                for _ in range(5):
                    audio_data.append(q.get(timeout=1))
            except queue.Empty:
                continue

            # Flatten the recorded chunks into a single 1-D array
            audio_np = np.concatenate(audio_data, axis=0).flatten()

            # Run the chunk through the model and greedily decode the CTC output
            input_values = processor(
                audio_np, sampling_rate=SAMPLE_RATE, return_tensors="pt"
            ).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]

            yield transcription  # Stream partial output live


# Gradio live interface: Gradio iterates the generator function and streams each
# yielded transcription to the Textbox, so no separate background thread is needed.
def live_transcription():
    yield from transcribe_stream()


interface = gr.Interface(
    fn=live_transcription,
    inputs=None,
    outputs=gr.Textbox(label="Live Transcription"),
    live=True,
    title="Swahili Live Streaming ASR",
    description="Speak continuously, and the subtitles will appear in real-time.",
)

# Launch Gradio app
if __name__ == "__main__":
    interface.launch()
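
# --- Optional resampling sketch (illustrative, not wired into the app above) ---
# The original script imported torchaudio without using it; one plausible use is
# resampling when the capture device cannot open a 16 kHz input stream. The helper
# name (resample_to_16k) and the native_rate parameter below are assumptions for
# illustration only. If adopted, define this above the launch block, open the
# InputStream at the device's native rate, and resample each chunk before it is
# passed to the processor.

import torchaudio


def resample_to_16k(audio_np: np.ndarray, native_rate: int) -> np.ndarray:
    """Resample a 1-D float32 array from native_rate to the 16 kHz the model expects."""
    waveform = torch.from_numpy(audio_np).float().unsqueeze(0)  # shape: (1, samples)
    resampled = torchaudio.functional.resample(
        waveform, orig_freq=native_rate, new_freq=SAMPLE_RATE
    )
    return resampled.squeeze(0).numpy()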