import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the processor and the CTC model for the checkpoint named below.
# These calls run at module level, so they execute on import as well as when
# the file is run as a script; `transcribe` reads `processor` and `model` as
# module-level globals.
# NOTE(review): the name looks like a Hugging Face Hub repo id, so the first
# call presumably downloads and caches the weights — confirm for offline use.
model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Function to transcribe audio
def transcribe(audio_file) -> str:
    """Transcribe a speech recording with the module-level Wav2Vec2 CTC model.

    The recording is loaded with torchaudio, downmixed to a single channel,
    resampled to 16 kHz when needed, and decoded greedily (argmax over the
    CTC logits followed by the processor's decoding).

    Args:
        audio_file: Path of the audio file to transcribe (anything
            ``torchaudio.load`` accepts). ``None`` or an empty path, as
            happens when the form is submitted without any audio, is
            tolerated.

    Returns:
        The decoded transcription, or ``""`` when there is no audio to
        transcribe (no file given, or the file contains zero samples).
    """
    target_sample_rate = 16000

    # Nothing was uploaded/recorded: return an empty result instead of
    # failing inside torchaudio.load.
    if not audio_file:
        return ""

    # By default torchaudio.load yields a (channels, frames) float tensor.
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix to mono. Without this, a stereo recording reaches the processor
    # as a 2-D array, which it interprets as a batch of separate utterances,
    # so only the first channel would end up being transcribed. For mono
    # input this simply drops the channel dimension.
    waveform = waveform.mean(dim=0)

    # A zero-length recording cannot be fed through the model.
    if waveform.numel() == 0:
        return ""

    # Resample to 16kHz, skipping the transform when it would be a no-op.
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # Process and transcribe
    input_values = processor(
        waveform.numpy(),
        sampling_rate=target_sample_rate,
        return_tensors="pt",
    ).input_values
    with torch.no_grad():  # inference only; no gradients needed
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the text (batch of one, so take the first entry)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

# Create the Gradio interface: a single audio input wired to `transcribe`,
# with the returned transcription shown as text.
# The audio component is configured with type="filepath" because `transcribe`
# hands its argument straight to torchaudio.load.
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Swahili ASR Transcription",
    description="Upload a Swahili audio file, and the model will transcribe the speech.",
)

# Launch the app only when this file is executed as a script, so importing
# the module does not call interface.launch().
if __name__ == "__main__":
    interface.launch()