Futuresony committed on
Commit
b3a902e
·
verified ·
1 Parent(s): c1f383b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
5
+
6
+ # Load your model from Hugging Face
7
# One Hugging Face Hub checkpoint id supplies both the processor and the CTC model.
model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = Wav2Vec2ForCTC.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
10
+
11
+ # Function to process live audio stream
12
def transcribe_live(microphone_audio):
    """Transcribe a recorded audio clip with the module-level Wav2Vec2 model.

    Args:
        microphone_audio: Path to the audio file to transcribe (the interface
            passes the recording as a file path). May be ``None`` or empty
            when no recording is available.

    Returns:
        The decoded transcription string, or ``""`` when there is no audio
        to transcribe.
    """
    target_rate = 16000  # sampling rate handed to the processor below

    # A live interface may invoke the callback without a recording; there is
    # nothing to load in that case, so return early instead of raising.
    if not microphone_audio:
        return ""

    speech_array, sample_rate = torchaudio.load(microphone_audio)
    if speech_array.numel() == 0:
        return ""

    # torchaudio.load returns (channels, frames). squeeze() alone would leave
    # a multi-channel clip 2-D, which the processor would read as a batch of
    # separate utterances, so down-mix to a single channel explicitly.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # Resample to 16kHz only when the recording is at a different rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        speech_array = resampler(speech_array)

    # Process and transcribe
    input_values = processor(
        speech_array.numpy(), sampling_rate=target_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the text (batch of one, so take the first entry)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
28
+
29
+ # Create Gradio interface with live microphone input
30
# Gradio 4 replaced Audio's ``source="microphone"`` keyword with
# ``sources=["microphone"]``. Pick the spelling the installed release expects
# so that building the interface works on either API.
if int(gr.__version__.split(".")[0]) >= 4:
    microphone_input = gr.Audio(sources=["microphone"], type="filepath")
else:
    microphone_input = gr.Audio(source="microphone", type="filepath")

# Create Gradio interface with live microphone input
interface = gr.Interface(
    fn=transcribe_live,
    inputs=microphone_input,
    outputs="text",
    live=True,  # Enables real-time updates
    title="Live Swahili ASR Transcription",
    description="Speak into your microphone, and the model will transcribe in real-time.",
)

# Launch the app
if __name__ == "__main__":
    interface.launch()
42
+