Futuresony committed on
Commit
b4119c8
·
verified ·
1 Parent(s): f75518e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -43
app.py CHANGED
@@ -1,47 +1,28 @@
1
- import gradio as gr
2
- import torch
3
- import torchaudio
4
- import numpy as np
5
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
6
 
7
- # Load your trained Swahili ASR model
8
- model_name = "Futuresony/Future-sw_ASR-24-02-2025"
9
- processor = Wav2Vec2Processor.from_pretrained(model_name)
10
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
11
 
12
- # Process microphone input in real-time
13
- def transcribe_live(audio):
14
- if audio is None:
15
- return ""
16
-
17
- # Convert NumPy array to PyTorch tensor
18
- speech_array = torch.from_numpy(audio).float()
19
-
20
- # Resample audio to 16kHz (if needed)
21
- sample_rate = 16000 # Since streaming provides 16kHz by default
22
-
23
- # Process input
24
- input_values = processor(speech_array, sampling_rate=sample_rate, return_tensors="pt").input_values
25
 
26
- # Predict transcription
27
- with torch.no_grad():
28
- logits = model(input_values).logits
29
- predicted_ids = torch.argmax(logits, dim=-1)
 
 
 
 
 
30
 
31
- # Decode text
32
- transcription = processor.batch_decode(predicted_ids)[0]
33
- return transcription
34
-
35
- # Create Gradio interface with real-time streaming
36
- interface = gr.Interface(
37
- fn=transcribe_live,
38
- inputs=gr.Audio(streaming=True, type="numpy"), # Live streaming input
39
- outputs=gr.Textbox(label="Live Transcription"),
40
- live=True,
41
- title="Live Swahili ASR Streaming",
42
- description="Talk and see real-time Swahili subtitles appear below!",
43
- )
44
-
45
- # Launch the live streaming ASR app
46
- if __name__ == "__main__":
47
- interface.launch()
 
1
+ import queue
2
+ import sounddevice as sd
3
+ from vosk import Model, KaldiRecognizer
4
+ import json
 
5
 
6
+ # Load Vosk Model (Download from https://alphacephei.com/vosk/models)
7
+ model = Model("model")
8
+ recognizer = KaldiRecognizer(model, 16000)
9
+ q = queue.Queue()
10
 
11
+ # Callback function to process microphone input
12
+ def callback(indata, frames, time, status):
13
+ if status:
14
+ print(status)
15
+ q.put(bytes(indata))
 
 
 
 
 
 
 
 
16
 
17
+ # Real-time transcription function
18
+ def transcribe():
19
+ with sd.RawInputStream(samplerate=16000, blocksize=8000, dtype="int16",
20
+ channels=1, callback=callback):
21
+ while True:
22
+ data = q.get()
23
+ if recognizer.AcceptWaveform(data):
24
+ result = json.loads(recognizer.Result())
25
+ print(result["text"]) # Print live transcription
26
 
27
+ # Run the transcription
28
+ transcribe()