Futuresony committed on
Commit
a934af5
·
verified ·
1 Parent(s): e44cae0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -20
app.py CHANGED
@@ -1,28 +1,78 @@
 
 
 
 
 
1
  import queue
 
2
  import sounddevice as sd
3
- from vosk import Model, KaldiRecognizer
4
- import json
5
 
6
# Queue used to pass captured audio buffers from the stream callback to the
# transcription loop.
q = queue.Queue()

# Load Vosk Model (Download from https://alphacephei.com/vosk/models)
# The model is read from the local "model" directory; the recognizer is
# created for 16000 Hz audio, matching the sample rate requested from the
# input stream in transcribe().
model = Model("model")
recognizer = KaldiRecognizer(model, 16000)
 
10
 
11
# Callback function to process microphone input
def callback(indata, frames, time, status):
    """Queue one captured audio buffer for the transcription loop.

    Args:
        indata: Buffer holding the captured audio; converted to ``bytes``.
        frames: Unused here.
        time: Unused here.
        status: Stream status; printed when truthy.
    """
    payload = bytes(indata)
    if status:
        print(status)
    q.put(payload)
16
-
17
# Real-time transcription function
def transcribe():
    """Print recognized text from the microphone until interrupted.

    Opens a raw 16 kHz, mono, int16 input stream whose ``callback`` fills the
    module-level queue ``q``, then loops forever: each queued buffer is fed to
    ``recognizer.AcceptWaveform``; whenever that call returns a truthy value,
    ``recognizer.Result()`` is parsed as JSON and its ``"text"`` field is
    printed.
    """
    with sd.RawInputStream(
        samplerate=16000,
        blocksize=8000,
        dtype="int16",
        channels=1,
        callback=callback,
    ):
        while True:
            buffer = q.get()
            if not recognizer.AcceptWaveform(buffer):
                continue
            parsed = json.loads(recognizer.Result())
            print(parsed["text"])  # Print live transcription


# Run the transcription
transcribe()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import numpy as np
5
+ import threading
6
  import queue
7
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
8
  import sounddevice as sd
9
+ import tempfile
10
+ import wave
11
 
12
# Streaming Variables
q = queue.Queue()  # audio chunks handed from callback() to transcribe_stream()
streaming = True  # transcribe_stream() keeps looping while this is true

# Load ASR Model
# Both the CTC model and its processor are loaded from the same pretrained
# checkpoint name.
model_name = "Futuresony/Future-sw_ASR-24-02-2025"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
20
 
21
# Function to Record Audio in Chunks
def callback(indata, frames, time, status):
    """Queue a copy of one captured audio chunk for the transcriber.

    Args:
        indata: Captured audio chunk; must provide ``.copy()``.
        frames: Unused here.
        time: Unused here.
        status: Stream status; printed when truthy.
    """
    if status:
        print(status)
    # A copy is queued rather than the buffer itself, presumably because the
    # buffer is only valid for the duration of the callback (see the
    # sounddevice documentation to confirm).
    snapshot = indata.copy()
    q.put(snapshot)
26
+
27
# Function to Continuously Transcribe Audio
def transcribe_stream(chunks_per_update=5, chunk_timeout=1):
    """Yield a transcription for each batch of microphone audio.

    Opens a mono ``sd.InputStream`` whose ``callback`` feeds the module-level
    queue ``q``, then, while the module-level flag ``streaming`` is true,
    gathers ``chunks_per_update`` chunks, runs them through ``processor`` and
    ``model``, and yields the greedy (argmax) decoding of that batch.

    Args:
        chunks_per_update: Number of queued chunks gathered before each
            transcription. Smaller values produce more frequent updates.
        chunk_timeout: Seconds to wait for each chunk before re-checking the
            ``streaming`` flag.

    Yields:
        The decoded text of the most recent batch only (not cumulative).

    Raises:
        ValueError: If ``chunks_per_update`` is less than 1.
    """
    if chunks_per_update < 1:
        raise ValueError("chunks_per_update must be at least 1")

    samplerate = 16000  # Model expects 16kHz audio

    # Chunks gathered for the batch currently being filled. Kept outside the
    # loop so that a timeout does not throw away audio already dequeued.
    pending = []

    # Start recording stream
    with sd.InputStream(samplerate=samplerate, channels=1, callback=callback):
        while streaming:
            # Only the queue read can raise queue.Empty, so only it is guarded.
            try:
                while len(pending) < chunks_per_update:
                    pending.append(q.get(timeout=chunk_timeout))
            except queue.Empty:
                # Not enough audio yet: keep the partial batch and loop again
                # so the ``streaming`` flag is re-evaluated.
                continue

            # Convert recorded chunks to a single 1-D numpy array
            audio_np = np.concatenate(pending, axis=0).flatten()
            pending = []

            # Process & transcribe
            input_values = processor(
                audio_np, sampling_rate=samplerate, return_tensors="pt"
            ).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]

            yield transcription  # Stream output live
57
+
58
# Gradio Live Interface
def live_transcription():
    """Stream transcriptions from ``transcribe_stream`` to the caller.

    This is written as a generator function (it delegates with ``yield from``)
    rather than returning the generator object from a plain function. Gradio
    only streams successive outputs from generator functions; a plain function
    that returns a generator would have that object treated as the output
    value.

    Yields:
        Each transcription produced by ``transcribe_stream()``.
    """
    yield from transcribe_stream()
61
+
62
# Gradio app definition: no input components, a single textbox as output.
# NOTE(review): `live=True` is combined with `inputs=None`; confirm that live
# mode has any effect when there are no input components.
interface = gr.Interface(
    live_transcription,
    None,
    gr.Textbox(label="Live Transcription"),
    title="Swahili Live Streaming ASR",
    description="Speak continuously, and the subtitles will appear in real-time.",
    live=True,
)
70
+
71
# Run Transcription in Background Thread
# The thread is marked as a daemon so it does not keep the process alive.
# NOTE(review): transcribe_stream contains `yield`, so it is a generator
# function. Calling it as a thread target only creates a generator object and
# returns at once, without running the loop body. As written this thread does
# no transcription work; confirm whether it is still needed.
thread = threading.Thread(target=transcribe_stream, daemon=True)
thread.start()

# Launch Gradio App
# The server is started only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()