GavinHuang committed on
Commit
19576da
·
1 Parent(s): 177fa7b

Add real-time speech-to-text functionality using Whisper model with Gradio interface

Browse files
Files changed (2) hide show
  1. app.py +130 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
5
+ import librosa
6
+ from collections import deque
7
+ import time
8
+ import spaces
9
+
10
# Model settings
MODEL_ID = "openai/whisper-small"
# Run on CUDA when torch reports it as available, otherwise on CPU. With the
# model pinned to "cpu", a GPU granted to @spaces.GPU functions is never used.
# NOTE(review): confirm torch.cuda.is_available() is True at import time on
# ZeroGPU hardware; if it is not, this falls back to the previous CPU behavior.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
WINDOW_SECONDS = 1.0  # Length of audio passed to the model per transcription
OVERLAP_SECONDS = 0.5  # Audio shared between consecutive windows
RATE = 16000  # Sample rate (Hz) used for buffering and passed to the processor

# Initialize Whisper model and processor once at import time so every
# streaming callback reuses the same instances.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, low_cpu_mem_usage=True, use_safetensors=True
).to(DEVICE)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Global state shared by the streaming callbacks below
audio_buffer = deque()  # Pending audio chunks (numpy arrays) at RATE
buffer_duration = 0.0  # Seconds of audio currently held in audio_buffer
last_transcription = ""  # Most recent text emitted, used to skip repeats
is_running = False  # Toggled by the start/stop buttons
28
+
29
def process_audio_chunk(audio_chunk):
    """Append one chunk of samples to the shared buffer.

    The chunk is copied into a float32 numpy array, pushed onto the
    module-level ``audio_buffer`` and its length (in seconds at ``RATE``)
    is added to ``buffer_duration``.

    Args:
        audio_chunk: Array-like of audio samples.

    Returns:
        The float32 numpy array that was stored in the buffer.
    """
    global audio_buffer, buffer_duration

    samples = np.array(audio_chunk, dtype=np.float32)
    chunk_seconds = len(samples) / RATE

    audio_buffer.append(samples)
    buffer_duration += chunk_seconds
    return samples
39
+
40
def transcribe_audio():
    """Transcribe buffered audio with a sliding window.

    While transcription is running and at least ``WINDOW_SECONDS`` of audio
    is buffered, the first window of samples is run through the Whisper
    model, and the buffer is then advanced by the window size minus the
    overlap.

    Yields:
        Each non-empty transcription that differs from the previous one.
    """
    global audio_buffer, buffer_duration, last_transcription

    window_samples = int(WINDOW_SECONDS * RATE)
    overlap_samples = int(OVERLAP_SECONDS * RATE)
    # Always advance by at least one sample so an overlap >= window cannot
    # turn the loop below into an infinite loop.
    step_samples = max(1, window_samples - overlap_samples)

    # `audio_buffer` is checked as well because np.concatenate raises on an
    # empty sequence.
    while is_running and audio_buffer and buffer_duration >= WINDOW_SECONDS:
        # Join the buffered chunks and keep only the first window.
        audio_window = np.concatenate(list(audio_buffer))[:window_samples]

        # The buffer already holds float32 samples at RATE, so the window is
        # passed to the processor directly. (librosa.load reads audio from a
        # path or file object and cannot take an in-memory array.)
        inputs = processor(
            audio_window, sampling_rate=RATE, return_tensors="pt"
        ).to(DEVICE)
        with torch.no_grad():
            predicted_ids = model.generate(inputs["input_features"])
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0].strip()

        # Emit only new, non-empty text.
        if transcription and transcription != last_transcription:
            last_transcription = transcription
            yield transcription

        # Slide the window: drop `step_samples` samples from the front.
        samples_to_remove = step_samples
        while samples_to_remove > 0 and audio_buffer:
            head = audio_buffer[0]
            if len(head) > samples_to_remove:
                # Only part of the first chunk is consumed; keep the rest.
                audio_buffer[0] = head[samples_to_remove:]
                buffer_duration -= samples_to_remove / RATE
                break
            samples_to_remove -= len(head)
            buffer_duration -= len(head) / RATE
            audio_buffer.popleft()

        if not audio_buffer:
            # Clear any floating-point drift accumulated by the += / -= updates.
            buffer_duration = 0.0
76
+
77
@spaces.GPU
def audio_stream(audio):
    """Handle one streamed audio chunk from the Gradio microphone input.

    Args:
        audio: ``(sample_rate, data)`` tuple delivered by the Gradio audio
            component, or ``None`` when no audio is available.

    Yields:
        A status message when transcription has not been started, otherwise
        every new transcription produced from the buffered audio.
    """
    if not is_running:
        # This function is a generator, so the message has to be yielded;
        # a plain `return "..."` would never reach the output textbox.
        yield "Please start transcription."
        return

    if audio is None:
        return

    # Audio is a tuple (sample_rate, data) from Gradio
    sample_rate, audio_data = audio
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        return

    # Integer PCM is scaled to floats in [-1.0, 1.0); librosa.resample only
    # accepts floating-point input.
    if np.issubdtype(audio_data.dtype, np.integer):
        full_scale = float(np.iinfo(audio_data.dtype).max) + 1.0
        audio_data = audio_data.astype(np.float32) / full_scale
    else:
        audio_data = audio_data.astype(np.float32)

    # Down-mix multi-channel audio to mono.
    # Assumes a (samples, channels) layout - TODO confirm against the Gradio version in use.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Resample audio to RATE if needed
    if sample_rate != RATE:
        audio_data = librosa.resample(
            audio_data, orig_sr=sample_rate, target_sr=RATE
        )

    # Buffer the chunk, then emit whatever transcriptions are now available.
    process_audio_chunk(audio_data)
    yield from transcribe_audio()
97
+
98
def start_transcription():
    """Reset the shared streaming state and mark transcription as running.

    Returns:
        The status message shown in the transcription textbox.
    """
    global is_running, audio_buffer, buffer_duration, last_transcription

    # Start from a clean slate: fresh buffer, no buffered time, no prior text.
    audio_buffer = deque()
    buffer_duration, last_transcription = 0.0, ""
    is_running = True

    status = "Transcription started. Speak into the microphone."
    return status
106
+
107
def stop_transcription():
    """Clear the module-level running flag.

    Returns:
        The status message shown in the transcription textbox.
    """
    global is_running

    is_running = False
    status = "Transcription stopped."
    return status
112
+
113
# Gradio interface: two control buttons, a streaming microphone input and a
# read-only textbox that shows status messages and transcriptions.
with gr.Blocks() as demo:
    gr.Markdown("# Real-Time Speech-to-Text with Whisper")
    gr.Markdown("Record audio using the microphone and see transcriptions in real-time. Hosted on Hugging Face Spaces with ZeroGPU.")

    with gr.Row():
        start_btn = gr.Button("Start Transcription")
        stop_btn = gr.Button("Stop Transcription")

    audio_input = gr.Audio(
        sources=["microphone"],
        streaming=True,
        label="Speak Here",
    )
    output_text = gr.Textbox(label="Transcription", interactive=False)

    # Both buttons write their status message into the output textbox.
    start_btn.click(fn=start_transcription, outputs=output_text)
    stop_btn.click(fn=stop_transcription, outputs=output_text)

    # Every streamed microphone chunk is passed to audio_stream.
    audio_input.stream(
        fn=audio_stream,
        inputs=audio_input,
        outputs=output_text,
    )

# Launch the app
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ numpy
5
+ librosa
6
+ spaces