GavinHuang committed on
Commit
51c343c
·
1 Parent(s): 19576da

Refactor transcription logic to remove unnecessary state management and simplify audio processing

Browse files
Files changed (1) hide show
  1. app.py +8 -24
app.py CHANGED
@@ -24,7 +24,6 @@ processor = AutoProcessor.from_pretrained(MODEL_ID)
24
  audio_buffer = deque()
25
  buffer_duration = 0.0
26
  last_transcription = ""
27
- is_running = False
28
 
29
  def process_audio_chunk(audio_chunk):
30
  """Process a single audio chunk and update buffer."""
@@ -45,7 +44,7 @@ def transcribe_audio():
45
  overlap_samples = int(OVERLAP_SECONDS * RATE)
46
  step_samples = window_samples - overlap_samples # Step size for sliding window
47
 
48
- while is_running and buffer_duration >= WINDOW_SECONDS:
49
  # Concatenate buffer into a window
50
  audio_window = np.concatenate(list(audio_buffer))
51
  audio_window = audio_window[:window_samples] # Trim to window size
@@ -77,10 +76,6 @@ def transcribe_audio():
77
  @spaces.GPU
78
  def audio_stream(audio):
79
  """Handle streaming audio input from Gradio."""
80
- global is_running
81
- if not is_running:
82
- return "Please start transcription."
83
-
84
  # Audio is a tuple (sample_rate, data) from Gradio
85
  sample_rate, audio_data = audio
86
 
@@ -95,35 +90,24 @@ def audio_stream(audio):
95
  for transcription in transcribe_audio():
96
  yield transcription
97
 
98
- def start_transcription():
99
- """Start the transcription process."""
100
- global is_running, audio_buffer, buffer_duration, last_transcription
101
- is_running = True
102
  audio_buffer = deque()
103
  buffer_duration = 0.0
104
  last_transcription = ""
105
- return "Transcription started. Speak into the microphone."
106
-
107
- def stop_transcription():
108
- """Stop the transcription process."""
109
- global is_running
110
- is_running = False
111
- return "Transcription stopped."
112
 
113
  # Gradio interface
114
  with gr.Blocks() as demo:
115
  gr.Markdown("# Real-Time Speech-to-Text with Whisper")
116
  gr.Markdown("Record audio using the microphone and see transcriptions in real-time. Hosted on Hugging Face Spaces with ZeroGPU.")
117
 
118
- with gr.Row():
119
- start_btn = gr.Button("Start Transcription")
120
- stop_btn = gr.Button("Stop Transcription")
121
-
122
  audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Speak Here")
123
- output_text = gr.Textbox(label="Transcription", interactive=False)
124
 
125
- start_btn.click(start_transcription, outputs=output_text)
126
- stop_btn.click(stop_transcription, outputs=output_text)
127
  audio_input.stream(audio_stream, inputs=audio_input, outputs=output_text)
128
 
129
  # Launch the app
 
24
  audio_buffer = deque()
25
  buffer_duration = 0.0
26
  last_transcription = ""
 
27
 
28
  def process_audio_chunk(audio_chunk):
29
  """Process a single audio chunk and update buffer."""
 
44
  overlap_samples = int(OVERLAP_SECONDS * RATE)
45
  step_samples = window_samples - overlap_samples # Step size for sliding window
46
 
47
+ while buffer_duration >= WINDOW_SECONDS:
48
  # Concatenate buffer into a window
49
  audio_window = np.concatenate(list(audio_buffer))
50
  audio_window = audio_window[:window_samples] # Trim to window size
 
76
  @spaces.GPU
77
  def audio_stream(audio):
78
  """Handle streaming audio input from Gradio."""
 
 
 
 
79
  # Audio is a tuple (sample_rate, data) from Gradio
80
  sample_rate, audio_data = audio
81
 
 
90
  for transcription in transcribe_audio():
91
  yield transcription
92
 
93
+ # Initialize application state
94
+ def init_app():
95
+ """Initialize the application state."""
96
+ global audio_buffer, buffer_duration, last_transcription
97
  audio_buffer = deque()
98
  buffer_duration = 0.0
99
  last_transcription = ""
100
+ return "Transcription is active. Speak into the microphone."
 
 
 
 
 
 
101
 
102
  # Gradio interface
103
  with gr.Blocks() as demo:
104
  gr.Markdown("# Real-Time Speech-to-Text with Whisper")
105
  gr.Markdown("Record audio using the microphone and see transcriptions in real-time. Hosted on Hugging Face Spaces with ZeroGPU.")
106
 
 
 
 
 
107
  audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Speak Here")
108
+ output_text = gr.Textbox(label="Transcription", value="Transcription is active. Speak into the microphone.", interactive=False)
109
 
110
+ demo.load(init_app, outputs=output_text)
 
111
  audio_input.stream(audio_stream, inputs=audio_input, outputs=output_text)
112
 
113
  # Launch the app