Added Multiple File Support

#6
Files changed (1) hide show
  1. app.py +85 -4
app.py CHANGED
@@ -7,6 +7,7 @@ import librosa
7
  import tgt.core
8
  import tgt.io3
9
  import soundfile as sf
 
10
  from transformers import pipeline
11
 
12
  # Constants
@@ -167,6 +168,51 @@ def validate_textgrid_for_intervals(audio_path, textgrid_file):
167
  raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}")
168
 
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  def launch_demo():
171
  initial_model = {
172
  "loaded_model": pipeline(
@@ -189,7 +235,7 @@ def launch_demo():
189
 
190
  # Dropdown for transcription type selection
191
  transcription_type = gr.Dropdown(
192
- choices=["Full Audio", "TextGrid Interval"],
193
  label="Transcription Type",
194
  value=None,
195
  interactive=True,
@@ -203,12 +249,29 @@ def launch_demo():
203
  full_transcribe_btn = gr.Button("Transcribe Full Audio", interactive=False, variant="primary")
204
  full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
205
 
206
- full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="transcription", interactive=True)
207
 
208
  full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
209
  full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
210
  full_reset_btn = gr.Button("Reset", variant="secondary")
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  # Interval transcription section
213
  with gr.Column(visible=False) as interval_section:
214
  interval_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
@@ -225,10 +288,11 @@ def launch_demo():
225
  transcription_type.change(
226
  fn=lambda t: (
227
  gr.update(visible=t == "Full Audio"),
 
228
  gr.update(visible=t == "TextGrid Interval"),
229
  ),
230
  inputs=transcription_type,
231
- outputs=[full_audio_section, interval_section],
232
  )
233
 
234
  # Enable full transcribe button after audio uploaded
@@ -260,7 +324,6 @@ def launch_demo():
260
  outputs=[full_download_btn],
261
  )
262
 
263
-
264
  full_reset_btn.click(
265
  fn=lambda: (None, "", "", "", gr.update(interactive=False)),
266
  outputs=[full_audio, full_prediction, full_textgrid_contents, full_download_btn],
@@ -309,6 +372,24 @@ def launch_demo():
309
  outputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, interval_result, interval_download_btn],
310
  )
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  demo.launch(max_file_size="100mb")
313
 
314
  if __name__ == "__main__":
 
7
  import tgt.core
8
  import tgt.io3
9
  import soundfile as sf
10
+ import zipfile
11
  from transformers import pipeline
12
 
13
  # Constants
 
168
  raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}")
169
 
170
 
171
def transcribe_multiple_files(model_name, audio_files, model_state, tier_name):
    """Transcribe several audio files and bundle one TextGrid per file into a ZIP.

    Each file is run through the ASR pipeline, and its transcription is stored
    as a single interval spanning the whole recording in a tier named
    ``tier_name``.

    Args:
        model_name: Name of the model to pass to ``pipeline``. The pipeline is
            reloaded only when this differs from ``model_state["model_name"]``.
        audio_files: Paths of the audio files to transcribe. May be empty or
            ``None``, in which case nothing is transcribed.
        model_state: Dict with the keys ``"loaded_model"`` and ``"model_name"``.
        tier_name: Name given to the interval tier of every generated TextGrid.

    Returns:
        A tuple ``(table_data, zip_path, model_state)``: a list of
        ``[filename, transcription]`` rows, the path of the generated ZIP as a
        string (``None`` when there were no files), and the possibly
        refreshed model state.

    Raises:
        gr.Error: If loading the model, transcribing, or writing the ZIP fails.
    """
    if not audio_files:
        return [], None, model_state

    try:
        if model_state["model_name"] != model_name:
            model_state = {
                "loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
                "model_name": model_name,
            }

        table_data = []
        zip_entries = []  # (archive name, TextGrid text) pairs
        used_names = set()

        for file in audio_files:
            audio_path = Path(file)
            prediction = model_state["loaded_model"](file)["text"]
            duration = librosa.get_duration(path=file)

            annotation = tgt.core.Interval(0, duration, prediction)
            transcription_tier = tgt.core.IntervalTier(0, duration, tier_name)
            transcription_tier.add_annotation(annotation)

            tg = tgt.core.TextGrid()
            tg.add_tier(transcription_tier)
            tg_str = tgt.io3.export_to_long_textgrid(tg)

            # Uploads may share a basename (e.g. the same filename from two
            # different folders); disambiguate so no TextGrid is overwritten
            # or duplicated inside the archive.
            tg_name = f"{audio_path.stem}.TextGrid"
            counter = 1
            while tg_name in used_names:
                tg_name = f"{audio_path.stem}_{counter}.TextGrid"
                counter += 1
            used_names.add(tg_name)

            table_data.append([audio_path.name, prediction])
            zip_entries.append((tg_name, tg_str))

        # The archive is only created once every file has been transcribed.
        # TextGrids are written straight into the ZIP (str data is stored as
        # UTF-8, so IPA symbols survive regardless of the system locale) inside
        # a fresh temp directory, so concurrent requests cannot clobber each
        # other's output.
        zip_path = Path(tempfile.mkdtemp()) / "textgrids.zip"
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for tg_name, tg_str in zip_entries:
                zipf.writestr(tg_name, tg_str)

        return table_data, str(zip_path), model_state

    except Exception as e:
        # Boundary handler: surface any failure in the UI, keeping the cause chained.
        raise gr.Error(f"Transcription failed: {str(e)}") from e
214
+
215
+
216
  def launch_demo():
217
  initial_model = {
218
  "loaded_model": pipeline(
 
235
 
236
  # Dropdown for transcription type selection
237
  transcription_type = gr.Dropdown(
238
+ choices=["Full Audio", "Multiple Full Audio", "TextGrid Interval"],
239
  label="Transcription Type",
240
  value=None,
241
  interactive=True,
 
249
  full_transcribe_btn = gr.Button("Transcribe Full Audio", interactive=False, variant="primary")
250
  full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)
251
 
252
+ full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="IPA", interactive=True)
253
 
254
  full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
255
  full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
256
  full_reset_btn = gr.Button("Reset", variant="secondary")
257
 
258
+ # Multiple full audio transcription section
259
+ with gr.Column(visible=False) as multiple_full_audio_section:
260
+ multiple_full_audio = gr.File(file_types=[".wav"], label="Upload Audio File(s)", file_count="multiple")
261
+ multiple_full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value="IPA")
262
+ multiple_full_transcribe_btn = gr.Button("Transcribe Audio Files", interactive=False, variant="primary")
263
+
264
+ multiple_full_table = gr.Dataframe(
265
+ headers=["Filename", "Transcription"],
266
+ interactive=False,
267
+ label="IPA Transcriptions",
268
+ datatype=["str", "str"]
269
+ )
270
+
271
+ multiple_full_zip_download_btn = gr.File(label="Download All as ZIP", interactive=False)
272
+ multiple_full_reset_btn = gr.Button("Reset", variant="secondary")
273
+
274
+
275
  # Interval transcription section
276
  with gr.Column(visible=False) as interval_section:
277
  interval_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
 
288
  transcription_type.change(
289
  fn=lambda t: (
290
  gr.update(visible=t == "Full Audio"),
291
+ gr.update(visible=t == "Multiple Full Audio"),
292
  gr.update(visible=t == "TextGrid Interval"),
293
  ),
294
  inputs=transcription_type,
295
+ outputs=[full_audio_section, multiple_full_audio_section, interval_section],
296
  )
297
 
298
  # Enable full transcribe button after audio uploaded
 
324
  outputs=[full_download_btn],
325
  )
326
 
 
327
  full_reset_btn.click(
328
  fn=lambda: (None, "", "", "", gr.update(interactive=False)),
329
  outputs=[full_audio, full_prediction, full_textgrid_contents, full_download_btn],
 
372
  outputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, interval_result, interval_download_btn],
373
  )
374
 
375
+ # Multiple full audio transcription logic
376
+ multiple_full_audio.change(
377
+ fn=lambda files: gr.update(interactive=bool(files)),
378
+ inputs=multiple_full_audio,
379
+ outputs=multiple_full_transcribe_btn,
380
+ )
381
+
382
+ multiple_full_transcribe_btn.click(
383
+ fn=transcribe_multiple_files,
384
+ inputs=[model_name, multiple_full_audio, model_state, multiple_full_textgrid_tier],
385
+ outputs=[multiple_full_table, multiple_full_zip_download_btn, model_state],
386
+ )
387
+
388
+ multiple_full_reset_btn.click(
389
+ fn=lambda: (None, "", [], None, gr.update(interactive=False)),
390
+ outputs=[multiple_full_audio, multiple_full_textgrid_tier, multiple_full_table, multiple_full_zip_download_btn, multiple_full_transcribe_btn],
391
+ )
392
+
393
  demo.launch(max_file_size="100mb")
394
 
395
  if __name__ == "__main__":