doyouknowmarc commited on
Commit
c2cfda7
·
verified ·
1 Parent(s): 66fb3d7

Init - create App.py

Browse files
Files changed (1) hide show
  1. app.py +405 -0
app.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import warnings
3
+ import torch
4
+ import os
5
+ import whisper
6
+ import ssl
7
+ import zipfile
8
+ from pydub import AudioSegment
9
+ from pydub.silence import detect_nonsilent
10
+ import subprocess
11
+ import tempfile
12
+ import time
13
+
14
+ ssl._create_default_https_context = ssl._create_unverified_context  # NOTE(review): replaces the default HTTPS context factory with the unverified one, so certificate checks are skipped process-wide; presumably a workaround for download certificate errors - confirm it is still needed.
15
+
16
def _remove_dir_quietly(path):
    """Best-effort removal of *path* and the regular files directly inside it.

    Failures are printed instead of raised so that cleanup can never mask the
    real result (or the real error) of a processing run.
    """
    if not os.path.isdir(path):
        return
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        try:
            if os.path.isfile(entry_path):
                os.remove(entry_path)
        except OSError as e:
            print(f"Error cleaning up {entry_path}: {e}")
    try:
        os.rmdir(path)
    except OSError as e:
        print(f"Error removing temp directory: {e}")


def _split_into_chunks(audio_path, temp_dir, chunk_duration, ffmpeg_path):
    """Split *audio_path* into chunks with ffmpeg and return them in play order."""
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    prefix = f"{base_name}_part_"
    extension = ".mp3"
    output_pattern = os.path.join(temp_dir, f"{prefix}%d{extension}")

    cmd = [
        ffmpeg_path, "-i", audio_path,
        "-f", "segment",
        "-segment_time", str(chunk_duration),
        "-c:a", "copy",
        "-segment_start_number", "1",
        output_pattern,
    ]
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        # Surface the failure instead of silently continuing with no chunks.
        detail = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed to split {os.path.basename(audio_path)}: {detail[-500:]}"
        )

    # Order by the numeric chunk index: a plain string sort would put
    # "_part_10" before "_part_2" and scramble the transcript.
    numbered = []
    for name in os.listdir(temp_dir):
        if name.startswith(prefix) and name.endswith(extension):
            index = name[len(prefix):-len(extension)]
            if index.isdigit():
                numbered.append((int(index), os.path.join(temp_dir, name)))
    return [path for _, path in sorted(numbered)]


def _strip_silence(audio_file, temp_dir, min_silence_len, silence_thresh):
    """Write a copy of *audio_file* without silent stretches; return its path."""
    audio = AudioSegment.from_file(audio_file)
    nonsilent = detect_nonsilent(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )
    output = AudioSegment.empty()
    for start, end in nonsilent:
        output += audio[start:end]

    silence_removed_path = os.path.join(
        temp_dir, f"silence_removed_{os.path.basename(audio_file)}"
    )
    output.export(silence_removed_path, format="mp3")
    return silence_removed_path


def process_audio(
    audio_paths,
    remove_silence=False,
    min_silence_len=500,
    silence_thresh=-50,
    enable_chunking=False,
    chunk_duration=600,
    ffmpeg_path="ffmpeg",
    model_size="large-v3-turbo",
    language="de"
):
    """Optionally chunk and de-silence audio files, then transcribe them.

    Args:
        audio_paths: Paths of the audio files to process; falsy entries are
            skipped.
        remove_silence: Strip silent stretches (via pydub) before transcribing.
        min_silence_len: Minimum silence length in ms, passed to
            ``detect_nonsilent``.
        silence_thresh: Silence threshold in dB, passed to ``detect_nonsilent``.
        enable_chunking: Split every input with ffmpeg's segment muxer first.
        chunk_duration: Chunk length in seconds (ffmpeg ``-segment_time``).
        ffmpeg_path: ffmpeg executable used for chunking.
        model_size: Whisper model name handed to ``whisper.load_model``.
        language: Language code handed to ``model.transcribe``.

    Returns:
        A ``(full_text, segment_text, zip_path)`` tuple. ``zip_path`` names a
        ZIP in the working directory holding the processed audio and all
        transcripts. On failure the tuple is ``("Error: ...", "", None)``;
        with no input it is ``("No files selected.", "", None)``.

    Intermediate files live in a unique ``temp_processing_*`` directory that
    is removed on success and on failure. Files outside that directory - in
    particular the caller's input files - are never deleted.
    """
    if not audio_paths:
        return "No files selected.", "", None

    temp_dir = None
    try:
        # Remove a leftover fixed-name directory if one exists; current runs
        # use unique names, so normally this is a no-op.
        _remove_dir_quietly("temp_processing")

        # mkdtemp guarantees a fresh directory even for simultaneous runs,
        # and keeps the "temp_processing" prefix other cleanup code looks for.
        temp_dir = tempfile.mkdtemp(prefix="temp_processing_", dir=".")

        # Step 1: Prepare each audio file (chunking, then silence removal)
        processed_files = []
        for audio_path in audio_paths:
            if not audio_path:
                continue

            if enable_chunking:
                pieces = _split_into_chunks(
                    audio_path, temp_dir, chunk_duration, ffmpeg_path
                )
            else:
                pieces = [audio_path]

            if remove_silence:
                pieces = [
                    _strip_silence(piece, temp_dir, min_silence_len, silence_thresh)
                    for piece in pieces
                ]
            processed_files.extend(pieces)

        # Step 2: Transcribe all processed files
        print(f"Loading Whisper model '{model_size}'...")
        model = whisper.load_model(model_size, device="cpu")
        warnings.filterwarnings(
            "ignore", message="FP16 is not supported on CPU; using FP32 instead"
        )

        all_results = []
        all_segments = []
        all_txt_paths = []
        for file in processed_files:
            print(f"Transcribing: {file}")
            result = model.transcribe(
                file, fp16=False, language=language, temperature=0.0
            )

            full_text = result["text"]
            segments = "".join(
                f"[{segment['start']:.2f} - {segment['end']:.2f}]: {segment['text']}\n"
                for segment in result["segments"]
            )

            # Store transcript files in temp directory
            stem = os.path.splitext(os.path.basename(file))[0]
            txt_path = os.path.join(temp_dir, f"transcript_{stem}.txt")
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write("=== Full Transcription ===\n\n")
                f.write(full_text)
                f.write("\n\n=== Segment-wise Transcription ===\n")
                f.write(segments)

            all_results.append(full_text)
            all_segments.append(segments)
            all_txt_paths.append(txt_path)

        filenames = [os.path.basename(file) for file in processed_files]

        # Create combined transcript file in temp directory
        combined_txt_path = os.path.join(temp_dir, "combined_transcripts.txt")
        with open(combined_txt_path, "w", encoding="utf-8") as f:
            f.write("=== Combined Transcriptions ===\n\n")
            for filename, text, segment in zip(filenames, all_results, all_segments):
                f.write(f"File: {filename}\n")
                f.write("=== Full Transcription ===\n")
                f.write(text)
                f.write("\n\n=== Segment-wise Transcription ===\n")
                f.write(segment)
                f.write("\n" + "-" * 50 + "\n\n")

        # Format display output
        combined_results = "=== File Transcriptions ===\n\n" + "".join(
            f"File: {filename}\n{text}\n\n"
            for filename, text in zip(filenames, all_results)
        )
        combined_segments = "=== File Segments ===\n\n" + "".join(
            f"File: {filename}\n{segment}\n\n"
            for filename, segment in zip(filenames, all_segments)
        )

        # Create ZIP with all processed files and transcripts. Every member is
        # stored under its bare file name so the archive has a flat layout.
        zip_path = f"processed_files_and_transcripts_{int(time.time())}.zip"
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for member in processed_files + all_txt_paths + [combined_txt_path]:
                if os.path.exists(member):
                    zipf.write(member, os.path.basename(member))

        return combined_results, combined_segments, zip_path

    except Exception as e:
        # Top-level boundary for the UI: report the failure as text.
        print(f"Error in process_audio: {e}")
        return f"Error: {str(e)}", "", None

    finally:
        # Only the run's own temp directory is removed. When neither chunking
        # nor silence removal is active, processed_files are the caller's
        # original inputs and must be left alone.
        if temp_dir is not None:
            _remove_dir_quietly(temp_dir)
205
+
206
def create_interface():
    """Build the Gradio Blocks UI for audio processing and transcription.

    The left column holds the file upload plus the silence-removal, chunking
    and transcription settings; the right column shows the transcription
    results and the ZIP download. "Process" runs ``process_audio`` with the
    current settings; "Delete Everything" removes generated artifacts from
    the working directory.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="Interview Audio Processing App") as app:
        gr.Markdown("""
        # Audio Processing App
        Upload audio files (MP3 or M4A) for processing and transcription.\\
        Intended use case: transcription of interviews.
        """)
        with gr.Row():
            with gr.Column():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_count="multiple",
                    type="filepath"
                )

                with gr.Group():
                    gr.Markdown("### Silence Removal Settings")
                    gr.Markdown(" Default settings are working very well. Silence removal helps to reduce hallucination.")
                    remove_silence = gr.Checkbox(
                        label="Remove Silence",
                        value=False
                    )

                    # Hidden until "Remove Silence" is ticked.
                    min_silence_len = gr.Slider(
                        minimum=100,
                        maximum=2000,
                        value=500,
                        step=100,
                        label="Minimum Silence Length (ms)",
                        visible=False
                    )
                    silence_thresh = gr.Slider(
                        minimum=-70,
                        maximum=-30,
                        value=-50,
                        step=5,
                        label="Silence Threshold (dB)",
                        visible=False
                    )

                with gr.Group():
                    gr.Markdown("### Chunking Settings")
                    gr.Markdown(" Chunking reduces the load on the model. 10min chunks work really good.")
                    enable_chunking = gr.Checkbox(
                        label="Enable Chunking",
                        value=False
                    )
                    # Hidden until "Enable Chunking" is ticked.
                    chunk_duration = gr.Slider(
                        minimum=60,
                        maximum=3600,
                        value=600,
                        step=60,
                        label="Chunk Duration (seconds)",
                        visible=False
                    )
                    ffmpeg_path = gr.Textbox(
                        label="FFmpeg Path",
                        value="ffmpeg",
                        placeholder="Path to ffmpeg executable",
                        visible=False
                    )

                with gr.Group():
                    gr.Markdown("### Transcription Settings")
                    gr.Markdown(" tiny is the fastest, but the worst quality. Large-v3-turbo is the best, but slower.")
                    model_size = gr.Dropdown(
                        choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3", "turbo", "large-v3-turbo"],
                        value="large-v3-turbo",
                        label="Whisper Model Size"
                    )
                    language = gr.Dropdown(
                        choices=["de", "en", "fr", "es", "it"],
                        value="de",
                        label="Language"
                    )

                process_btn = gr.Button("Process", variant="primary")
                delete_btn = gr.Button("Delete Everything", variant="stop")

            with gr.Column():
                full_transcription = gr.Textbox(label="Full Transcription", lines=15)
                segmented_transcription = gr.Textbox(label="Segmented Transcription", lines=15)
                download_output = gr.File(label="Download Processed Files and Transcripts (ZIP)")

        output_components = [
            full_transcription,
            segmented_transcription,
            download_output,
        ]

        def output_updates(message=""):
            """Return updates that clear the outputs, optionally showing *message*."""
            return {
                full_transcription: gr.update(value=message),
                segmented_transcription: gr.update(value=""),
                download_output: gr.update(value=None),
            }

        def update_silence_controls(remove_silence):
            """Show or hide the silence sliders and clear stale results."""
            updates = output_updates()
            updates[min_silence_len] = gr.update(visible=remove_silence)
            updates[silence_thresh] = gr.update(visible=remove_silence)
            return updates

        def update_chunking_controls(enable_chunking):
            """Show or hide the chunking inputs and clear stale results."""
            updates = output_updates()
            updates[chunk_duration] = gr.update(visible=enable_chunking)
            updates[ffmpeg_path] = gr.update(visible=enable_chunking)
            return updates

        remove_silence.change(
            fn=update_silence_controls,
            inputs=[remove_silence],
            outputs=[min_silence_len, silence_thresh] + output_components
        )

        enable_chunking.change(
            fn=update_chunking_controls,
            inputs=[enable_chunking],
            outputs=[chunk_duration, ffmpeg_path] + output_components
        )

        process_btn.click(
            fn=process_audio,
            inputs=[
                audio_input,
                remove_silence,
                min_silence_len,
                silence_thresh,
                enable_chunking,
                chunk_duration,
                ffmpeg_path,
                model_size,
                language,
            ],
            outputs=output_components
        )

        def cleanup_files():
            """Delete generated temp directories, ZIPs and transcripts from the cwd.

            Each entry is handled independently so one failure does not stop
            the remaining deletions; failures are reported in the output box.
            """
            file_prefixes = ("processed_files_and_transcripts_", "transcript_")
            failures = []
            try:
                entries = os.listdir(".")
            except OSError as e:
                return output_updates(f"Error during cleanup: {str(e)}")

            for name in entries:
                try:
                    if name.startswith("temp_processing"):
                        # A regular file may share the prefix; listing it
                        # would raise, so only real directories are emptied.
                        if os.path.isdir(name):
                            for entry in os.listdir(name):
                                entry_path = os.path.join(name, entry)
                                if os.path.isfile(entry_path):
                                    os.remove(entry_path)
                            os.rmdir(name)
                    elif name.startswith(file_prefixes) and os.path.isfile(name):
                        os.remove(name)
                except OSError as e:
                    failures.append(str(e))

            if failures:
                return output_updates(f"Error during cleanup: {'; '.join(failures)}")
            return output_updates("All temporary files have been deleted.")

        delete_btn.click(
            fn=cleanup_files,
            inputs=[],
            outputs=output_components
        )

    return app
402
+
403
# Script entry point: build the UI and serve it without a public share link.
if __name__ == "__main__":
    app = create_interface()
    app.launch(share=False)