sdafd committed · verified
Commit df44cb1 · 1 Parent(s): 1d2d603

Create app.py

Files changed (1):
  1. app.py +117 -0
app.py ADDED
@@ -0,0 +1,117 @@
+import gradio as gr
+import whisperx
+import torch
+import librosa
+import logging
+import os
+import time
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("whisperx_app")
+
+# Device setup (force CPU)
+device = "cpu"
+torch.set_num_threads(os.cpu_count())
+
+# Pre-load models; int8 compute is used because CTranslate2's default
+# float16 is not supported on CPU
+models = {
+    "tiny": whisperx.load_model("tiny", device, compute_type="int8"),
+    "base": whisperx.load_model("base", device, compute_type="int8"),
+    "small": whisperx.load_model("small", device, compute_type="int8"),
+}
+
+def transcribe(audio_file, model_size="base", debug=False):
+    start_time = time.time()
+    result = ""
+    debug_log = []
+
+    try:
+        # Load audio file, resampled to the 16 kHz Whisper expects
+        audio, sr = librosa.load(audio_file, sr=16000)
+
+        # Run inference
+        model = models[model_size]
+        batch_size = 8 if model_size == "tiny" else 4
+        transcript = model.transcribe(audio, batch_size=batch_size)
+
+        # Align whisper output to get word-level timestamps
+        model_a, metadata = whisperx.load_align_model(
+            language_code=transcript["language"], device=device
+        )
+        transcript_aligned = whisperx.align(
+            transcript["segments"], model_a, metadata, audio, device
+        )
+
+        # Format word-level output, skipping words the aligner could not
+        # timestamp (e.g. numerals)
+        for segment in transcript_aligned["segments"]:
+            for word in segment["words"]:
+                if "start" not in word or "end" not in word:
+                    continue
+                result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
+
+        debug_log.append(f"Processed in {time.time()-start_time:.2f}s")
+        debug_log.append(f"Language detected: {transcript['language']}")
+        debug_log.append(f"Batch size: {batch_size}")
+
+    except Exception as e:
+        logger.error("Error during transcription:", exc_info=True)
+        result = "Error occurred during transcription"
+        debug_log.append(f"ERROR: {str(e)}")
+
+    # Always return two values: the click handler below wires two outputs,
+    # so returning a bare string when debug is off would break the callback
+    return result, "\n".join(debug_log) if debug else ""
+
+# Gradio Interface
+with gr.Blocks(title="WhisperX CPU Transcription") as demo:
+    gr.Markdown("# WhisperX CPU Transcription with Word-Level Timestamps")
+
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                label="Upload Audio File",
+                type="filepath",
+                sources=["upload", "microphone"],
+                interactive=True,
+            )
+            model_selector = gr.Dropdown(
+                choices=["tiny", "base", "small"],
+                value="base",
+                label="Model Size",
+                interactive=True,
+            )
+            debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
+            transcribe_btn = gr.Button("Transcribe", variant="primary")
+
+        with gr.Column():
+            output_text = gr.Textbox(
+                label="Transcription Output",
+                lines=20,
+                placeholder="Transcription will appear here...",
+            )
+            debug_output = gr.Textbox(
+                label="Debug Information",
+                lines=10,
+                placeholder="Debug logs will appear here...",
+                visible=False,
+            )
+
+    # Toggle debug visibility
+    def toggle_debug(debug_enabled):
+        return gr.update(visible=debug_enabled)
+
+    debug_checkbox.change(
+        toggle_debug,
+        inputs=[debug_checkbox],
+        outputs=[debug_output],
+    )
+
+    # Process transcription
+    transcribe_btn.click(
+        transcribe,
+        inputs=[audio_input, model_selector, debug_checkbox],
+        outputs=[output_text, debug_output],
+    )
+
+# Launch configuration
+if __name__ == "__main__":
+    demo.queue(max_size=4).launch()
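
For a quick smoke test outside the UI, the transcribe function can also be called directly. This is a minimal sketch, assuming the dependencies above are installed; "sample.wav" is a hypothetical local file (any path librosa.load accepts works):

    # Importing app runs the model pre-loading at the top of the file,
    # so the first import takes a while on CPU.
    from app import transcribe

    # "sample.wav" is a hypothetical input; returns (transcript, debug_info)
    text, debug_info = transcribe("sample.wav", model_size="tiny", debug=True)
    print(text)        # one "[start-end] word" line per aligned word
    print(debug_info)  # processing time, detected language, batch size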