sdafd committed
Commit 57e238b · verified · 1 Parent(s): 7b80d55

Update app.py

Files changed (1)
  1. app.py +88 -26
app.py CHANGED
@@ -5,6 +5,7 @@ import librosa
 import logging
 import os
 import time
+import numpy as np
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -19,51 +20,105 @@ torch.set_num_threads(os.cpu_count())
 models = {
     "tiny": whisperx.load_model("tiny", device, compute_type=compute_type, vad_method='silero'),
     "base": whisperx.load_model("base", device, compute_type=compute_type, vad_method='silero'),
-    "small": whisperx.load_model("small", device, compute_type=compute_type, vad_method='silero'),
+    "small": whisperx.load_model("small", device, compute_type=compute_type, vad_method='silero'),
     "large": whisperx.load_model("large", device, compute_type=compute_type, vad_method='silero'),
     "large-v2": whisperx.load_model("large-v2", device, compute_type=compute_type, vad_method='silero'),
     "large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
 }
 
-def transcribe(audio_file, model_size="base", debug=False):
+def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
+    """
+    Splits the audio into segments using librosa's non-silent detection.
+    Adjacent non-silent intervals are merged if the gap between them is less than the pause_threshold.
+    Returns a list of (start_sample, end_sample) tuples.
+    """
+    # Get non-silent intervals based on an amplitude threshold (in dB)
+    intervals = librosa.effects.split(audio, top_db=top_db)
+    if intervals.size == 0:
+        return [(0, len(audio))]
+
+    merged_intervals = []
+    current_start, current_end = intervals[0]
+
+    for start, end in intervals[1:]:
+        # Compute the gap duration (in seconds) between the previous interval and this one
+        gap_duration = (start - current_end) / sr
+        if gap_duration < pause_threshold:
+            # Merge intervals if the gap is shorter than the threshold
+            current_end = end
+        else:
+            merged_intervals.append((current_start, current_end))
+            current_start, current_end = start, end
+    merged_intervals.append((current_start, current_end))
+    return merged_intervals
+
+def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0):
     start_time = time.time()
-    result = ""
+    final_result = ""
     debug_log = []
 
     try:
-        # Load audio file
+        # Load audio file at 16 kHz
         audio, sr = librosa.load(audio_file, sr=16000)
+        debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
 
-        # Run inference
+        # Get the preloaded model and determine the batch size
         model = models[model_size]
         batch_size = 8 if model_size == "tiny" else 4
-        transcript = model.transcribe(audio, batch_size=batch_size)
 
-        # Align whisper output
-        model_a, metadata = whisperx.load_align_model(
-            language_code=transcript["language"], device=device
-        )
-        transcript_aligned = whisperx.align(
-            transcript["segments"], model_a, metadata, audio, device
-        )
-
-        # Format word-level output
-        for segment in transcript_aligned["segments"]:
-            for word in segment["words"]:
-                result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
+        # If pause_threshold > 0, split the audio into segments at silence pauses
+        if pause_threshold > 0:
+            segments = split_audio_by_pause(audio, sr, pause_threshold)
+            debug_log.append(f"Audio split into {len(segments)} segment(s) using a pause threshold of {pause_threshold}s")
+            # Process each audio segment individually
+            for seg_idx, (seg_start, seg_end) in enumerate(segments):
+                audio_segment = audio[seg_start:seg_end]
+                seg_duration = (seg_end - seg_start) / sr
+                debug_log.append(f"Segment {seg_idx+1}: start={seg_start/sr:.2f}s, duration={seg_duration:.2f}s")
+
+                # Transcribe this segment
+                transcript = model.transcribe(audio_segment, batch_size=batch_size)
+
+                # Load alignment model for the detected language in this segment
+                model_a, metadata = whisperx.load_align_model(
+                    language_code=transcript["language"], device=device
+                )
+                transcript_aligned = whisperx.align(
+                    transcript["segments"], model_a, metadata, audio_segment, device
+                )
+
+                # Format word-level output with adjusted timestamps (adding the segment offset)
+                for segment in transcript_aligned["segments"]:
+                    for word in segment["words"]:
+                        # Shift start and end times by the segment's start time (in seconds)
+                        adjusted_start = word['start'] + seg_start/sr
+                        adjusted_end = word['end'] + seg_start/sr
+                        final_result += f"[{adjusted_start:5.2f}s-{adjusted_end:5.2f}s] {word['word']}\n"
+        else:
+            # Process the entire audio without splitting
+            transcript = model.transcribe(audio, batch_size=batch_size)
+            model_a, metadata = whisperx.load_align_model(
+                language_code=transcript["language"], device=device
+            )
+            transcript_aligned = whisperx.align(
+                transcript["segments"], model_a, metadata, audio, device
+            )
+            for segment in transcript_aligned["segments"]:
+                for word in segment["words"]:
+                    final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
 
-        debug_log.append(f"Processed in {time.time()-start_time:.2f}s")
         debug_log.append(f"Language detected: {transcript['language']}")
         debug_log.append(f"Batch size: {batch_size}")
+        debug_log.append(f"Processed in {time.time()-start_time:.2f}s")
 
     except Exception as e:
         logger.error("Error during transcription:", exc_info=True)
-        result = "Error occurred during transcription"
+        final_result = "Error occurred during transcription"
         debug_log.append(f"ERROR: {str(e)}")
 
     if debug:
-        return result, "\n".join(debug_log)
-    return result
+        return final_result, "\n".join(debug_log)
+    return final_result
 
 # Gradio Interface
 with gr.Blocks(title="WhisperX CPU Transcription") as demo:
@@ -78,11 +133,18 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         interactive=True,
     )
     model_selector = gr.Dropdown(
-        choices=models.keys(),
+        choices=list(models.keys()),
         value="base",
         label="Model Size",
         interactive=True,
     )
+    # New input: pause threshold in seconds (set to 0 to disable splitting)
+    pause_threshold_slider = gr.Slider(
+        minimum=0, maximum=5, step=0.1, value=0,
+        label="Pause Threshold (seconds)",
+        interactive=True,
+        info="Set a pause duration threshold. Pauses longer than this will be used to split the audio into segments."
+    )
     debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
     transcribe_btn = gr.Button("Transcribe", variant="primary")
 
@@ -109,13 +171,13 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         outputs=[debug_output]
     )
 
-    # Process transcription
+    # Process transcription with the new pause_threshold parameter
     transcribe_btn.click(
         transcribe,
-        inputs=[audio_input, model_selector, debug_checkbox],
+        inputs=[audio_input, model_selector, debug_checkbox, pause_threshold_slider],
         outputs=[output_text, debug_output]
     )
 
 # Launch configuration
 if __name__ == "__main__":
-    demo.queue(max_size=4).launch()
+    demo.queue(max_size=4).launch()
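
A quick way to sanity-check the pause-based splitting added in this commit, outside the Space, is the sketch below. It builds a short synthetic clip (tone, pause, tone) and runs it through a condensed copy of the split_audio_by_pause helper from the diff; the 440 Hz tone, the 0.8 s pause, and the 0.5 s / 1.0 s thresholds are arbitrary values chosen only for illustration.

# Standalone sanity check for the pause-splitting logic (condensed from this commit).
# The synthetic signal and the threshold values are illustrative, not part of app.py.
import numpy as np
import librosa

def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
    # Find non-silent intervals, then merge neighbours whose gap is below the threshold
    intervals = librosa.effects.split(audio, top_db=top_db)
    if intervals.size == 0:
        return [(0, len(audio))]
    merged = []
    current_start, current_end = intervals[0]
    for start, end in intervals[1:]:
        if (start - current_end) / sr < pause_threshold:
            current_end = end
        else:
            merged.append((current_start, current_end))
            current_start, current_end = start, end
    merged.append((current_start, current_end))
    return merged

sr = 16000
tone = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # 1 s tone
pause = np.zeros(int(0.8 * sr))                              # 0.8 s of silence
audio = np.concatenate([tone, pause, tone]).astype(np.float32)

# A 0.5 s threshold splits at the 0.8 s pause; a 1.0 s threshold keeps one segment.
for threshold in (0.5, 1.0):
    segments = split_audio_by_pause(audio, sr, threshold)
    print(threshold, [(round(s / sr, 2), round(e / sr, 2)) for s, e in segments])

With these values the first call should report two segments (roughly 0–1 s and 1.8–2.8 s) and the second a single merged segment, mirroring what the new pause_threshold_slider controls in the app.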