Athspi committed on
Commit
0e08e04
·
verified ·
1 Parent(s): ebe4598

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -28
app.py CHANGED
@@ -139,6 +139,10 @@ def convert_to_wav(audio_file):
139
  audio.export(wav_path, format="wav")
140
  return wav_path
141
 
 
 
 
 
142
  def detect_language(audio_file):
143
  """Detect the language of the audio file."""
144
  if audio_file is None:
@@ -247,7 +251,13 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
247
 
248
  # Ensure both audio files have the same sample rate
249
  if main_rate != target_rate:
250
- raise ValueError("Sample rates of the main audio and target audio must match.")
 
 
 
 
 
 
251
 
252
  # Normalize audio data
253
  main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
@@ -258,35 +268,23 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
258
  correlation = np.abs(correlation)
259
  max_corr = np.max(correlation)
260
 
261
- # Detect segments where the target audio is present
262
- detected_segments = []
263
- for i, corr_value in enumerate(correlation):
264
- if corr_value >= threshold * max_corr:
265
- start_time = i / main_rate
266
- end_time = (i + len(target_data)) / main_rate
267
- detected_segments.append((start_time, end_time))
268
 
269
- # Merge overlapping or nearby segments
270
- merged_segments = []
271
- for segment in detected_segments:
272
- if not merged_segments:
273
- merged_segments.append(segment)
274
- else:
275
- last_segment = merged_segments[-1]
276
- if segment[0] <= last_segment[1] + 1.0: # Merge if within 1 second
277
- merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
278
- else:
279
- merged_segments.append(segment)
280
 
281
- # Trim the main audio to include only the detected segments
282
  main_audio_segment = AudioSegment.from_file(main_wav_path)
283
- trimmed_audio = AudioSegment.empty()
284
- timestamps = []
285
- for segment in merged_segments:
286
- start_ms = int(segment[0] * 1000)
287
- end_ms = int(segment[1] * 1000)
288
- trimmed_audio += main_audio_segment[start_ms:end_ms]
289
- timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")
290
 
291
  # Export the trimmed audio
292
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
@@ -294,11 +292,13 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
294
  trimmed_audio.export(output_path, format="wav")
295
 
296
  # Format timestamps
297
- timestamps_str = "\n".join(timestamps)
298
 
299
  # Clean up temporary WAV files
300
  os.remove(main_wav_path)
301
  os.remove(target_wav_path)
 
 
302
 
303
  return output_path, timestamps_str
304
  except Exception as e:
 
139
  audio.export(wav_path, format="wav")
140
  return wav_path
141
 
142
+ def resample_audio(audio_segment, target_sample_rate):
143
+ """Resample an audio segment to the target sample rate."""
144
+ return audio_segment.set_frame_rate(target_sample_rate)
145
+
146
  def detect_language(audio_file):
147
  """Detect the language of the audio file."""
148
  if audio_file is None:
 
251
 
252
  # Ensure both audio files have the same sample rate
253
  if main_rate != target_rate:
254
+ logger.warning(f"Sample rates differ: main_audio={main_rate}, target_audio={target_rate}. Resampling target audio.")
255
+ target_segment = AudioSegment.from_file(target_wav_path)
256
+ target_segment = resample_audio(target_segment, main_rate)
257
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_resampled:
258
+ resampled_path = temp_resampled.name
259
+ target_segment.export(resampled_path, format="wav")
260
+ target_rate, target_data = wavfile.read(resampled_path)
261
 
262
  # Normalize audio data
263
  main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
 
268
  correlation = np.abs(correlation)
269
  max_corr = np.max(correlation)
270
 
271
+ # Find the peak in the cross-correlation result
272
+ peak_index = np.argmax(correlation)
273
+ peak_value = correlation[peak_index]
 
 
 
 
274
 
275
+ # Check if the peak value exceeds the threshold
276
+ if peak_value < threshold * max_corr:
277
+ return None, "Error: Target audio not detected in the main audio."
278
+
279
+ # Calculate the start and end times of the target audio in the main audio
280
+ start_time = peak_index / main_rate
281
+ end_time = (peak_index + len(target_data)) / main_rate
 
 
 
 
282
 
283
+ # Trim the main audio to include only the detected segment
284
  main_audio_segment = AudioSegment.from_file(main_wav_path)
285
+ start_ms = int(start_time * 1000)
286
+ end_ms = int(end_time * 1000)
287
+ trimmed_audio = main_audio_segment[start_ms:end_ms]
 
 
 
 
288
 
289
  # Export the trimmed audio
290
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
 
292
  trimmed_audio.export(output_path, format="wav")
293
 
294
  # Format timestamps
295
+ timestamps_str = f"{start_time:.2f}-{end_time:.2f}"
296
 
297
  # Clean up temporary WAV files
298
  os.remove(main_wav_path)
299
  os.remove(target_wav_path)
300
+ if 'resampled_path' in locals():
301
+ os.remove(resampled_path)
302
 
303
  return output_path, timestamps_str
304
  except Exception as e: