Spaces:

Athspi
/

Ai-audio

Running

App Files Community

Athspi committed on Jan 12

Commit

0e08e04

verified ·

1 Parent(s): ebe4598

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -28

app.py CHANGED Viewed

@@ -139,6 +139,10 @@ def convert_to_wav(audio_file):
         audio.export(wav_path, format="wav")
     return wav_path
 def detect_language(audio_file):
     """Detect the language of the audio file."""
     if audio_file is None:
@@ -247,7 +251,13 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
         # Ensure both audio files have the same sample rate
         if main_rate != target_rate:
-            raise ValueError("Sample rates of the main audio and target audio must match.")
         # Normalize audio data
         main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
@@ -258,35 +268,23 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
         correlation = np.abs(correlation)
         max_corr = np.max(correlation)
-        # Detect segments where the target audio is present
-        detected_segments = []
-        for i, corr_value in enumerate(correlation):
-            if corr_value >= threshold * max_corr:
-                start_time = i / main_rate
-                end_time = (i + len(target_data)) / main_rate
-                detected_segments.append((start_time, end_time))
-        # Merge overlapping or nearby segments
-        merged_segments = []
-        for segment in detected_segments:
-            if not merged_segments:
-                merged_segments.append(segment)
-            else:
-                last_segment = merged_segments[-1]
-                if segment[0] <= last_segment[1] + 1.0:  # Merge if within 1 second
-                    merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
-                else:
-                    merged_segments.append(segment)
-        # Trim the main audio to include only the detected segments
         main_audio_segment = AudioSegment.from_file(main_wav_path)
-        trimmed_audio = AudioSegment.empty()
-        timestamps = []
-        for segment in merged_segments:
-            start_ms = int(segment[0] * 1000)
-            end_ms = int(segment[1] * 1000)
-            trimmed_audio += main_audio_segment[start_ms:end_ms]
-            timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")
         # Export the trimmed audio
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
@@ -294,11 +292,13 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
             trimmed_audio.export(output_path, format="wav")
         # Format timestamps
-        timestamps_str = "\n".join(timestamps)
         # Clean up temporary WAV files
         os.remove(main_wav_path)
         os.remove(target_wav_path)
         return output_path, timestamps_str
     except Exception as e:

         audio.export(wav_path, format="wav")
     return wav_path
+def resample_audio(audio_segment, target_sample_rate):
+    """Resample an audio segment to the target sample rate."""
+    return audio_segment.set_frame_rate(target_sample_rate)
 def detect_language(audio_file):
     """Detect the language of the audio file."""
     if audio_file is None:
         # Ensure both audio files have the same sample rate
         if main_rate != target_rate:
+            logger.warning(f"Sample rates differ: main_audio={main_rate}, target_audio={target_rate}. Resampling target audio.")
+            target_segment = AudioSegment.from_file(target_wav_path)
+            target_segment = resample_audio(target_segment, main_rate)
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_resampled:
+                resampled_path = temp_resampled.name
+                target_segment.export(resampled_path, format="wav")
+            target_rate, target_data = wavfile.read(resampled_path)
         # Normalize audio data
         main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
         correlation = np.abs(correlation)
         max_corr = np.max(correlation)
+        # Find the peak in the cross-correlation result
+        peak_index = np.argmax(correlation)
+        peak_value = correlation[peak_index]
+        # Check if the peak value exceeds the threshold
+        if peak_value < threshold * max_corr:
+            return None, "Error: Target audio not detected in the main audio."
+        # Calculate the start and end times of the target audio in the main audio
+        start_time = peak_index / main_rate
+        end_time = (peak_index + len(target_data)) / main_rate
+        # Trim the main audio to include only the detected segment
         main_audio_segment = AudioSegment.from_file(main_wav_path)
+        start_ms = int(start_time * 1000)
+        end_ms = int(end_time * 1000)
+        trimmed_audio = main_audio_segment[start_ms:end_ms]
         # Export the trimmed audio
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
             trimmed_audio.export(output_path, format="wav")
         # Format timestamps
+        timestamps_str = f"{start_time:.2f}-{end_time:.2f}"
         # Clean up temporary WAV files
         os.remove(main_wav_path)
         os.remove(target_wav_path)
+        if 'resampled_path' in locals():
+            os.remove(resampled_path)
         return output_path, timestamps_str
     except Exception as e: