Spaces:

reab5555
/

Multimodal-Behavioral-Anomalies-Detection

Runtime error

App Files Community

reab5555 committed on Jul 28, 2024

Commit

4a67bd7

verified ·

1 Parent(s): bafab47

Update voice_analysis.py

Browse files

Files changed (1) hide show

voice_analysis.py +12 -37

voice_analysis.py CHANGED Viewed

@@ -24,51 +24,26 @@ def diarize_speakers(audio_path):
     return diarization
 def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embedding"):
-    hf_token = os.environ.get("py_annote_hf_token")
-    if not hf_token:
-        raise ValueError("py_annote_hf_token environment variable is not set. Please check your Hugging Face Space's Variables and secrets section.")
-    model = Model.from_pretrained(model_name, use_auth_token=hf_token)
-    model.eval()  # Set the model to evaluation mode
     waveform, sample_rate = torchaudio.load(audio_path)
-    print(f"Sample rate: {sample_rate}")
-    print(f"Waveform shape: {waveform.shape}")
-    # Convert stereo to mono if necessary
-    if waveform.shape[0] == 2:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
     embeddings = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
         start_frame = int(turn.start * sample_rate)
         end_frame = int(turn.end * sample_rate)
         segment = waveform[:, start_frame:end_frame]
-        print(f"Segment shape before processing: {segment.shape}")
-        if segment.shape[1] == 0:
-            continue
-        # Ensure the segment is long enough (at least 2 seconds)
-        if segment.shape[1] < 2 * sample_rate:
-            padding = torch.zeros(1, 2 * sample_rate - segment.shape[1])
-            segment = torch.cat([segment, padding], dim=1)
-        # Ensure the segment is not too long (maximum 10 seconds)
-        if segment.shape[1] > 10 * sample_rate:
-            segment = segment[:, :10 * sample_rate]
-        print(f"Segment shape after processing: {segment.shape}")
-        with torch.no_grad():
-            embedding = model(segment)  # Pass the tensor directly, not a dictionary
-        embeddings.append({"time": turn.start, "embedding": embedding.squeeze().cpu().numpy(), "speaker": speaker})
-    return embeddings
 def align_voice_embeddings(voice_embeddings, frame_count, fps):
     aligned_embeddings = []

     return diarization
 def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embedding"):
+    model = Model.from_pretrained(model_name, use_auth_token=os.environ.get("py_annote_hf_token"))
     waveform, sample_rate = torchaudio.load(audio_path)
+    duration = waveform.shape[1] / sample_rate
     embeddings = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
         start_frame = int(turn.start * sample_rate)
         end_frame = int(turn.end * sample_rate)
         segment = waveform[:, start_frame:end_frame]
+        if segment.shape[1] > 0:
+            with torch.no_grad():
+                embedding = model(segment.to(model.device))
+            embeddings.append({"time": turn.start, "duration": turn.duration, "embedding": embedding.cpu().numpy(), "speaker": speaker})
+    # Ensure embeddings cover the entire duration
+    if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
+        embeddings.append({"time": duration, "duration": 0, "embedding": np.zeros_like(embeddings[0]['embedding']), "speaker": "silence"})
+    return embeddings, duration
 def align_voice_embeddings(voice_embeddings, frame_count, fps):
     aligned_embeddings = []