Spaces:

Sven33
/

SATEv1.5

Runtime error

TomRoma committed on Aug 11

Commit

73b6e10

1 Parent(s): fe970e3

Enhance speaker identification functionality, add comprehensive tests for audio inputs, and update requirements.txt

Browse files

Files changed (3) hide show

requirements.txt +3 -1
speaker/speaker_identification.py +111 -9
test_eval_speaker_identification.py +133 -0

requirements.txt CHANGED Viewed

@@ -20,4 +20,6 @@ scipy>=1.7.0
 matplotlib>=3.3.0
 seaborn>=0.11.0
-# install ffmpeg

 matplotlib>=3.3.0
 seaborn>=0.11.0
+# install ffmpeg
+librosa>=0.8.0
+transformers>=4.0.0

speaker/speaker_identification.py CHANGED Viewed

@@ -1,16 +1,118 @@
-from typing import List
-# Assigns speaker IDs for a list of audio segments.
-# Args:
-# audio_list (List): List of audio (list of file path or list of nparray, assume sampling rate = 16000)
-# Returns:
-# List[str]: List of speaker IDs corresponding to each audio segment
-def assign_speaker_for_audio_list(audio_list: List) -> List[str]:
-    speaker_ids = []
     return speaker_ids

+from typing import List, Union, Optional
+import os
+import numpy as np
+import librosa
+from transformers import pipeline
+# Sample rate (Hz) that file inputs are resampled to on load and that is passed along to the classifier call
+DEFAULT_SAMPLE_RATE = 16000
+# Module-level cache for the shared Predictor; stays None until get_predictor() first builds it
+_PREDICTOR_INSTANCE = None
+def get_predictor():
+    """
+    Return the shared Predictor, constructing it on first use.
+    The instance is cached in the module-level ``_PREDICTOR_INSTANCE`` so that
+    later calls reuse it instead of building a new Predictor.
+    Returns:
+        Predictor: The cached Predictor instance.
+    """
+    global _PREDICTOR_INSTANCE
+    # Fast path: an earlier call already built the predictor
+    if _PREDICTOR_INSTANCE is not None:
+        return _PREDICTOR_INSTANCE
+    _PREDICTOR_INSTANCE = Predictor()
+    return _PREDICTOR_INSTANCE
+class Predictor:
+    """
+    Adult/child speaker classifier built on a Hugging Face audio-classification pipeline.
+    """
+    # Model loaded when the caller does not supply model_path
+    DEFAULT_MODEL = "bookbot/wav2vec2-adult-child-cls"
+    # Lower-cased pipeline label -> numeric class id returned by predict()
+    LABEL_MAP = {"child": 0, "adult": 1}
+    def __init__(self, model_path: Optional[str] = None):
+        """
+        Initialize the predictor with a pre-trained model.
+        Args:
+            model_path: Optional path (or model id) of the model to load.
+                If None or empty, DEFAULT_MODEL is used.
+        """
+        # Honor model_path: it used to be accepted and documented but never used
+        self.model = pipeline("audio-classification", model=model_path or self.DEFAULT_MODEL)
+    def preprocess(self, input_item: Union[str, np.ndarray]) -> np.ndarray:
+        """
+        Preprocess an input item (either file path or numpy array).
+        Args:
+            input_item: Either a file path string or a numpy array of audio data.
+        Returns:
+            np.ndarray: Audio samples. File inputs are loaded at DEFAULT_SAMPLE_RATE;
+                array inputs are returned unchanged.
+        Raises:
+            ValueError: If input type is unsupported.
+        """
+        if isinstance(input_item, str):
+            # Load (and resample) the audio file into a numpy array
+            audio, _ = librosa.load(input_item, sr=DEFAULT_SAMPLE_RATE)
+            return audio
+        if isinstance(input_item, np.ndarray):
+            # Arrays are passed through untouched; the caller is trusted on sample rate
+            return input_item
+        raise ValueError(f"Unsupported input type: {type(input_item)}")
+    def predict(self, input_list: List[Union[str, np.ndarray]]) -> List[int]:
+        """
+        Predict speaker type (child=0, adult=1) for a list of audio inputs.
+        Args:
+            input_list: List of inputs, either file paths or numpy arrays.
+        Returns:
+            List[int]: One prediction per input, in input order
+                (0=child, 1=adult, -1=label not in LABEL_MAP). Empty for empty input.
+        Raises:
+            ValueError: If any input has an unsupported type.
+        """
+        # Nothing to classify: return early instead of invoking the pipeline on an empty batch
+        if not input_list:
+            return []
+        # Preprocess all inputs first so a bad item fails before any inference runs
+        processed = [self.preprocess(item) for item in input_list]
+        # Batch inference
+        preds = self.model(processed, sampling_rate=DEFAULT_SAMPLE_RATE)
+        results = []
+        for pred in preds:
+            # pred can be a list of dicts (top-k); its first entry is taken as the top prediction
+            top = pred[0] if isinstance(pred, list) else pred
+            results.append(self.LABEL_MAP.get(top["label"].lower(), -1))  # -1 for unknown label
+        return results
+# Usage:
+# predictor = Predictor("path/to/model")
+# predictions = predictor.predict(list_of_inputs)
+def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> List[str]:
+    """
+    Label each audio segment with a speaker ID.
+    Args:
+        audio_list: Audio inputs, each either a file path or a numpy array
+                   (arrays are assumed to be sampled at 16000 Hz).
+    Returns:
+        List[str]: One ID per input, in input order: "Speaker_id_0" for child,
+                 "Speaker_id_1" for adult, "Unknown" for any other prediction.
+    """
+    # Empty input: answer immediately without creating the predictor
+    if not audio_list:
+        return []
+    # The shared predictor yields 0 (child) or 1 (adult) per segment
+    numeric_labels = get_predictor().predict(audio_list)
+    speaker_ids = []
+    for label in numeric_labels:
+        if label in (0,1):
+            speaker_ids.append(f"Speaker_id_{label}")
+        else:
+            speaker_ids.append("Unknown")
     return speaker_ids

test_eval_speaker_identification.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import os
+import numpy as np
+import librosa
+from speaker.speaker_identification import assign_speaker_for_audio_list
+# Fixture locations, all resolved relative to the directory containing this test file
+TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Test_data_for_clas_Idef')
+AUDIO_FILES_DIR = os.path.join(TEST_DATA_DIR, 'enni_audio_files')
+NUMPY_FILES_DIR = os.path.join(TEST_DATA_DIR, 'enni_testset_numpy_minimal')
+FILEPATHS_DIR = os.path.join(TEST_DATA_DIR, 'enni_testset_filepaths_minimal')
+def generate_fake_audio_test_set(num_samples=10, length=16000, random_seed=42):
+    """
+    Build a reproducible list of synthetic audio signals (Gaussian noise arrays).
+    Args:
+        num_samples (int): How many signals to generate.
+        length (int): Number of samples per signal (e.g., 16000 = 1 second at 16kHz).
+        random_seed (int): Seed applied to NumPy's global random state.
+    Returns:
+        List[np.ndarray]: The generated signals, each of shape (length,).
+    """
+    # Seeding the global RNG makes repeated calls with the same seed return identical data
+    np.random.seed(random_seed)
+    fake_signals = []
+    for _ in range(num_samples):
+        fake_signals.append(np.random.randn(length))
+    return fake_signals
+def test_file_paths():
+    """
+    Run speaker assignment on every real audio file, passed as file paths.
+    Files in AUDIO_FILES_DIR named 'child_*.wav' are expected to map to
+    "Speaker_id_0" and 'adult_*.wav' to "Speaker_id_1". Prints the first five
+    results and the overall accuracy; it reports only and asserts nothing.
+    """
+    audio_dir = AUDIO_FILES_DIR
+    # List the directory once and sort: os.listdir order is arbitrary, and the
+    # first five results are printed, so sorting keeps the output reproducible
+    wav_names = sorted(name for name in os.listdir(audio_dir) if name.endswith('.wav'))
+    child_files = [os.path.join(audio_dir, name) for name in wav_names if name.startswith('child_')]
+    adult_files = [os.path.join(audio_dir, name) for name in wav_names if name.startswith('adult_')]
+    # Create list with known order: all child files, then all adult files
+    audio_list = child_files + adult_files
+    # Ground truth follows from how the list was built; searching the full path for
+    # "child_" would mislabel adult files whenever a parent directory contains it
+    expected_ids = ["Speaker_id_0"] * len(child_files) + ["Speaker_id_1"] * len(adult_files)
+    # Get speaker IDs
+    speaker_ids = assign_speaker_for_audio_list(audio_list)
+    # Print results
+    print("\n--- Testing with file paths ---")
+    print(f"Testing {len(audio_list)} audio files: {len(child_files)} child files and {len(adult_files)} adult files")
+    # Count correct predictions
+    correct = 0
+    for i, (file, speaker_id, expected) in enumerate(zip(audio_list, speaker_ids, expected_ids)):
+        is_correct = speaker_id == expected
+        correct += 1 if is_correct else 0
+        # Print only the first 5 examples to avoid cluttering the output
+        if i < 5:
+            print(f"{i+1}. {os.path.basename(file)}: {speaker_id} (Expected: {expected}) {'✓' if is_correct else '✗'}")
+    # Print accuracy (guarding against an empty directory)
+    accuracy = correct / len(audio_list) * 100 if audio_list else 0
+    print(f"Accuracy: {correct}/{len(audio_list)} ({accuracy:.2f}%)")
+def test_numpy_arrays():
+    """
+    Run speaker assignment on every real audio file, passed as NumPy arrays.
+    Each 'child_*.wav' / 'adult_*.wav' file in AUDIO_FILES_DIR is loaded at
+    16000 Hz and classified from the array. Prints the first five results and
+    the overall accuracy; it reports only and asserts nothing.
+    """
+    audio_dir = AUDIO_FILES_DIR
+    # List the directory once and sort: os.listdir order is arbitrary, and the
+    # first five results are printed, so sorting keeps the output reproducible
+    wav_names = sorted(name for name in os.listdir(audio_dir) if name.endswith('.wav'))
+    child_files = [os.path.join(audio_dir, name) for name in wav_names if name.startswith('child_')]
+    adult_files = [os.path.join(audio_dir, name) for name in wav_names if name.startswith('adult_')]
+    # Load as arrays
+    child_arrays = [librosa.load(f, sr=16000)[0] for f in child_files]
+    adult_arrays = [librosa.load(f, sr=16000)[0] for f in adult_files]
+    # Create list with known order: all child arrays, then all adult arrays
+    audio_list = child_arrays + adult_arrays
+    filenames = [os.path.basename(f) for f in child_files + adult_files]
+    # Ground truth follows from how the list was built; a substring test for
+    # "child_" would mislabel a file such as 'adult_child_x.wav'
+    expected_ids = ["Speaker_id_0"] * len(child_arrays) + ["Speaker_id_1"] * len(adult_arrays)
+    # Get speaker IDs
+    speaker_ids = assign_speaker_for_audio_list(audio_list)
+    # Print results
+    print("\n--- Testing with NumPy arrays ---")
+    print(f"Testing {len(audio_list)} audio arrays: {len(child_arrays)} child arrays and {len(adult_arrays)} adult arrays")
+    # Count correct predictions
+    correct = 0
+    for i, (filename, speaker_id, expected) in enumerate(zip(filenames, speaker_ids, expected_ids)):
+        is_correct = speaker_id == expected
+        correct += 1 if is_correct else 0
+        # Print only the first 5 examples to avoid cluttering the output
+        if i < 5:
+            print(f"{i+1}. {filename} (as array): {speaker_id} (Expected: {expected}) {'✓' if is_correct else '✗'}")
+    # Print accuracy (guarding against an empty directory)
+    accuracy = correct / len(audio_list) * 100 if audio_list else 0
+    print(f"Accuracy: {correct}/{len(audio_list)} ({accuracy:.2f}%)")
+if __name__ == "__main__":
+    # Smoke test: classify a few synthetic noise signals and show the raw output
+    print("--- Testing with synthetic data ---")
+    synthetic_ids = assign_speaker_for_audio_list(generate_fake_audio_test_set(num_samples=5))
+    print(f"Synthetic data predictions: {synthetic_ids}")
+    # Real-data checks are best-effort: a failure in one is reported and the next still runs
+    for description, check in (("file paths", test_file_paths), ("NumPy arrays", test_numpy_arrays)):
+        try:
+            check()
+        except Exception as e:
+            print(f"Error testing {description}: {e}")