from typing import List, Union, Optional
import os

import numpy as np
import librosa
from transformers import pipeline

# Default sample rate for audio processing
DEFAULT_SAMPLE_RATE = 16000

# Singleton pattern to avoid loading the model multiple times
_PREDICTOR_INSTANCE = None

def get_predictor():
    """
    Get or create the singleton predictor instance.
    Returns:
        Predictor: A shared instance of the Predictor class.
    """
    global _PREDICTOR_INSTANCE
    if _PREDICTOR_INSTANCE is None:
        _PREDICTOR_INSTANCE = Predictor()
    return _PREDICTOR_INSTANCE
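
# Example (sketch): get_predictor() returns a shared instance, so the
# underlying Hugging Face model is loaded only once per process:
#
#     p1 = get_predictor()
#     p2 = get_predictor()
#     assert p1 is p2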


class Predictor:
    def __init__(self, model_path: Optional[str] = None):
        """
        Initialize the predictor with a pre-trained model.
        
        Args:
            model_path: Optional path to a local model. If None, uses the default HuggingFace model.
        """
        # Load the Hugging Face audio-classification pipeline; fall back to the
        # default pretrained checkpoint when no local model path is given.
        model_name = model_path if model_path is not None else "bookbot/wav2vec2-adult-child-cls"
        self.model = pipeline("audio-classification", model=model_name)

    def preprocess(self, input_item: Union[str, np.ndarray]) -> np.ndarray:
        """
        Preprocess an input item (either file path or numpy array).
        
        Args:
            input_item: Either a file path string or a numpy array of audio data.
            
        Returns:
            np.ndarray: Processed audio data as a numpy array.
            
        Raises:
            ValueError: If input type is unsupported.
        """
        if isinstance(input_item, str):
            # Load the audio file as a mono waveform, resampled to 16 kHz
            audio, _ = librosa.load(input_item, sr=DEFAULT_SAMPLE_RATE)
            return audio
        elif isinstance(input_item, np.ndarray):
            return input_item
        else:
            raise ValueError(f"Unsupported input type: {type(input_item)}")

    def predict(self, input_list: List[Union[str, np.ndarray]]) -> List[int]:
        """
        Predict speaker type (child=0, adult=1) for a list of audio inputs.
        
        Args:
            input_list: List of inputs, either file paths or numpy arrays.
            
        Returns:
            List[int]: List of predictions (0=child, 1=adult, -1=unknown).
        """
        # Preprocess all inputs into raw 16 kHz waveforms first
        processed = [self.preprocess(item) for item in input_list]

        # Batch inference: wrap each waveform in a dict carrying its sampling
        # rate, the input format the audio pipelines accept for numpy arrays
        batch = [{"raw": audio, "sampling_rate": DEFAULT_SAMPLE_RATE} for audio in processed]
        preds = self.model(batch)
        
        # Map label to 0 (child) or 1 (adult)
        label_map = {
            "child": 0,
            "adult": 1
        }
        
        results = []
        for pred in preds:
            # pred can be a list of dicts (top-k), take the top prediction
            if isinstance(pred, list):
                label = pred[0]["label"]
            else:
                label = pred["label"]
            results.append(label_map.get(label.lower(), -1))  # -1 for unknown label
        return results

# Usage:
# predictor = Predictor("path/to/model")
# predictions = predictor.predict(list_of_inputs)

def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> List[str]:
    """
    Assigns speaker IDs for a list of audio segments.
    
    Args:
        audio_list: List of audio inputs (either file paths or numpy arrays, 
                   assumed to have sampling rate = 16000).
        
    Returns:
        List[str]: List of speaker IDs corresponding to each audio segment:
                   "Speaker_id_0" for child, "Speaker_id_1" for adult, or
                   "Unknown" for an unrecognized label.
    """
    if not audio_list:
        return []
        
    # Use singleton predictor to avoid reloading model
    predictor = get_predictor()
    
    # Get list of 0 (child) or 1 (adult)
    numeric_labels = predictor.predict(audio_list)
    
    # Map to "Speaker_id_0" / "Speaker_id_1", preserving order; unrecognized
    # labels (-1) are reported as "Unknown"
    speaker_ids = [
        f"Speaker_id_{label}" if label in (0, 1) else "Unknown"
        for label in numeric_labels
    ]
    return speaker_ids


# You don't have to implement this function.
def assign_speaker(session_id: str):
    return
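

# Minimal usage sketch. The file names below are hypothetical placeholders for
# 16 kHz mono WAV segments; swap in real paths (or numpy waveforms) to run it.
if __name__ == "__main__":
    example_segments = [
        "segment_000.wav",  # hypothetical child utterance
        "segment_001.wav",  # hypothetical adult utterance
    ]
    speaker_ids = assign_speaker_for_audio_list(example_segments)
    print(speaker_ids)  # e.g. ["Speaker_id_0", "Speaker_id_1"]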