File size: 4,029 Bytes
73b6e10 fe970e3 73b6e10 fe970e3 73b6e10 fe970e3 73b6e10 fe970e3 73b6e10 fe970e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
from typing import List, Union, Optional
import os
import numpy as np
import librosa
from transformers import pipeline
# Sample rate (Hz) used when loading audio files with librosa and passed
# along with every inference call to the classification pipeline.
DEFAULT_SAMPLE_RATE = 16000
# Module-level cache for the shared Predictor. It stays None until
# get_predictor() builds the instance, so that accessor loads the model once.
_PREDICTOR_INSTANCE = None
def get_predictor():
    """
    Return the shared ``Predictor``, constructing it on the first call.

    The instance is cached in the module-level ``_PREDICTOR_INSTANCE`` so
    later calls reuse it instead of building a new predictor.

    Returns:
        Predictor: The cached predictor instance.
    """
    global _PREDICTOR_INSTANCE
    # Fast path: an earlier call already built the predictor.
    if _PREDICTOR_INSTANCE is not None:
        return _PREDICTOR_INSTANCE
    _PREDICTOR_INSTANCE = Predictor()
    return _PREDICTOR_INSTANCE
class Predictor:
    """
    Adult/child speaker classifier built on a Hugging Face
    ``audio-classification`` pipeline.

    Inputs may be audio file paths or already-decoded numpy arrays; outputs
    are integer class ids (0=child, 1=adult, -1=unknown label).
    """

    # Checkpoint loaded when the caller does not supply ``model_path``.
    DEFAULT_MODEL_ID = "bookbot/wav2vec2-adult-child-cls"

    # Lower-cased pipeline label -> numeric class id.
    LABEL_TO_ID = {"child": 0, "adult": 1}

    # Class id reported when a label is missing from LABEL_TO_ID or when the
    # pipeline returned no candidates for an item.
    UNKNOWN_ID = -1

    def __init__(self, model_path: Optional[str] = None):
        """
        Initialize the predictor with a pre-trained model.

        Args:
            model_path: Optional local path or hub id of the model to load.
                If None (or empty), ``DEFAULT_MODEL_ID`` is used.
        """
        # Honor the caller-supplied model; fall back to the default checkpoint.
        self.model = pipeline(
            "audio-classification",
            model=model_path or self.DEFAULT_MODEL_ID,
        )

    def preprocess(
        self, input_item: Union[str, os.PathLike, np.ndarray]
    ) -> np.ndarray:
        """
        Turn one input item into a numpy array of audio samples.

        Args:
            input_item: A file path (``str`` or path-like object) or a numpy
                array of audio data. Arrays are returned unchanged.

        Returns:
            np.ndarray: Audio samples. Files are loaded by librosa with
            ``sr=DEFAULT_SAMPLE_RATE``.

        Raises:
            ValueError: If the input type is unsupported.
        """
        if isinstance(input_item, np.ndarray):
            return input_item
        if isinstance(input_item, (str, os.PathLike)):
            audio, _ = librosa.load(input_item, sr=DEFAULT_SAMPLE_RATE)
            return audio
        raise ValueError(f"Unsupported input type: {type(input_item)}")

    def predict(
        self, input_list: List[Union[str, os.PathLike, np.ndarray]]
    ) -> List[int]:
        """
        Predict speaker type (child=0, adult=1) for a list of audio inputs.

        Args:
            input_list: Inputs to classify, each a file path or numpy array.

        Returns:
            List[int]: One prediction per input, in input order
            (0=child, 1=adult, -1=unknown). Empty input yields ``[]``.

        Raises:
            ValueError: If any input has an unsupported type.
        """
        # Preprocess everything first so a bad input fails before inference.
        processed = [self.preprocess(item) for item in input_list]
        if not processed:
            # Nothing to classify; skip the model call entirely.
            return []
        # NOTE(review): the pipeline may ignore this keyword and treat raw
        # arrays as already being at the model's rate -- confirm against the
        # installed transformers version.
        preds = self.model(processed, sampling_rate=DEFAULT_SAMPLE_RATE)
        return self._decode_predictions(preds)

    @classmethod
    def _decode_predictions(cls, preds) -> List[int]:
        """Map raw pipeline outputs to class ids, preserving order."""
        results = []
        for pred in preds:
            # An item may be a list of candidate dicts (top-k); use the first.
            if isinstance(pred, list):
                if not pred:
                    # No candidates for this item: report unknown, don't crash.
                    results.append(cls.UNKNOWN_ID)
                    continue
                pred = pred[0]
            label = pred["label"].lower()
            results.append(cls.LABEL_TO_ID.get(label, cls.UNKNOWN_ID))
        return results
# Usage:
# predictor = Predictor("path/to/model")
# predictions = predictor.predict(list_of_inputs)
def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> List[str]:
    """
    Label each audio segment with a speaker ID.

    Args:
        audio_list: Audio inputs, each either a file path or a numpy array
            (assumed to be sampled at 16000 Hz).

    Returns:
        List[str]: One entry per input, in input order: "Speaker_id_0" for
        child, "Speaker_id_1" for adult, and "Unknown" for any other
        prediction. An empty input yields an empty list.
    """
    # Nothing to classify: return early without touching the model.
    if not audio_list:
        return []
    # The shared predictor keeps the model from being reloaded per call.
    class_ids = get_predictor().predict(audio_list)
    speaker_ids = []
    for class_id in class_ids:
        if class_id in (0, 1):
            speaker_ids.append(f"Speaker_id_{class_id}")
        else:
            speaker_ids.append("Unknown")
    return speaker_ids
# you don't have to implement this function
def assign_speaker(session_id: str):
    """
    Placeholder for session-level speaker assignment (not implemented).

    Args:
        session_id: Identifier of the session; currently unused.

    Returns:
        None: Always.
    """
    return None