Spaces:
Runtime error
Runtime error
import numpy as np | |
import librosa | |
from sklearn.cluster import DBSCAN | |
def extract_voice_features(audio_path, fps, video_duration):
    """Compute one averaged MFCC vector per video frame.

    The audio is loaded with librosa's default settings and 13 MFCCs are
    computed. Each MFCC column covers ``hop_length`` audio samples, so video
    frame boundaries are converted from time to MFCC column indices before
    averaging (indexing the MFCC matrix directly with sample offsets would
    make every segment roughly ``hop_length`` times too long).

    Args:
        audio_path: Path of the audio file to analyse.
        fps: Video frame rate in frames per second; must be positive.
        video_duration: Video length in seconds.

    Returns:
        Array of shape ``(n_frames, 13)`` where ``n_frames`` is at most
        ``int(fps * video_duration)``; fewer rows are returned when the audio
        ends before the video does. Shape ``(0, 13)`` if nothing is produced.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be positive")

    n_mfcc = 13
    # librosa's default hop, made explicit because the column/time mapping
    # below depends on it.
    hop_length = 512

    # Load the audio file
    y, sr = librosa.load(audio_path)

    # Extract MFCC features; result has shape (n_mfcc, n_columns)
    mfccs = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length
    )
    n_columns = mfccs.shape[1]

    # Calculate the total number of video frames
    total_frames = int(fps * video_duration)

    # Number of MFCC columns spanned by one video frame (may be fractional).
    # Boundaries are derived from the exact ratio per frame so rounding
    # errors do not accumulate over long videos.
    columns_per_frame = sr / (fps * hop_length)

    # Segment the MFCCs to align with video frames
    segments = []
    for i in range(total_frames):
        start = int(i * columns_per_frame)
        if start >= n_columns:
            # Audio is shorter than the video; no more data to align.
            break
        # Always take at least one column, and never run past the end.
        end = min(max(int((i + 1) * columns_per_frame), start + 1), n_columns)
        segments.append(mfccs[:, start:end].mean(axis=1))

    if not segments:
        # Keep a 2-D shape so downstream code can rely on the column count.
        return np.empty((0, n_mfcc), dtype=mfccs.dtype)
    return np.array(segments)
def cluster_voices(features):
    """Assign a DBSCAN cluster label to every feature vector.

    Inputs with fewer than two rows are not clustered; every row is given
    label 0. The same fallback is applied when DBSCAN marks every row as
    noise (label -1).

    Args:
        features: Sequence or array of feature vectors, one per row.

    Returns:
        Integer array with one label per row of ``features``.
    """
    n_rows = len(features)

    # Too little data to cluster: treat everything as a single voice.
    if n_rows < 2:
        print("Not enough voice segments for clustering. Assigning all to one cluster.")
        return np.zeros(n_rows, dtype=int)

    labels = DBSCAN(eps=0.5, min_samples=5, metric='euclidean').fit_predict(features)

    # Anything other than "all noise" is returned untouched.
    if not np.all(labels == -1):
        return labels

    print("DBSCAN assigned all to noise. Considering as one cluster.")
    return np.zeros(n_rows, dtype=int)
def get_most_frequent_voice(features, clusters):
    """Return the feature rows belonging to the most populated cluster.

    The label -1 (DBSCAN's noise marker) is ignored when at least one other
    label is present, so unclustered points cannot be reported as the
    dominant voice. If every label is -1, those rows are returned as-is.
    Ties are resolved in favour of the smallest label.

    Args:
        features: Array of feature vectors, one per row.
        clusters: Cluster label for each row of ``features``.

    Returns:
        The rows of ``features`` whose label is the most frequent one; an
        empty slice of ``features`` when ``clusters`` is empty.
    """
    features = np.asarray(features)
    clusters = np.asarray(clusters)

    # Nothing to count: avoid taking the maximum of an empty collection.
    if clusters.size == 0:
        return features[:0]

    # Single pass label count instead of list.count() per unique label.
    labels, counts = np.unique(clusters, return_counts=True)

    is_voice = labels != -1
    if is_voice.any():
        labels, counts = labels[is_voice], counts[is_voice]

    largest_cluster = labels[np.argmax(counts)]
    return features[clusters == largest_cluster]
def process_audio(audio_path, fps, video_duration):
    """Run feature extraction, clustering and dominant-voice selection.

    Args:
        audio_path: Path of the audio file to analyse.
        fps: Video frame rate, forwarded to ``extract_voice_features``.
        video_duration: Video length, forwarded to ``extract_voice_features``.

    Returns:
        A tuple ``(most_frequent_voice, features, clusters)``: the rows picked
        by ``get_most_frequent_voice``, the full feature array, and the label
        array produced by ``cluster_voices``.
    """
    per_frame_features = extract_voice_features(audio_path, fps, video_duration)
    labels = cluster_voices(per_frame_features)
    dominant_voice = get_most_frequent_voice(per_frame_features, labels)
    return dominant_voice, per_frame_features, labels