"""
Compute Voice Clarity Score from audio file
"""

import librosa
import numpy as np
from typing import Dict, Any
from .vcs import calculate_voice_clarity_score

def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model whose ``transcribe`` method returns
            a dict with ``text`` and ``segments`` keys (e.g., OpenAI Whisper).

    Returns:
        dict: A dictionary containing Voice Clarity Score and component scores.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Load audio
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")
    
    # Calculate Voice Clarity Score
    clarity_result = calculate_voice_clarity_score(y, sr, segments)
    
    # Add transcript to results
    clarity_result["transcript"] = transcript
    
    # Add word count and duration info for reference
    word_count = len(transcript.split())
    clarity_result["components"]["word_count"] = word_count
    clarity_result["components"]["duration"] = duration
    
    return clarity_result
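
# Illustrative shape of the dict returned by compute_voice_clarity_score.
# Component keys other than word_count and duration come from the .vcs
# module, so any extra ones hinted at below are placeholders:
#
#     {
#         "VCS": 78.4,
#         "insight": "...",
#         "components": {"word_count": 142, "duration": 63.0, ...},
#         "transcript": "..."
#     }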

def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Comprehensive voice quality analysis including clarity.
    
    Args:
        file_path (str): Path to the audio file
        whisper_model: Transcription model
        
    Returns:
        Dict[str, Any]: Complete voice quality analysis
    """
    # Get Voice Clarity Score
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)
    vcs = clarity_results["VCS"]
    
    # Load audio for additional analysis
    y, sr = librosa.load(file_path, sr=None)
    
    # Calculate additional voice quality metrics
    
    # Voice stability - based on pitch (F0) stability
    # pyin returns per-frame F0 estimates plus voiced flags and probabilities
    # (the latter two are unused here).
    f0, _, _ = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
    voiced_f0 = f0[~np.isnan(f0)]  # keep only frames where a pitch was detected
    
    pitch_stability = 0.0
    if voiced_f0.size > 0:
        # Coefficient of variation of F0 (lower means steadier pitch)
        mean_f0 = np.mean(voiced_f0)
        cv = np.std(voiced_f0) / mean_f0 if mean_f0 > 0 else float('inf')
        # Convert to score (0-100)
        pitch_stability = max(0, min(100, 100 - (cv * 100)))
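        # Example: cv = 0.15 (15% pitch variation) gives 100 - 15 = 85.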
    
    # Voice resonance - based on spectral bandwidth
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # Map bandwidth to a 0-100 score: ramp to 70 at 1000 Hz, peak at 100 at 2500 Hz, fall off above
    if bandwidth < 1000:
        resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
    elif bandwidth <= 2500:
        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
    else:
        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide
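    # Example: a 1750 Hz bandwidth scores 70 + (750 / 1500) * 30 = 85.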
    
    # Voice strength - based on RMS energy
    rms = np.mean(librosa.feature.rms(y=y))
    # Normalize (typical speech RMS values range from 0.01 to 0.2)
    strength_score = min(100, max(0, rms / 0.2 * 100))
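    # Example: an RMS of 0.05 maps to 0.05 / 0.2 * 100 = 25.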
    
    # Combine additional metrics
    additional_metrics = {
        "pitch_stability": pitch_stability,
        "voice_resonance": resonance_score,
        "voice_strength": strength_score
    }
    
    # Add to results
    combined_results = {
        "VCS": vcs,
        "insight": clarity_results["insight"],
        "components": {
            **clarity_results["components"],
            **additional_metrics
        },
        "transcript": clarity_results["transcript"]
    }
    
    return combined_results

# Public API of this module
__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
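
# Minimal usage sketch, not part of the module API. It assumes the
# openai-whisper package ("pip install openai-whisper") and a local
# "sample.wav" file; both are illustrative placeholders. Because this
# module uses a relative import, run it as part of its package
# (python -m <package>.<module>) rather than as a standalone script.
if __name__ == "__main__":
    import json
    import whisper

    model = whisper.load_model("base")
    report = analyze_voice_quality("sample.wav", model)
    print(f"VCS: {report['VCS']:.1f} - {report['insight']}")
    print(json.dumps(report["components"], indent=2, default=float))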