"""
Compute Voice Clarity Score from audio file
"""

import librosa
import numpy as np
from typing import Dict, Any
from .vcs import calculate_voice_clarity_score

def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model whose ``transcribe`` method returns
            a dict with ``text`` and ``segments`` keys (e.g., OpenAI Whisper).

    Returns:
        dict: A dictionary containing Voice Clarity Score and component scores.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Load audio
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")
    
    # Calculate Voice Clarity Score
    clarity_result = calculate_voice_clarity_score(y, sr, segments)
    
    # Add transcript to results
    clarity_result["transcript"] = transcript
    
    # Add word count and duration info for reference
    word_count = len(transcript.split())
    clarity_result["components"]["word_count"] = word_count
    clarity_result["components"]["duration"] = duration
    
    return clarity_result
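
# Illustrative shape of the dict returned by compute_voice_clarity_score.
# Component keys other than word_count and duration come from the .vcs
# module, so any extra ones hinted at below are placeholders:
#
#     {
#         "VCS": 78.4,
#         "insight": "...",
#         "components": {"word_count": 142, "duration": 63.0, ...},
#         "transcript": "..."
#     }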

def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Comprehensive voice quality analysis including clarity.
    
    Args:
        file_path (str): Path to the audio file
        whisper_model: Transcription model
        
    Returns:
        Dict[str, Any]: Complete voice quality analysis
    """
    # Get Voice Clarity Score
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)
    vcs = clarity_results["VCS"]
    
    # Load audio for additional analysis
    y, sr = librosa.load(file_path, sr=None)
    
    # Calculate additional voice quality metrics
    
    # Voice stability - based on pitch (F0) stability
    # pyin returns per-frame F0 estimates plus voiced flags and probabilities
    # (the latter two are unused here).
    f0, _, _ = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
    voiced_f0 = f0[~np.isnan(f0)]  # keep only frames where a pitch was detected
    
    pitch_stability = 0.0
    if voiced_f0.size > 0:
        # Coefficient of variation of F0 (lower means steadier pitch)
        mean_f0 = np.mean(voiced_f0)
        cv = np.std(voiced_f0) / mean_f0 if mean_f0 > 0 else float('inf')
        # Convert to score (0-100)
        pitch_stability = max(0, min(100, 100 - (cv * 100)))
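        # Example: cv = 0.15 (15% pitch variation) gives 100 - 15 = 85.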
    
    # Voice resonance - based on spectral bandwidth
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # Map bandwidth to a 0-100 score: ramp to 70 at 1000 Hz, peak at 100 at 2500 Hz, fall off above
    if bandwidth < 1000:
        resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
    elif bandwidth <= 2500:
        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
    else:
        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide
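    # Example: a 1750 Hz bandwidth scores 70 + (750 / 1500) * 30 = 85.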
    
    # Voice strength - based on RMS energy
    rms = np.mean(librosa.feature.rms(y=y))
    # Normalize (typical speech RMS values range from 0.01 to 0.2)
    strength_score = min(100, max(0, rms / 0.2 * 100))
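    # Example: an RMS of 0.05 maps to 0.05 / 0.2 * 100 = 25.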
    
    # Combine additional metrics
    additional_metrics = {
        "pitch_stability": pitch_stability,
        "voice_resonance": resonance_score,
        "voice_strength": strength_score
    }
    
    # Add to results
    combined_results = {
        "VCS": vcs,
        "insight": clarity_results["insight"],
        "components": {
            **clarity_results["components"],
            **additional_metrics
        },
        "transcript": clarity_results["transcript"]
    }
    
    return combined_results

# Public API of this module
__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
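
# Minimal usage sketch, not part of the module API. It assumes the
# openai-whisper package ("pip install openai-whisper") and a local
# "sample.wav" file; both are illustrative placeholders. Because this
# module uses a relative import, run it as part of its package
# (python -m <package>.<module>) rather than as a standalone script.
if __name__ == "__main__":
    import json
    import whisper

    model = whisper.load_model("base")
    report = analyze_voice_quality("sample.wav", model)
    print(f"VCS: {report['VCS']:.1f} - {report['insight']}")
    print(json.dumps(report["components"], indent=2, default=float))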