# Hugging Face Space: Natwar/VoiceAnalysis (status: Sleeping) - file size: 34,751 bytes - revision 343474c

import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

def install_package(package, version=None):
    """Install *package* with pip, pinned to *version* when one is given.

    The install runs through the current interpreter with
    ``pip install --no-cache-dir``. A failing install is reported on stdout
    and the ``subprocess.CalledProcessError`` is re-raised.
    """
    if version:
        requirement = f"{package}=={version}"
    else:
        requirement = package
    print(f"Installing {requirement}...")

    pip_command = [sys.executable, "-m", "pip", "install", "--no-cache-dir", requirement]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as err:
        print(f"Failed to install {requirement}: {err}")
        raise

# Packages the app imports below, mapped to an optional version pin
# (None means "install whatever pip resolves").
required_packages = {
    "gradio": None,
    "torch": None,
    "torchaudio": None,
    "transformers": None,
    "librosa": None,
    "scipy": None,
    "matplotlib": None,
    "pydub": None,
    "plotly": None
}

# Collect the keys of every distribution pkg_resources reports as installed
installed_packages = set()
for pkg in pkg_resources.working_set:
    installed_packages.add(pkg.key)

# Install only what is missing
for package, version in required_packages.items():
    if package in installed_packages:
        continue
    install_package(package, version)

# Now import necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for any fallback
from pydub import AudioSegment
import scipy
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import plotly.graph_objects as go

# Emotion label -> human-readable description shown in the summary and the
# detailed results table. The key order also fixes the trace order in the charts.
EMOTION_DESCRIPTIONS = dict(
    angry="Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    disgust="Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    fear="Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    happy="Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    neutral="Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    sad="Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    surprise="Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
)

# Tone group -> emotion labels that belong to it
TONE_MAPPING = dict(
    positive=["happy", "surprise"],
    neutral=["neutral"],
    negative=["angry", "sad", "fear", "disgust"],
)

# Module-level cache for the emotion classification pipeline (None until loaded)
audio_emotion_classifier = None

def load_emotion_model():
    """Make sure the speech emotion classifier is loaded, reusing the cached one.

    Returns True when the pipeline is available (already cached or freshly
    created) and False when creating it raised; the error is printed rather
    than propagated.
    """
    global audio_emotion_classifier
    if audio_emotion_classifier is not None:
        return True

    try:
        print("Loading emotion classification model...")
        checkpoint = "superb/hubert-large-superb-er"
        audio_emotion_classifier = pipeline("audio-classification", model=checkpoint)
        print("Emotion classification model loaded successfully")
    except Exception as err:
        print(f"Error loading emotion model: {err}")
        return False
    return True

def convert_audio_to_wav(audio_file):
    """Convert an audio file to a temporary WAV file.

    Args:
        audio_file: Path of the source audio, in any format pydub can decode.

    Returns:
        Path of the newly created WAV file, or None when decoding or
        exporting failed (the error is printed). The caller owns the
        returned file and is responsible for deleting it.
    """
    wav_path = None
    try:
        audio = AudioSegment.from_file(audio_file)
        # Reserve a temp path and close our own handle before exporting, so
        # the export never writes to a file this process still holds open.
        fd, wav_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        # Do not leave an empty or partial temp file behind on failure.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        return None

def analyze_voice_tone(audio_file):
    """
    Describe the tone of a recording from its pitch, energy and zero-crossing statistics.

    Args:
        audio_file: Path of the audio file to analyze (loaded with librosa at 16 kHz).

    Returns:
        Markdown text with the primary tone quality, a bullet list of tone
        characteristics and an interpretation. A short message is returned
        instead when the audio is shorter than one second, when fewer than
        10 pitched frames are found, or when processing raises an error.
    """
    try:
        audio_data, sample_rate = librosa.load(audio_file, sr=16000)

        # 1. Reject clips that are too short
        # NOTE(review): the cut-off is 1 second although the message asks for 3.
        if librosa.get_duration(y=audio_data, sr=sample_rate) < 1.0:
            return "Audio too short for reliable tone analysis. Please provide at least 3 seconds."

        # 2. Pitch track; only the f0 contour is used, the voicing outputs are ignored
        f0, _, _ = librosa.pyin(
            audio_data,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sample_rate
        )

        # Keep only frames where a pitch value was estimated
        valid_f0 = f0[~np.isnan(f0)]

        # If hardly any pitch was detected, the input may be noise or silence
        if len(valid_f0) < 10:
            return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."

        # 3. Pitch variability as a coefficient of variation (std relative to
        # the mean), which is comparable across low and high voices
        mean_pitch = np.mean(valid_f0)
        pitch_cv = (np.std(valid_f0) / mean_pitch) * 100 if mean_pitch > 0 else 0

        # 4. Energy/volume variability, measured the same way
        rms_energy = librosa.feature.rms(y=audio_data)[0]
        mean_energy = np.mean(rms_energy)
        energy_cv = (np.std(rms_energy) / mean_energy) * 100 if mean_energy > 0 else 0

        # 5. Zero-crossing rate, used as a rough proxy for speaking rate
        mean_zcr = np.mean(librosa.feature.zero_crossing_rate(audio_data)[0])

        # 6. Tone classification from pitch variability
        tone_details = []
        if pitch_cv < 5:
            tone_class = "Monotone"
            tone_details.append("Very little pitch variation - sounds flat and unexpressive")
        elif pitch_cv < 12:
            tone_class = "Steady"
            tone_details.append("Moderate pitch variation - sounds controlled and measured")
        elif pitch_cv < 20:
            tone_class = "Expressive"
            tone_details.append("Good pitch variation - sounds naturally engaging")
        else:
            tone_class = "Highly Dynamic"
            tone_details.append("Strong pitch variation - sounds animated and emphatic")

        # Pitch height
        if mean_pitch > 180:
            tone_details.append("Higher pitched voice - may convey excitement or tension")
        elif mean_pitch < 120:
            tone_details.append("Lower pitched voice - may convey calmness or authority")
        else:
            tone_details.append("Mid-range pitch - typically perceived as balanced")

        # Volume consistency (values between 10 and 30 add no remark)
        if energy_cv < 10:
            tone_details.append("Consistent volume - sounds controlled and measured")
        elif energy_cv > 30:
            tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")

        # Speech rate approximation (values between 0.05 and 0.1 add no remark)
        if mean_zcr > 0.1:
            tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
        elif mean_zcr < 0.05:
            tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")

        # 7. Interpretation text for each tone class
        interpretations = {
            "Monotone": ("A monotone delivery can create distance and reduce engagement. "
                         "Consider adding more vocal variety to sound more engaging and authentic."),
            "Steady": ("Your steady tone suggests reliability and control. "
                       "This can be effective in professional settings or when conveying serious information."),
            "Expressive": ("Your expressive tone helps maintain listener interest and emphasize key points. "
                           "This naturally engaging quality helps convey authenticity and conviction."),
            "Highly Dynamic": ("Your highly dynamic vocal style conveys strong emotion and energy. "
                               "This can be powerful for storytelling and persuasion, though in some contexts "
                               "a more measured approach might be appropriate."),
        }

        # 8. Assemble the Markdown report
        parts = [
            "### Voice Tone Analysis\n\n",
            f"**Primary tone quality:** {tone_class}\n\n",
            "**Tone characteristics:**\n",
        ]
        parts.extend(f"- {detail}\n" for detail in tone_details)
        parts.append("\n**Interpretation:**\n")
        parts.append(interpretations[tone_class])
        return "".join(parts)

    except Exception as e:
        print(f"Error in tone analysis: {e}")
        return "Tone analysis unavailable due to an error processing the audio."

# The emotion checkpoint loaded by this app reports abbreviated class names;
# expand them to the full names used as keys for descriptions, colors and
# tone groups. Labels that are not listed pass through unchanged.
_MODEL_LABEL_ALIASES = {
    "neu": "neutral",
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
}


def _normalize_emotion_labels(predictions):
    """Return a copy of *predictions* with abbreviated labels expanded to full emotion names."""
    return [
        {**item, "label": _MODEL_LABEL_ALIASES.get(item["label"], item["label"])}
        for item in predictions
    ]


def _remove_quietly(path):
    """Delete *path*, ignoring the error if it is already gone or cannot be removed."""
    try:
        os.unlink(path)
    except OSError:
        pass


def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
    """
    Analyze speech emotions in short chunks,
    building a timeline of confidence for each emotion.

    Args:
        audio_file: Path of the audio to analyze; non-WAV input is converted first.
        progress: Gradio progress callback, updated once per chunk.
        chunk_duration: Length of each analyzed chunk in seconds.

    Returns:
        (Plotly figure, summary text, detailed results). On failure the
        figure and the detailed results are None and the text explains why.
    """
    if not load_emotion_model():
        return None, "Failed to load emotion classifier.", None

    # Use existing WAV if possible, else convert
    converted_path = None
    if audio_file.lower().endswith(".wav"):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
        if not audio_path:
            return None, "Could not process audio file", None
        converted_path = audio_path  # temp file created here, removed below

    try:
        # Load with librosa
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        # Use shorter chunks for more granular analysis
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions = []
        time_points = []

        # For each chunk, run emotion classification
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Skip very short chunks
            if len(chunk) < 0.5 * sample_rate:
                continue

            # Write chunk to a temp WAV. The handle is closed before writing
            # and the file is removed even when classification raises.
            fd, chunk_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            try:
                # Clip first so samples outside [-1, 1] cannot wrap around in int16
                pcm = (np.clip(chunk, -1.0, 1.0) * 32767).astype(np.int16)
                scipy.io.wavfile.write(chunk_path, sample_rate, pcm)

                # Classify - ask for up to 7 predictions per chunk
                raw_results = audio_emotion_classifier(chunk_path, top_k=7)
            finally:
                _remove_quietly(chunk_path)

            all_emotions.append(_normalize_emotion_labels(raw_results))
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # Skip if no valid emotions detected
        if not all_emotions:
            return None, "No speech detected in the audio.", None

        # Build Plotly chart with improved styling
        fig = build_plotly_line_chart(all_emotions, time_points, duration)

        # Build summary and detailed results
        summary_text = generate_emotion_summary(all_emotions)
        detailed_results = build_detailed_results(all_emotions, time_points)

        return fig, summary_text, detailed_results

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error analyzing audio: {str(e)}", None

    finally:
        # Remove the WAV produced by the conversion step, if any
        if converted_path is not None:
            _remove_quietly(converted_path)

def smooth_data(data, window_size=3):
    """Apply a moving-average smoothing and return an array as long as the input.

    Args:
        data: One-dimensional sequence of numbers.
        window_size: Number of consecutive points averaged together.

    Returns:
        NumPy float array with ``len(data)`` elements. The points lost at the
        edges of the moving average are filled by repeating the first and
        last averaged values. Input that is empty or shorter than the window
        is returned unsmoothed.
    """
    values = np.asarray(data, dtype=float)

    # With a window longer than the series there is no full overlap:
    # np.convolve would then slide the data over the kernel instead and
    # return mis-scaled values, so such input is passed through as-is.
    if window_size <= 1 or values.size < window_size:
        return values

    smoothed = np.convolve(values, np.ones(window_size) / window_size, mode='valid')

    # 'valid' mode drops window_size - 1 points; add them back, split between
    # the front and the back, using the nearest averaged value.
    padding = values.size - smoothed.size
    padding_front = padding // 2
    padding_back = padding - padding_front

    return np.concatenate([
        np.full(padding_front, smoothed[0]),
        smoothed,
        np.full(padding_back, smoothed[-1]),
    ])

def build_plotly_line_chart(all_emotions, time_points, duration):
    """
    Build the emotion timeline figure: one confidence line per emotion,
    a marker on the strongest emotion of every chunk, and a faint fill
    under each line.

    Args:
        all_emotions: Per-chunk lists of {"label", "score"} dictionaries.
        time_points: Per-chunk (start, end) times in seconds.
        duration: Total audio length; accepted but not used by the chart.

    Returns:
        The Plotly figure.
    """
    # Line color per emotion
    palette = {
        "angry": "#E53935",     # Red
        "disgust": "#8E24AA",   # Purple
        "fear": "#7B1FA2",      # Deep Purple
        "happy": "#FFC107",     # Amber/Yellow
        "neutral": "#78909C",   # Blue Grey
        "sad": "#1E88E5",       # Blue
        "surprise": "#43A047"   # Green
    }

    def translucent(hex_color):
        """Convert '#RRGGBB' into an rgba() string with 0.1 opacity."""
        digits = hex_color.lstrip('#')
        red, green, blue = (int(digits[pos:pos + 2], 16) for pos in (0, 2, 4))
        return f"rgba({red}, {green}, {blue}, 0.1)"

    labels = list(EMOTION_DESCRIPTIONS.keys())

    # Each chunk is plotted at the middle of its time range
    midpoints = [(start + end) / 2 for start, end in time_points]

    # One score series per emotion; emotions missing from a chunk count as 0.0
    chunk_scores = [
        {item["label"]: item["score"] for item in chunk}
        for chunk in all_emotions
    ]
    series = {
        name: [scores.get(name, 0.0) for scores in chunk_scores]
        for name in labels
    }

    # Smooth every series that has more than two points
    for name in labels:
        if len(series[name]) > 2:
            series[name] = smooth_data(series[name])

    fig = go.Figure()

    # One curved line per emotion
    for name in labels:
        title = name.capitalize()
        line_trace = go.Scatter(
            x=midpoints,
            y=series[name],
            mode='lines',
            name=title,
            line={
                'color': palette.get(name, None),
                'width': 3,
                'shape': 'spline',
                'smoothing': 1.3,
            },
            hovertemplate=title + ': %{y:.2f}<extra></extra>',
        )
        fig.add_trace(line_trace)

    # Marker on the highest-scoring emotion at every time point
    # (ties go to the emotion that comes first in the label order)
    marker_x, marker_y, marker_text, marker_color = [], [], [], []
    for idx, moment in enumerate(midpoints):
        winner = max(labels, key=lambda name: series[name][idx])
        winner_score = series[winner][idx]

        marker_x.append(moment)
        marker_y.append(winner_score)
        marker_text.append(f"{winner.capitalize()}: {winner_score:.2f}")
        marker_color.append(palette.get(winner, "#000000"))

    fig.add_trace(
        go.Scatter(
            x=marker_x,
            y=marker_y,
            mode='markers',
            marker={
                'size': 10,
                'color': marker_color,
                'line': {'width': 2, 'color': 'white'},
            },
            name="Dominant Emotion",
            text=marker_text,
            hoverinfo="text",
            hovertemplate='%{text}<extra></extra>'
        )
    )

    # Faint filled area under each line, hidden from the legend and hover
    for name in labels:
        area_trace = go.Scatter(
            x=midpoints,
            y=series[name],
            mode='none',
            name=f"{name.capitalize()} Area",
            fill='tozeroy',
            fillcolor=translucent(palette.get(name, '#000000')),
            showlegend=False,
            hoverinfo='skip'
        )
        fig.add_trace(area_trace)

    # Layout: fixed 0-1 score axis, light grid, horizontal legend above the plot
    grid_color = 'rgba(230, 230, 230, 0.8)'
    fig.update_layout(
        title={
            'text': "Voice Emotion Analysis Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Confidence Score",
        yaxis={'range': [0, 1.0], 'showgrid': True, 'gridcolor': grid_color},
        xaxis={'showgrid': True, 'gridcolor': grid_color},
        plot_bgcolor='white',
        legend={
            'bordercolor': 'rgba(0,0,0,0.1)',
            'borderwidth': 1,
            'orientation': "h",
            'yanchor': "bottom",
            'y': 1.02,
            'xanchor': "right",
            'x': 1,
        },
        hovermode='closest',
        height=500,
        margin={'l': 10, 'r': 10, 't': 80, 'b': 50}
    )

    return fig

def generate_alternative_chart(all_emotions, time_points):
    """
    Build a stacked area chart of the emotion scores over time.

    Args:
        all_emotions: Per-chunk lists of {"label", "score"} dictionaries.
        time_points: Per-chunk (start, end) times in seconds.

    Returns:
        The Plotly figure.
    """
    # Fill color per emotion
    palette = {
        "angry": "#F44336",     # Red
        "disgust": "#9C27B0",   # Purple
        "fear": "#673AB7",      # Deep Purple
        "happy": "#FFC107",     # Amber
        "neutral": "#607D8B",   # Blue Grey
        "sad": "#2196F3",       # Blue
        "surprise": "#4CAF50"   # Green
    }

    labels = list(EMOTION_DESCRIPTIONS.keys())

    # Each chunk is plotted at the middle of its time range
    midpoints = [(start + end) / 2 for start, end in time_points]

    # One score series per emotion; emotions missing from a chunk count as 0.0
    chunk_scores = [
        {item["label"]: item["score"] for item in chunk}
        for chunk in all_emotions
    ]
    series = {
        name: [scores.get(name, 0.0) for scores in chunk_scores]
        for name in labels
    }

    fig = go.Figure()

    # Traces sharing a stackgroup are drawn stacked on top of each other
    for name in labels:
        title = name.capitalize()
        shade = palette.get(name, None)
        fig.add_trace(
            go.Scatter(
                x=midpoints,
                y=series[name],
                mode='lines',
                name=title,
                line={'width': 0.5, 'color': shade},
                stackgroup='one',
                fillcolor=shade,
                hovertemplate=title + ': %{y:.2f}<extra></extra>'
            )
        )

    # Layout: light grid, horizontal legend above the plot
    grid_color = 'rgba(230, 230, 230, 0.8)'
    fig.update_layout(
        title={
            'text': "Voice Emotion Distribution Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Emotion Intensity",
        yaxis={'showgrid': True, 'gridcolor': grid_color},
        xaxis={'showgrid': True, 'gridcolor': grid_color},
        plot_bgcolor='white',
        legend={
            'bordercolor': 'rgba(0,0,0,0.1)',
            'borderwidth': 1,
            'orientation': "h",
            'yanchor': "bottom",
            'y': 1.02,
            'xanchor': "right",
            'x': 1,
        },
        hovermode='closest',
        height=500,
        margin={'l': 10, 'r': 10, 't': 80, 'b': 50}
    )

    return fig

def generate_emotion_summary(all_emotions):
    """
    Summarize the emotion distribution of a recording as Markdown text.

    Each chunk votes for its highest-scoring emotion. The summary reports the
    most frequent emotion, the dominant tone group, the share and average
    confidence of every winning emotion, and a one-line interpretation.

    Args:
        all_emotions: Per-chunk lists of {"label", "score"} dictionaries.

    Returns:
        The Markdown summary, or a fixed message when there are no chunks.
    """
    if not all_emotions:
        return "No emotional content detected."

    total_chunks = len(all_emotions)

    # Tally the winning emotion of every chunk and add up its confidence
    emotion_counts = {}
    confidence_totals = {}
    for predictions in all_emotions:
        best = max(predictions, key=lambda item: item['score'])
        name = best["label"]
        emotion_counts[name] = emotion_counts.get(name, 0) + 1
        confidence_totals[name] = confidence_totals.get(name, 0) + best["score"]

    # Average confidence per winning emotion
    emotion_confidence = {
        name: total / emotion_counts[name]
        for name, total in confidence_totals.items()
    }

    # Most frequent winner, and the winner with the highest average confidence
    dominant_emotion = max(emotion_counts, key=emotion_counts.get)
    dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100
    most_confident = max(emotion_confidence, key=emotion_confidence.get)

    # Chunks per tone group
    tone_group_counts = {
        group: sum(count for name, count in emotion_counts.items() if name in members)
        for group, members in TONE_MAPPING.items()
    }
    dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
    dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100

    # Interpretation sentence per dominant emotion (none for unknown labels)
    interpretations = {
        "happy": "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy.",
        "neutral": "The voice maintains an even emotional tone, suggesting composure or professional delivery.",
        "sad": "The voice conveys melancholy or disappointment, potentially indicating concern or distress.",
        "angry": "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure.",
        "fear": "The voice reveals anxiety or nervousness, suggesting uncertainty or concern.",
        "disgust": "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts.",
        "surprise": "The voice shows unexpected reactions, suggesting discovery of new information or astonishment.",
    }

    # Assemble the Markdown pieces in display order
    parts = [
        "### Voice Emotion Analysis Summary\n\n",
        f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n",
    ]

    # Mention a different, high-confidence emotion only when it stands out
    if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
        parts.append(f"**Most confident detection:** {most_confident.capitalize()} ")
        parts.append(f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n")

    parts.append(f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n")
    parts.append(f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n")

    # Distribution, most frequent first
    parts.append("**Emotion distribution:**\n")
    ranked = sorted(emotion_counts.items(), key=lambda pair: pair[1], reverse=True)
    for name, count in ranked:
        share = (count / total_chunks) * 100
        parts.append(f"- {name.capitalize()}: {share:.1f}% (confidence: {emotion_confidence[name]:.2f})\n")

    parts.append("\n**Interpretation:**\n")
    parts.append(interpretations.get(dominant_emotion, ""))

    return "".join(parts)

def build_detailed_results(all_emotions, time_points):
    """
    Build one table row per chunk for the detailed results display.

    Every row is a dictionary with the chunk's time range, its top emotion,
    the top score (followed by the runner-up in parentheses when the chunk
    has more than one prediction) and the emotion's description.
    """
    rows = []
    for predictions, span in zip(all_emotions, time_points):
        start_time, end_time = span
        best = max(predictions, key=lambda item: item['score'])
        name = best["label"]

        # Runner-up note, only when a second prediction exists
        runner_up_note = ""
        if len(predictions) > 1:
            ranked = sorted(predictions, key=lambda item: item['score'], reverse=True)
            runner_up = ranked[1]
            runner_up_note = f" ({runner_up['label'].capitalize()}: {runner_up['score']:.2f})"

        row = {
            "Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
            "Primary Emotion": name.capitalize(),
            "Confidence": f"{best['score']:.2f}{runner_up_note}",
            "Description": EMOTION_DESCRIPTIONS.get(name, ""),
        }
        rows.append(row)
    return rows

def process_audio(audio_file, progress=gr.Progress()):
    """
    Main handler for Gradio:
      1) Emotion analysis (line chart, summary text, per-chunk rows).
      2) Stacked area chart rebuilt from the per-chunk rows.
      3) Tone analysis (descriptive text).

    Args:
        audio_file: Path of the uploaded or recorded audio; may be empty.
        progress: Gradio progress callback passed on to the emotion analysis.

    Returns:
        (line figure, area figure, summary text, detailed rows, tone text).
        When nothing can be analyzed both figures and the rows are None and
        the summary slot carries the reason.
    """
    if not audio_file:
        return None, None, "No audio file provided.", None, "No tone analysis."

    # 1) Analyze emotions
    fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
    if fig is None:
        # Pass on the specific reason reported by the analysis instead of
        # hiding it behind a generic message.
        failure_message = summary_text or "Failed to analyze audio emotions."
        return None, None, failure_message, None, "Tone analysis unavailable."

    # 2) Rebuild per-chunk data for the area chart from the detailed rows.
    # The rows only carry the top two scores (formatted to two decimals), so
    # every other emotion is charted as 0.0.
    known_emotions = list(EMOTION_DESCRIPTIONS)
    time_points = []
    all_emotions = []
    for result in detailed_results:
        # "Time Range" looks like "0.0s - 2.0s"
        start_text, _, end_text = result["Time Range"].partition(" - ")
        time_points.append((float(start_text.rstrip("s")), float(end_text.rstrip("s"))))

        # "Confidence" looks like "0.75" or "0.75 (Sad: 0.25)"
        primary_text, has_secondary, secondary_text = result["Confidence"].partition("(")
        emotions_at_time = [{
            "label": result["Primary Emotion"].lower(),
            "score": float(primary_text.strip()),
        }]
        if has_secondary:
            secondary_label, _, secondary_score = secondary_text.rstrip(")").partition(":")
            emotions_at_time.append({
                "label": secondary_label.strip().lower(),
                "score": float(secondary_score.strip()),
            })

        # Add remaining emotions with zero confidence
        seen = {entry["label"] for entry in emotions_at_time}
        emotions_at_time.extend(
            {"label": emotion, "score": 0.0}
            for emotion in known_emotions
            if emotion not in seen
        )

        all_emotions.append(emotions_at_time)

    alt_fig = generate_alternative_chart(all_emotions, time_points)

    # 3) Analyze tone
    tone_analysis = analyze_voice_tone(audio_file)

    return fig, alt_fig, summary_text, detailed_results, tone_analysis

# Gradio interface: an "Upload Audio" tab, a "Record Voice" tab and a static
# "About & Help" tab. The upload and record tabs have the same layout and both
# call process_audio, whose five return values fill (in order) the line chart,
# the area chart, the emotion summary, the detailed table and the tone analysis.
with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
    # Page header
    # NOTE(review): the text here and in the About tab lists seven emotions;
    # confirm the loaded model actually reports all of them.
    gr.Markdown("""
    # 🎙️ Voice Emotion & Tone Analysis System

    This app provides professional analysis of:
    - **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
    - **Tone characteristics** (based on pitch, energy, and speech patterns)

    The interactive timeline shows emotion confidence scores throughout your audio.
    """)

    with gr.Tabs():
        # Tab 1: Upload
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                # Left column: file input, trigger button and usage hints
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"],
                        elem_id="audio_upload"
                    )
                    process_btn = gr.Button("Analyze Voice", variant="primary")
                    gr.Markdown("""
                    **Supports:** MP3, WAV, M4A, and most audio formats
                    **For best results:** Use a clear voice recording with minimal background noise
                    """)
                # Right column: the two chart views, one per sub-tab
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                      elem_id="emotion_plot",
                                                      container=True)
                        with gr.TabItem("Area Chart"):
                            emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                        elem_id="emotion_area_plot",
                                                        container=True)
            # Text outputs side by side: emotion summary and tone analysis
            with gr.Row():
                with gr.Column():
                    emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    tone_analysis_output = gr.Markdown(label="Tone Analysis")
            # Per-chunk table; headers match the keys of the rows built by build_detailed_results
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )

            # Output order must match the tuple returned by process_audio
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
            )

        # Tab 2: Record (same layout as the upload tab, with a microphone input)
        with gr.TabItem("Record Voice"):
            with gr.Row():
                # Left column: microphone input, trigger button and tips
                with gr.Column(scale=1):
                    record_input = gr.Audio(
                        label="Record Your Voice",
                        sources=["microphone"],
                        type="filepath",
                        elem_id="record_audio"
                    )
                    analyze_btn = gr.Button("Analyze Recording", variant="primary")
                    gr.Markdown("""
                    **Tips:**
                    - Speak clearly and at a normal pace
                    - Record at least 10-15 seconds for more accurate analysis
                    - Try different emotional tones to see how they're detected
                    """)
                # Right column: the two chart views, one per sub-tab
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                          elem_id="record_emotion_plot",
                                                          container=True)
                        with gr.TabItem("Area Chart"):
                            rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                           elem_id="record_emotion_area_plot",
                                                           container=True)
            # Text outputs side by side: emotion summary and tone analysis
            with gr.Row():
                with gr.Column():
                    rec_emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
            # Per-chunk table
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )

            # Output order must match the tuple returned by process_audio
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
            )

        # Tab 3: About & Help (static text only)
        # NOTE(review): the privacy notice below says all processing happens on
        # the user's device and nothing is transmitted; confirm this holds for
        # the way the app is deployed.
        with gr.TabItem("About & Help"):
            gr.Markdown("""
            ## About This System

            This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.

            ### How It Works

            1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
            2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
            3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.

            ### Emotion Categories

            The system detects seven standard emotions:

            - **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
            - **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
            - **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
            - **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
            - **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
            - **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
            - **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.

            ### Tips for Best Results

            - Use clear audio with minimal background noise
            - Speak naturally at a comfortable volume
            - Record at least 10-15 seconds of speech
            - For tone analysis, longer recordings (30+ seconds) provide more accurate results

            ### Privacy Notice

            All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
            """)

    # Footer shown below the tabs
    gr.Markdown("""
    ---
    ### System Information

    - **Model**: HuBERT Large for Speech Emotion Recognition
    - **Version**: 1.2.0
    - **Libraries**: PyTorch, Transformers, Librosa, Plotly

    This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
    """)

# Try loading the model when the module is imported, so a failure is reported
# before the interface is launched
print("Checking model availability...")
load_success = load_emotion_model()
if not load_success:
    print(
        "Warning: Emotion model failed to load. "
        "Application may have limited functionality."
    )

# Start the Gradio server only when run as a script
if __name__ == "__main__":
    demo.launch()