# VoiceAnalysis / app.py
import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")
def install_package(package, version=None):
package_spec = f"{package}=={version}" if version else package
print(f"Installing {package_spec}...")
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
except subprocess.CalledProcessError as e:
print(f"Failed to install {package_spec}: {e}")
raise
# Required packages (add version pins if needed)
required_packages = {
"gradio": None,
"torch": None,
"torchaudio": None,
"transformers": None,
"librosa": None,
"scipy": None,
"matplotlib": None,
"pydub": None,
"plotly": None
}
installed_packages = {pkg.key for pkg in pkg_resources.working_set}
for package, version in required_packages.items():
if package not in installed_packages:
install_package(package, version)
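# Note: pkg_resources is deprecated in recent setuptools releases. On newer
# environments the same presence check can be done with the standard library,
# for example (sketch only, not wired in here):
#     from importlib.metadata import distributions
#     installed = {dist.metadata["Name"].lower() for dist in distributions()}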
# Now import necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg') # non-interactive backend for any fallback
from pydub import AudioSegment
import scipy.io.wavfile  # import the submodule explicitly; a bare "import scipy" may not expose scipy.io.wavfile
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import plotly.graph_objects as go
# Define emotion labels, tone mapping, and descriptions
EMOTION_DESCRIPTIONS = {
"angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
"disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
"fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
"happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
"neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
"sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
"surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
}
# Group individual emotions into broader tone categories (used by the emotion summary).
TONE_MAPPING = {
"positive": ["happy", "surprise"],
"neutral": ["neutral"],
"negative": ["angry", "sad", "fear", "disgust"]
}
# Global variable for the emotion classifier
audio_emotion_classifier = None
def load_emotion_model():
"""Load and cache the speech emotion classification model."""
global audio_emotion_classifier
if audio_emotion_classifier is None:
try:
print("Loading emotion classification model...")
model_name = "superb/hubert-large-superb-er"
audio_emotion_classifier = pipeline("audio-classification", model=model_name)
print("Emotion classification model loaded successfully")
return True
except Exception as e:
print(f"Error loading emotion model: {e}")
return False
return True
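# Assumption worth verifying: the SUPERB ER checkpoint above is commonly reported to emit
# four abbreviated labels ("neu", "hap", "ang", "sad") rather than the seven keys in
# EMOTION_DESCRIPTIONS. Check audio_emotion_classifier.model.config.id2label after loading;
# if the labels are abbreviated, a small alias map like this hypothetical one can be applied
# to each prediction's "label" before the plotting/summary code uses it:
# LABEL_ALIASES = {"neu": "neutral", "hap": "happy", "ang": "angry", "sad": "sad"}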
def convert_audio_to_wav(audio_file):
"""Convert uploaded audio to WAV format."""
try:
audio = AudioSegment.from_file(audio_file)
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
wav_path = temp_wav.name
audio.export(wav_path, format="wav")
return wav_path
except Exception as e:
print(f"Error converting audio: {e}")
return None
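# Note: pydub decodes non-WAV formats (MP3, M4A, ...) through an ffmpeg/avlib binary on the
# PATH; plain WAV input is read without it. The temporary WAV file created above is not
# deleted automatically (delete=False), so long-running deployments may want to remove it
# once analysis is finished.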
def analyze_voice_tone(audio_file):
"""
    Analyze the tone characteristics of the voice from acoustic measurements:
    pitch variation, energy/volume dynamics, and zero-crossing rate (a rough speech-rate proxy).
"""
try:
audio_data, sample_rate = librosa.load(audio_file, sr=16000)
# 1. Basic audio features
audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
        if audio_duration < 1.0:  # Too short for reliable analysis
            return "Audio too short for reliable tone analysis. Please provide at least a second of speech (3+ seconds recommended)."
# 2. Pitch analysis with more robust handling
f0, voiced_flag, voiced_prob = librosa.pyin(
audio_data,
fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
sr=sample_rate
)
# Filter out NaN values and get valid pitch points
valid_f0 = f0[~np.isnan(f0)]
# If no pitch detected, may be noise or silence
if len(valid_f0) < 10:
return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."
# 3. Calculate improved statistics
mean_pitch = np.mean(valid_f0)
median_pitch = np.median(valid_f0)
std_pitch = np.std(valid_f0)
pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)
# 4. Energy/volume dynamics
rms_energy = librosa.feature.rms(y=audio_data)[0]
mean_energy = np.mean(rms_energy)
std_energy = np.std(rms_energy)
energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)
# 5. Speaking rate approximation (zero-crossing rate can help estimate this)
zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
mean_zcr = np.mean(zcr)
# 6. Calculate pitch variability relative to the mean (coefficient of variation)
# This gives a better measure than raw std dev
pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0
# 7. Tone classification logic using multiple features
# Define tone characteristics based on combinations of features
tone_class = ""
tone_details = []
# Pitch-based characteristics
if pitch_cv < 5:
tone_class = "Monotone"
tone_details.append("Very little pitch variation - sounds flat and unexpressive")
elif pitch_cv < 12:
tone_class = "Steady"
tone_details.append("Moderate pitch variation - sounds controlled and measured")
elif pitch_cv < 20:
tone_class = "Expressive"
tone_details.append("Good pitch variation - sounds naturally engaging")
else:
tone_class = "Highly Dynamic"
tone_details.append("Strong pitch variation - sounds animated and emphatic")
# Pitch range classification
if mean_pitch > 180:
tone_details.append("Higher pitched voice - may convey excitement or tension")
elif mean_pitch < 120:
tone_details.append("Lower pitched voice - may convey calmness or authority")
else:
tone_details.append("Mid-range pitch - typically perceived as balanced")
# Energy/volume characteristics
energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
if energy_cv < 10:
tone_details.append("Consistent volume - sounds controlled and measured")
elif energy_cv > 30:
tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")
# Speech rate approximation
if mean_zcr > 0.1:
tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
elif mean_zcr < 0.05:
tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")
# Generate tone summary and interpretation
tone_analysis = f"### Voice Tone Analysis\n\n"
tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
tone_analysis += "**Tone characteristics:**\n"
for detail in tone_details:
tone_analysis += f"- {detail}\n"
tone_analysis += "\n**Interpretation:**\n"
# Generate interpretation based on the classified tone
if tone_class == "Monotone":
tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
"Consider adding more vocal variety to sound more engaging and authentic.")
elif tone_class == "Steady":
tone_analysis += ("Your steady tone suggests reliability and control. "
"This can be effective in professional settings or when conveying serious information.")
elif tone_class == "Expressive":
tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
"This naturally engaging quality helps convey authenticity and conviction.")
else: # Highly Dynamic
tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
"This can be powerful for storytelling and persuasion, though in some contexts "
"a more measured approach might be appropriate.")
return tone_analysis
except Exception as e:
print(f"Error in tone analysis: {e}")
return "Tone analysis unavailable due to an error processing the audio."
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
"""
Analyze speech emotions in short chunks,
building a timeline of confidence for each emotion.
    Returns a Plotly figure, summary text, and a list of detailed results.
"""
if not load_emotion_model():
return None, "Failed to load emotion classifier.", None
# Use existing WAV if possible, else convert
if audio_file.endswith(".wav"):
audio_path = audio_file
else:
audio_path = convert_audio_to_wav(audio_file)
if not audio_path:
return None, "Could not process audio file", None
try:
# Load with librosa
audio_data, sample_rate = librosa.load(audio_path, sr=16000)
duration = len(audio_data) / sample_rate
# Use shorter chunks for more granular analysis
chunk_samples = int(chunk_duration * sample_rate)
num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
all_emotions = []
time_points = []
# For each chunk, run emotion classification
for i in range(num_chunks):
progress((i + 1) / num_chunks, "Analyzing audio emotions...")
start_idx = i * chunk_samples
end_idx = min(start_idx + chunk_samples, len(audio_data))
chunk = audio_data[start_idx:end_idx]
# Skip very short chunks
if len(chunk) < 0.5 * sample_rate:
continue
# Write chunk to temp WAV
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
chunk_path = temp_chunk.name
scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
# Classify - extract top-n predictions for each chunk
raw_results = audio_emotion_classifier(chunk_path, top_k=7) # Get all 7 emotions
os.unlink(chunk_path)
all_emotions.append(raw_results)
time_points.append((start_idx / sample_rate, end_idx / sample_rate))
# Skip if no valid emotions detected
if not all_emotions:
return None, "No speech detected in the audio.", None
# Build Plotly chart with improved styling
fig = build_plotly_line_chart(all_emotions, time_points, duration)
# Build summary and detailed results
summary_text = generate_emotion_summary(all_emotions)
detailed_results = build_detailed_results(all_emotions, time_points)
return fig, summary_text, detailed_results
except Exception as e:
import traceback
traceback.print_exc()
return None, f"Error analyzing audio: {str(e)}", None
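# Illustrative data shapes from the function above (values hypothetical): for a 5 s clip with
# chunk_duration=2, time_points is roughly [(0.0, 2.0), (2.0, 4.0), (4.0, 5.0)], and each entry
# of all_emotions is the pipeline's list of {"label": ..., "score": ...} dicts for that chunk.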
def smooth_data(data, window_size=3):
"""Apply a moving average smoothing to the data"""
smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
# Add back points that were lost in the convolution
padding = len(data) - len(smoothed)
if padding > 0:
# Add padding at the beginning
padding_front = padding // 2
padding_back = padding - padding_front
# Use the first/last values for padding
front_padding = [smoothed[0]] * padding_front
back_padding = [smoothed[-1]] * padding_back
smoothed = np.concatenate([front_padding, smoothed, back_padding])
return smoothed
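# Example: smooth_data([0.0, 1.0, 0.0, 1.0, 0.0]) applies a 3-point moving average and returns
# approximately [0.33, 0.33, 0.67, 0.33, 0.33] - the same length as the input, with the first
# and last smoothed values repeated to replace the points lost to the convolution.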
def build_plotly_line_chart(all_emotions, time_points, duration):
"""
Create an improved Plotly line chart with toggles for each emotion.
Shows all emotions for each time point rather than just the top one.
"""
emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
# Custom color scheme for emotions
colors = {
"angry": "#E53935", # Red
"disgust": "#8E24AA", # Purple
"fear": "#7B1FA2", # Deep Purple
"happy": "#FFC107", # Amber/Yellow
"neutral": "#78909C", # Blue Grey
"sad": "#1E88E5", # Blue
"surprise": "#43A047" # Green
}
# Prepare data structure for all emotions
emotion_data = {label: [] for label in emotion_labels}
timeline_times = [(start + end) / 2 for start, end in time_points]
# Process emotion scores - ensure all emotions have values
for chunk_emotions in all_emotions:
# Create a mapping of label to score for this chunk
scores = {item["label"]: item["score"] for item in chunk_emotions}
# Ensure all emotion labels have a value (default to 0.0)
for label in emotion_labels:
emotion_data[label].append(scores.get(label, 0.0))
# Smooth the data
for label in emotion_labels:
if len(emotion_data[label]) > 2:
emotion_data[label] = smooth_data(emotion_data[label])
# Build the chart
fig = go.Figure()
# Add traces for each emotion
for label in emotion_labels:
fig.add_trace(
go.Scatter(
x=timeline_times,
y=emotion_data[label],
mode='lines',
name=label.capitalize(),
line=dict(
color=colors.get(label, None),
width=3,
shape='spline', # Curved lines
smoothing=1.3
),
hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
)
)
# Add markers for dominant emotion at each point
dominant_markers_x = []
dominant_markers_y = []
dominant_markers_text = []
dominant_markers_color = []
for i, time in enumerate(timeline_times):
scores = {label: emotion_data[label][i] for label in emotion_labels}
dominant = max(scores.items(), key=lambda x: x[1])
dominant_markers_x.append(time)
dominant_markers_y.append(dominant[1])
dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
dominant_markers_color.append(colors.get(dominant[0], "#000000"))
fig.add_trace(
go.Scatter(
x=dominant_markers_x,
y=dominant_markers_y,
mode='markers',
marker=dict(
size=10,
color=dominant_markers_color,
line=dict(width=2, color='white')
),
name="Dominant Emotion",
text=dominant_markers_text,
hoverinfo="text",
hovertemplate='%{text}<extra></extra>'
)
)
# Add area chart for better visualization
for label in emotion_labels:
fig.add_trace(
go.Scatter(
x=timeline_times,
y=emotion_data[label],
mode='none',
name=f"{label.capitalize()} Area",
fill='tozeroy',
fillcolor=f"rgba{tuple(list(int(colors.get(label, '#000000').lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + [0.1])}",
showlegend=False,
hoverinfo='skip'
)
)
# Improve layout
fig.update_layout(
title={
'text': "Voice Emotion Analysis Over Time",
'font': {'size': 22, 'family': 'Arial, sans-serif'}
},
xaxis_title="Time (seconds)",
yaxis_title="Confidence Score",
yaxis=dict(
range=[0, 1.0],
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
xaxis=dict(
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
plot_bgcolor='white',
legend=dict(
bordercolor='rgba(0,0,0,0.1)',
borderwidth=1,
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1
),
hovermode='closest',
height=500, # Larger size for better viewing
margin=dict(l=10, r=10, t=80, b=50)
)
return fig
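# The inline fillcolor expression above converts a hex color plus a fixed alpha of 0.1 into an
# "rgba(r, g, b, 0.1)" string. An equivalent, more readable helper would look like this (sketch):
#     def hex_to_rgba(hex_color, alpha=0.1):
#         r, g, b = (int(hex_color.lstrip('#')[i:i + 2], 16) for i in (0, 2, 4))
#         return f"rgba({r}, {g}, {b}, {alpha})"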
def generate_alternative_chart(all_emotions, time_points):
"""
Create a stacked area chart to better visualize emotion changes over time
"""
emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
# Custom color scheme for emotions - more visible/distinct
colors = {
"angry": "#F44336", # Red
"disgust": "#9C27B0", # Purple
"fear": "#673AB7", # Deep Purple
"happy": "#FFC107", # Amber
"neutral": "#607D8B", # Blue Grey
"sad": "#2196F3", # Blue
"surprise": "#4CAF50" # Green
}
# Prepare timeline points
timeline_times = [(start + end) / 2 for start, end in time_points]
# Prepare data structure for all emotions
emotion_data = {label: [] for label in emotion_labels}
# Process emotion scores - ensure all emotions have values
for chunk_emotions in all_emotions:
# Create a mapping of label to score for this chunk
scores = {item["label"]: item["score"] for item in chunk_emotions}
# Ensure all emotion labels have a value (default to 0.0)
for label in emotion_labels:
emotion_data[label].append(scores.get(label, 0.0))
# Create the stacked area chart
fig = go.Figure()
# Add each emotion as a separate trace
for label in emotion_labels:
fig.add_trace(
go.Scatter(
x=timeline_times,
y=emotion_data[label],
mode='lines',
name=label.capitalize(),
line=dict(width=0.5, color=colors.get(label, None)),
stackgroup='one', # This makes it a stacked area chart
fillcolor=colors.get(label, None),
hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
)
)
# Improve layout
fig.update_layout(
title={
'text': "Voice Emotion Distribution Over Time",
'font': {'size': 22, 'family': 'Arial, sans-serif'}
},
xaxis_title="Time (seconds)",
yaxis_title="Emotion Intensity",
yaxis=dict(
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
xaxis=dict(
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
plot_bgcolor='white',
legend=dict(
bordercolor='rgba(0,0,0,0.1)',
borderwidth=1,
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1
),
hovermode='closest',
height=500,
margin=dict(l=10, r=10, t=80, b=50)
)
return fig
def generate_emotion_summary(all_emotions):
"""
    Produce a textual summary of the overall emotion distribution and dominant tone.
"""
if not all_emotions:
return "No emotional content detected."
emotion_counts = {}
emotion_confidence = {}
total_chunks = len(all_emotions)
for chunk_emotions in all_emotions:
top_emotion = max(chunk_emotions, key=lambda x: x['score'])
label = top_emotion["label"]
confidence = top_emotion["score"]
emotion_counts[label] = emotion_counts.get(label, 0) + 1
emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence
# Calculate average confidence for each emotion
for emotion in emotion_confidence:
if emotion_counts[emotion] > 0:
emotion_confidence[emotion] /= emotion_counts[emotion]
# Dominant emotion (highest percentage)
dominant_emotion = max(emotion_counts, key=emotion_counts.get)
dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100
# Most confident emotion (might differ from dominant)
most_confident = max(emotion_confidence, key=emotion_confidence.get)
# Tone grouping analysis
tone_group_counts = {group: 0 for group in TONE_MAPPING}
for emotion, count in emotion_counts.items():
for tone_group, emotions in TONE_MAPPING.items():
if emotion in emotions:
tone_group_counts[tone_group] += count
dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100
# Build summary with markdown formatting
summary = f"### Voice Emotion Analysis Summary\n\n"
summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"
if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
summary += f"**Most confident detection:** {most_confident.capitalize()} "
summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"
summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"
# Show emotion distribution as sorted list
summary += "**Emotion distribution:**\n"
for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
percentage = (count / total_chunks) * 100
avg_conf = emotion_confidence[emotion]
summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"
# Add interpretation based on dominant emotion
summary += f"\n**Interpretation:**\n"
if dominant_emotion == "happy":
summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
elif dominant_emotion == "neutral":
summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
elif dominant_emotion == "sad":
summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
elif dominant_emotion == "angry":
summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
elif dominant_emotion == "fear":
summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
elif dominant_emotion == "disgust":
summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
elif dominant_emotion == "surprise":
summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."
return summary
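# Worked example of the summary arithmetic: if 6 of 10 chunks have "neutral" as their top
# emotion, dominant_pct = 6/10*100 = 60.0, and the tone grouping counts those 6 chunks toward
# the "neutral" tone group via TONE_MAPPING.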
def build_detailed_results(all_emotions, time_points):
"""
Return a list of dictionaries containing chunk start-end, top emotion, confidence, description.
Suitable for Gradio DataFrame display.
"""
results_list = []
for (emotions, (start_time, end_time)) in zip(all_emotions, time_points):
top_emotion = max(emotions, key=lambda x: x['score'])
label = top_emotion["label"]
# Find second highest emotion if available
if len(emotions) > 1:
sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
second_emotion = sorted_emotions[1]["label"].capitalize()
second_score = sorted_emotions[1]["score"]
secondary = f" ({second_emotion}: {second_score:.2f})"
else:
secondary = ""
results_list.append({
"Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
"Primary Emotion": label.capitalize(),
"Confidence": f"{top_emotion['score']:.2f}{secondary}",
"Description": EMOTION_DESCRIPTIONS.get(label, "")
})
return results_list
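# Illustrative shape of one detailed-results row (values hypothetical):
# {"Time Range": "0.0s - 2.0s", "Primary Emotion": "Neutral",
#  "Confidence": "0.81 (Happy: 0.12)", "Description": EMOTION_DESCRIPTIONS["neutral"]}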
def process_audio(audio_file, progress=gr.Progress()):
"""
    Main handler for Gradio:
    1) Emotion analysis (line chart, summary text, and detailed results table).
    2) Stacked-area chart rebuilt from the detailed results.
    3) Tone analysis (descriptive text).
"""
if not audio_file:
return None, None, "No audio file provided.", None, "No tone analysis."
# 1) Analyze emotions
fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
if not fig: # Error or missing
return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."
# 2) Generate alternative chart
# Extract the necessary data from detailed_results to create time_points
time_points = []
for result in detailed_results:
time_range = result["Time Range"]
start_time = float(time_range.split("s")[0])
end_time = float(time_range.split(" - ")[1].split("s")[0])
time_points.append((start_time, end_time))
# Extract emotion data from detailed_results
all_emotions = []
for result in detailed_results:
# Parse the primary emotion and confidence
primary_emotion = result["Primary Emotion"].lower()
confidence_str = result["Confidence"].split("(")[0].strip()
primary_confidence = float(confidence_str)
# Create a list of emotion dictionaries for this time point
emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]
# Check if there's a secondary emotion
if "(" in result["Confidence"]:
secondary_part = result["Confidence"].split("(")[1].split(")")[0]
secondary_emotion = secondary_part.split(":")[0].strip().lower()
secondary_confidence = float(secondary_part.split(":")[1].strip())
emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})
# Add remaining emotions with zero confidence
for emotion in EMOTION_DESCRIPTIONS.keys():
if emotion not in [e["label"] for e in emotions_at_time]:
emotions_at_time.append({"label": emotion, "score": 0.0})
all_emotions.append(emotions_at_time)
# Now we can generate the alternative chart
alt_fig = generate_alternative_chart(all_emotions, time_points)
# 3) Analyze tone
tone_analysis = analyze_voice_tone(audio_file)
return fig, alt_fig, summary_text, detailed_results, tone_analysis
# Create Gradio interface with improved UI/UX
with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# 🎙️ Voice Emotion & Tone Analysis System
This app provides professional analysis of:
- **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
- **Tone characteristics** (based on pitch, energy, and speech patterns)
The interactive timeline shows emotion confidence scores throughout your audio.
""")
with gr.Tabs():
# Tab 1: Upload
with gr.TabItem("Upload Audio"):
with gr.Row():
with gr.Column(scale=1):
audio_input = gr.Audio(
label="Upload Audio File",
type="filepath",
sources=["upload"],
elem_id="audio_upload"
)
process_btn = gr.Button("Analyze Voice", variant="primary")
gr.Markdown("""
**Supports:** MP3, WAV, M4A, and most audio formats
**For best results:** Use a clear voice recording with minimal background noise
""")
with gr.Column(scale=2):
with gr.Tabs():
with gr.TabItem("Line Chart"):
emotion_timeline = gr.Plot(label="Emotion Timeline",
elem_id="emotion_plot",
container=True)
with gr.TabItem("Area Chart"):
emotion_area_chart = gr.Plot(label="Emotion Distribution",
elem_id="emotion_area_plot",
container=True)
with gr.Row():
with gr.Column():
emotion_summary = gr.Markdown(label="Emotion Summary")
with gr.Column():
tone_analysis_output = gr.Markdown(label="Tone Analysis")
with gr.Row():
emotion_results = gr.DataFrame(
headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
label="Detailed Emotion Analysis"
)
process_btn.click(
fn=process_audio,
inputs=[audio_input],
outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
)
# Tab 2: Record
with gr.TabItem("Record Voice"):
with gr.Row():
with gr.Column(scale=1):
record_input = gr.Audio(
label="Record Your Voice",
sources=["microphone"],
type="filepath",
elem_id="record_audio"
)
analyze_btn = gr.Button("Analyze Recording", variant="primary")
gr.Markdown("""
**Tips:**
- Speak clearly and at a normal pace
- Record at least 10-15 seconds for more accurate analysis
- Try different emotional tones to see how they're detected
""")
with gr.Column(scale=2):
with gr.Tabs():
with gr.TabItem("Line Chart"):
rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
elem_id="record_emotion_plot",
container=True)
with gr.TabItem("Area Chart"):
rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
elem_id="record_emotion_area_plot",
container=True)
with gr.Row():
with gr.Column():
rec_emotion_summary = gr.Markdown(label="Emotion Summary")
with gr.Column():
rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
with gr.Row():
rec_emotion_results = gr.DataFrame(
headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
label="Detailed Emotion Analysis"
)
analyze_btn.click(
fn=process_audio,
inputs=[record_input],
outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
)
# Tab 3: About & Help
with gr.TabItem("About & Help"):
gr.Markdown("""
## About This System
This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.
### How It Works
1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.
### Emotion Categories
The system detects seven standard emotions:
- **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
- **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
- **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
- **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
- **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
- **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
- **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.
### Tips for Best Results
- Use clear audio with minimal background noise
- Speak naturally at a comfortable volume
- Record at least 10-15 seconds of speech
- For tone analysis, longer recordings (30+ seconds) provide more accurate results
### Privacy Notice
All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
""")
gr.Markdown("""
---
### System Information
- **Model**: HuBERT Large for Speech Emotion Recognition
- **Version**: 1.2.0
- **Libraries**: PyTorch, Transformers, Librosa, Plotly
This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
""")
# Check if model can load before launching interface
print("Checking model availability...")
load_success = load_emotion_model()
if not load_success:
print("Warning: Emotion model failed to load. Application may have limited functionality.")
# Launch the demo
if __name__ == "__main__":
demo.launch()