import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

def install_package(package, version=None):
    package_spec = f"{package}=={version}" if version else package
    print(f"Installing {package_spec}...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package_spec}: {e}")
        raise

# Required packages (add version pins if needed)
required_packages = {
    "gradio": None,
    "torch": None,
    "torchaudio": None,
    "transformers": None,
    "librosa": None,
    "scipy": None,
    "matplotlib": None,
    "pydub": None,
    "plotly": None,
}

installed_packages = {pkg.key for pkg in pkg_resources.working_set}
for package, version in required_packages.items():
    if package not in installed_packages:
        install_package(package, version)

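# Note (editor's aside, not part of the original flow): pkg_resources is deprecated in
# recent setuptools releases. Roughly the same check can be done with the standard
# library, e.g.:
#   from importlib.metadata import distributions
#   installed_packages = {dist.metadata["Name"].lower() for dist in distributions()}
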
# Now import necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for any fallback
from pydub import AudioSegment
import scipy.io.wavfile  # imported explicitly so scipy.io.wavfile.write is available below
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import plotly.graph_objects as go

# Define emotion labels, tone mapping, and descriptions
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
}

# Emotions grouped by overall tone
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"]
}

# Global variable for the emotion classifier
audio_emotion_classifier = None


def load_emotion_model():
    """Load and cache the speech emotion classification model."""
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading emotion model: {e}")
            return False
    return True

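# Note (editor's assumption, not in the original flow): transformers pipelines run on CPU
# by default. If a GPU is available, passing a device index would speed up per-chunk
# inference, e.g.:
#   pipeline("audio-classification", model=model_name,
#            device=0 if torch.cuda.is_available() else -1)
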
def convert_audio_to_wav(audio_file):
    """Convert uploaded audio to WAV format."""
    try:
        audio = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            wav_path = temp_wav.name
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None

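# Optional sanity check (editor's addition): pydub delegates decoding of compressed
# formats such as MP3/M4A to an ffmpeg binary found on PATH, so warn early if it is
# missing rather than failing inside convert_audio_to_wav().
import shutil
if shutil.which("ffmpeg") is None:
    print("Warning: ffmpeg not found on PATH; conversion of non-WAV uploads may fail.")
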
def analyze_voice_tone(audio_file):
    """
    Analyze the tone characteristics of the voice using robust measurements:
    pitch variation, energy dynamics, and spectral features.
    """
    try:
        audio_data, sample_rate = librosa.load(audio_file, sr=16000)

        # 1. Basic audio features
        audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
        if audio_duration < 1.0:  # Too short for reliable analysis
            return "Audio too short for reliable tone analysis. Please provide at least a few seconds of speech."

        # 2. Pitch analysis with more robust handling
        f0, voiced_flag, voiced_prob = librosa.pyin(
            audio_data,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sample_rate
        )

        # Filter out NaN values and keep valid pitch points
        valid_f0 = f0[~np.isnan(f0)]

        # If little or no pitch is detected, the audio may be noise or silence
        if len(valid_f0) < 10:
            return ("**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. "
                    "The audio may contain primarily noise, silence, or non-speech sounds.")

        # 3. Pitch statistics
        mean_pitch = np.mean(valid_f0)
        median_pitch = np.median(valid_f0)
        std_pitch = np.std(valid_f0)
        pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)

        # 4. Energy/volume dynamics
        rms_energy = librosa.feature.rms(y=audio_data)[0]
        mean_energy = np.mean(rms_energy)
        std_energy = np.std(rms_energy)
        energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)

        # 5. Speaking-rate approximation (zero-crossing rate is a rough proxy)
        zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
        mean_zcr = np.mean(zcr)

        # 6. Pitch variability relative to the mean (coefficient of variation),
        #    a better measure than the raw standard deviation
        pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0

        # 7. Tone classification using combinations of features
        tone_class = ""
        tone_details = []

        # Pitch-variation characteristics
        if pitch_cv < 5:
            tone_class = "Monotone"
            tone_details.append("Very little pitch variation - sounds flat and unexpressive")
        elif pitch_cv < 12:
            tone_class = "Steady"
            tone_details.append("Moderate pitch variation - sounds controlled and measured")
        elif pitch_cv < 20:
            tone_class = "Expressive"
            tone_details.append("Good pitch variation - sounds naturally engaging")
        else:
            tone_class = "Highly Dynamic"
            tone_details.append("Strong pitch variation - sounds animated and emphatic")

        # Mean-pitch classification
        if mean_pitch > 180:
            tone_details.append("Higher pitched voice - may convey excitement or tension")
        elif mean_pitch < 120:
            tone_details.append("Lower pitched voice - may convey calmness or authority")
        else:
            tone_details.append("Mid-range pitch - typically perceived as balanced")

        # Energy/volume characteristics
        energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
        if energy_cv < 10:
            tone_details.append("Consistent volume - sounds controlled and measured")
        elif energy_cv > 30:
            tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")

        # Speech-rate approximation
        if mean_zcr > 0.1:
            tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
        elif mean_zcr < 0.05:
            tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")

        # Generate tone summary and interpretation
        tone_analysis = "### Voice Tone Analysis\n\n"
        tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
        tone_analysis += "**Tone characteristics:**\n"
        for detail in tone_details:
            tone_analysis += f"- {detail}\n"
        tone_analysis += "\n**Interpretation:**\n"

        # Interpretation based on the classified tone
        if tone_class == "Monotone":
            tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
                              "Consider adding more vocal variety to sound more engaging and authentic.")
        elif tone_class == "Steady":
            tone_analysis += ("Your steady tone suggests reliability and control. "
                              "This can be effective in professional settings or when conveying serious information.")
        elif tone_class == "Expressive":
            tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
                              "This naturally engaging quality helps convey authenticity and conviction.")
        else:  # Highly Dynamic
            tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
                              "This can be powerful for storytelling and persuasion, though in some contexts "
                              "a more measured approach might be appropriate.")

        return tone_analysis
    except Exception as e:
        print(f"Error in tone analysis: {e}")
        return "Tone analysis unavailable due to an error processing the audio."

def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
    """
    Analyze speech emotions in short chunks, building a timeline of confidence
    scores for each emotion.
    Returns a Plotly figure, a summary text, and detailed per-chunk results.
    """
    if not load_emotion_model():
        return None, "Failed to load emotion classifier.", None

    # Use the existing WAV if possible, otherwise convert
    if audio_file.endswith(".wav"):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
        if not audio_path:
            return None, "Could not process audio file", None

    try:
        # Load with librosa
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        # Use short chunks for more granular analysis
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions = []
        time_points = []

        # Run emotion classification on each chunk
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Skip very short chunks
            if len(chunk) < 0.5 * sample_rate:
                continue

            # Write the chunk to a temporary WAV file
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                chunk_path = temp_chunk.name
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))

            # Classify - request scores for all labels the model provides
            raw_results = audio_emotion_classifier(chunk_path, top_k=7)
            os.unlink(chunk_path)

            all_emotions.append(raw_results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # Bail out if no valid chunks were classified
        if not all_emotions:
            return None, "No speech detected in the audio.", None

        # Build the Plotly chart
        fig = build_plotly_line_chart(all_emotions, time_points, duration)

        # Build the summary and detailed results
        summary_text = generate_emotion_summary(all_emotions)
        detailed_results = build_detailed_results(all_emotions, time_points)

        return fig, summary_text, detailed_results
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error analyzing audio: {str(e)}", None

def smooth_data(data, window_size=3):
    """Apply a moving-average smoothing to the data."""
    smoothed = np.convolve(data, np.ones(window_size) / window_size, mode='valid')
    # Add back the points that were lost in the 'valid' convolution
    padding = len(data) - len(smoothed)
    if padding > 0:
        # Split the padding between the beginning and the end
        padding_front = padding // 2
        padding_back = padding - padding_front
        # Repeat the first/last smoothed values as padding
        front_padding = [smoothed[0]] * padding_front
        back_padding = [smoothed[-1]] * padding_back
        smoothed = np.concatenate([front_padding, smoothed, back_padding])
    return smoothed

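# Illustrative usage (editor's example, not called by the app):
#   smooth_data([0.0, 1.0, 0.0, 1.0, 0.0], window_size=3)
# returns a same-length array of 3-point moving averages, with the edge values
# repeated to replace the points lost by the 'valid' convolution.
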
def build_plotly_line_chart(all_emotions, time_points, duration):
    """
    Create a Plotly line chart with toggles for each emotion.
    Shows all emotions for each time point rather than just the top one.
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Custom color scheme for emotions
    colors = {
        "angry": "#E53935",    # Red
        "disgust": "#8E24AA",  # Purple
        "fear": "#7B1FA2",     # Deep Purple
        "happy": "#FFC107",    # Amber/Yellow
        "neutral": "#78909C",  # Blue Grey
        "sad": "#1E88E5",      # Blue
        "surprise": "#43A047"  # Green
    }

    # Prepare a data series for every emotion
    emotion_data = {label: [] for label in emotion_labels}
    timeline_times = [(start + end) / 2 for start, end in time_points]

    # Process emotion scores - ensure every emotion has a value for every chunk
    for chunk_emotions in all_emotions:
        # Map label -> score for this chunk
        scores = {item["label"]: item["score"] for item in chunk_emotions}
        # Default to 0.0 for any emotion the classifier did not report
        for label in emotion_labels:
            emotion_data[label].append(scores.get(label, 0.0))

    # Smooth the data
    for label in emotion_labels:
        if len(emotion_data[label]) > 2:
            emotion_data[label] = smooth_data(emotion_data[label])

    # Build the chart
    fig = go.Figure()

    # Add a line trace for each emotion
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='lines',
                name=label.capitalize(),
                line=dict(
                    color=colors.get(label, None),
                    width=3,
                    shape='spline',  # Curved lines
                    smoothing=1.3
                ),
                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
            )
        )

    # Add markers for the dominant emotion at each point
    dominant_markers_x = []
    dominant_markers_y = []
    dominant_markers_text = []
    dominant_markers_color = []

    for i, t in enumerate(timeline_times):
        scores = {label: emotion_data[label][i] for label in emotion_labels}
        dominant = max(scores.items(), key=lambda x: x[1])
        dominant_markers_x.append(t)
        dominant_markers_y.append(dominant[1])
        dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
        dominant_markers_color.append(colors.get(dominant[0], "#000000"))

    fig.add_trace(
        go.Scatter(
            x=dominant_markers_x,
            y=dominant_markers_y,
            mode='markers',
            marker=dict(
                size=10,
                color=dominant_markers_color,
                line=dict(width=2, color='white')
            ),
            name="Dominant Emotion",
            text=dominant_markers_text,
            hovertemplate='%{text}<extra></extra>'
        )
    )

    # Add a faint filled area under each line for better visualization
    for label in emotion_labels:
        hex_color = colors.get(label, '#000000').lstrip('#')
        r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='none',
                name=f"{label.capitalize()} Area",
                fill='tozeroy',
                fillcolor=f"rgba({r}, {g}, {b}, 0.1)",
                showlegend=False,
                hoverinfo='skip'
            )
        )

    # Improve layout
    fig.update_layout(
        title={
            'text': "Voice Emotion Analysis Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Confidence Score",
        yaxis=dict(
            range=[0, 1.0],
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        xaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        plot_bgcolor='white',
        legend=dict(
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode='closest',
        height=500,  # Larger size for better viewing
        margin=dict(l=10, r=10, t=80, b=50)
    )
    return fig

def generate_alternative_chart(all_emotions, time_points):
    """
    Create a stacked area chart to better visualize emotion changes over time.
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Custom color scheme for emotions - more distinct hues
    colors = {
        "angry": "#F44336",    # Red
        "disgust": "#9C27B0",  # Purple
        "fear": "#673AB7",     # Deep Purple
        "happy": "#FFC107",    # Amber
        "neutral": "#607D8B",  # Blue Grey
        "sad": "#2196F3",      # Blue
        "surprise": "#4CAF50"  # Green
    }

    # Prepare timeline points
    timeline_times = [(start + end) / 2 for start, end in time_points]

    # Prepare a data series for every emotion
    emotion_data = {label: [] for label in emotion_labels}

    # Process emotion scores - ensure every emotion has a value for every chunk
    for chunk_emotions in all_emotions:
        # Map label -> score for this chunk
        scores = {item["label"]: item["score"] for item in chunk_emotions}
        # Default to 0.0 for any emotion the classifier did not report
        for label in emotion_labels:
            emotion_data[label].append(scores.get(label, 0.0))

    # Create the stacked area chart
    fig = go.Figure()

    # Add each emotion as a separate trace
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='lines',
                name=label.capitalize(),
                line=dict(width=0.5, color=colors.get(label, None)),
                stackgroup='one',  # Makes this a stacked area chart
                fillcolor=colors.get(label, None),
                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
            )
        )

    # Improve layout
    fig.update_layout(
        title={
            'text': "Voice Emotion Distribution Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Emotion Intensity",
        yaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        xaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        plot_bgcolor='white',
        legend=dict(
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode='closest',
        height=500,
        margin=dict(l=10, r=10, t=80, b=50)
    )
    return fig

def generate_emotion_summary(all_emotions):
    """
    Produce a textual summary of the overall emotion distribution.
    """
    if not all_emotions:
        return "No emotional content detected."

    emotion_counts = {}
    emotion_confidence = {}
    total_chunks = len(all_emotions)

    for chunk_emotions in all_emotions:
        top_emotion = max(chunk_emotions, key=lambda x: x['score'])
        label = top_emotion["label"]
        confidence = top_emotion["score"]
        emotion_counts[label] = emotion_counts.get(label, 0) + 1
        emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence

    # Average confidence for each emotion
    for emotion in emotion_confidence:
        if emotion_counts[emotion] > 0:
            emotion_confidence[emotion] /= emotion_counts[emotion]

    # Dominant emotion (highest share of chunks)
    dominant_emotion = max(emotion_counts, key=emotion_counts.get)
    dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100

    # Most confident emotion (may differ from the dominant one)
    most_confident = max(emotion_confidence, key=emotion_confidence.get)

    # Tone grouping analysis
    tone_group_counts = {group: 0 for group in TONE_MAPPING}
    for emotion, count in emotion_counts.items():
        for tone_group, emotions in TONE_MAPPING.items():
            if emotion in emotions:
                tone_group_counts[tone_group] += count

    dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
    dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100

    # Build the summary with Markdown formatting
    summary = "### Voice Emotion Analysis Summary\n\n"
    summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"
    if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
        summary += f"**Most confident detection:** {most_confident.capitalize()} "
        summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"
    summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"

    # Show the emotion distribution as a sorted list
    summary += "**Emotion distribution:**\n"
    for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_chunks) * 100
        avg_conf = emotion_confidence[emotion]
        summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"

    # Add an interpretation based on the dominant emotion
    summary += "\n**Interpretation:**\n"
    if dominant_emotion == "happy":
        summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
    elif dominant_emotion == "neutral":
        summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
    elif dominant_emotion == "sad":
        summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
    elif dominant_emotion == "angry":
        summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
    elif dominant_emotion == "fear":
        summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
    elif dominant_emotion == "disgust":
        summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
    elif dominant_emotion == "surprise":
        summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."
    return summary

def build_detailed_results(all_emotions, time_points):
    """
    Return a list of dictionaries containing chunk start/end times, the top emotion,
    its confidence, and a description. Suitable for Gradio DataFrame display.
    """
    results_list = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        top_emotion = max(emotions, key=lambda x: x['score'])
        label = top_emotion["label"]

        # Include the second-highest emotion if available
        if len(emotions) > 1:
            sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
            second_emotion = sorted_emotions[1]["label"].capitalize()
            second_score = sorted_emotions[1]["score"]
            secondary = f" ({second_emotion}: {second_score:.2f})"
        else:
            secondary = ""

        results_list.append({
            "Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
            "Primary Emotion": label.capitalize(),
            "Confidence": f"{top_emotion['score']:.2f}{secondary}",
            "Description": EMOTION_DESCRIPTIONS.get(label, "")
        })
    return results_list

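# Shape of each row produced above (editor's note): these strings are what
# process_audio() later re-parses to rebuild time points and emotion scores, e.g.
#   {"Time Range": "0.0s - 2.0s", "Primary Emotion": "Happy",
#    "Confidence": "0.85 (Neutral: 0.10)", "Description": "..."}
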
def process_audio(audio_file, progress=gr.Progress()):
    """
    Main handler for Gradio:
    1) Emotion analysis (returns Plotly figures and detailed results).
    2) Tone analysis (returns descriptive text).
    """
    if not audio_file:
        return None, None, "No audio file provided.", None, "No tone analysis."

    # 1) Analyze emotions
    fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
    if fig is None:  # Error or missing data
        return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."

    # 2) Generate the alternative chart by reconstructing time points and
    #    emotion scores from the detailed results
    time_points = []
    for result in detailed_results:
        time_range = result["Time Range"]
        start_time = float(time_range.split("s")[0])
        end_time = float(time_range.split(" - ")[1].split("s")[0])
        time_points.append((start_time, end_time))

    all_emotions = []
    for result in detailed_results:
        # Parse the primary emotion and its confidence
        primary_emotion = result["Primary Emotion"].lower()
        confidence_str = result["Confidence"].split("(")[0].strip()
        primary_confidence = float(confidence_str)

        # List of emotion dictionaries for this time point
        emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]

        # Parse the secondary emotion if one was recorded
        if "(" in result["Confidence"]:
            secondary_part = result["Confidence"].split("(")[1].split(")")[0]
            secondary_emotion = secondary_part.split(":")[0].strip().lower()
            secondary_confidence = float(secondary_part.split(":")[1].strip())
            emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})

        # Add the remaining emotions with zero confidence
        for emotion in EMOTION_DESCRIPTIONS.keys():
            if emotion not in [e["label"] for e in emotions_at_time]:
                emotions_at_time.append({"label": emotion, "score": 0.0})

        all_emotions.append(emotions_at_time)

    alt_fig = generate_alternative_chart(all_emotions, time_points)

    # 3) Analyze tone
    tone_analysis = analyze_voice_tone(audio_file)

    return fig, alt_fig, summary_text, detailed_results, tone_analysis

# Create Gradio interface with improved UI/UX
with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion & Tone Analysis System

    This app provides professional analysis of:
    - **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
    - **Tone characteristics** (based on pitch, energy, and speech patterns)

    The interactive timeline shows emotion confidence scores throughout your audio.
    """)

    with gr.Tabs():
        # Tab 1: Upload
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"],
                        elem_id="audio_upload"
                    )
                    process_btn = gr.Button("Analyze Voice", variant="primary")
                    gr.Markdown("""
                    **Supports:** MP3, WAV, M4A, and most audio formats

                    **For best results:** Use a clear voice recording with minimal background noise
                    """)
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                       elem_id="emotion_plot",
                                                       container=True)
                        with gr.TabItem("Area Chart"):
                            emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                         elem_id="emotion_area_plot",
                                                         container=True)
            with gr.Row():
                with gr.Column():
                    emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    tone_analysis_output = gr.Markdown(label="Tone Analysis")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
            )

        # Tab 2: Record
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(
                        label="Record Your Voice",
                        sources=["microphone"],
                        type="filepath",
                        elem_id="record_audio"
                    )
                    analyze_btn = gr.Button("Analyze Recording", variant="primary")
                    gr.Markdown("""
                    **Tips:**
                    - Speak clearly and at a normal pace
                    - Record at least 10-15 seconds for more accurate analysis
                    - Try different emotional tones to see how they're detected
                    """)
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                           elem_id="record_emotion_plot",
                                                           container=True)
                        with gr.TabItem("Area Chart"):
                            rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                             elem_id="record_emotion_area_plot",
                                                             container=True)
            with gr.Row():
                with gr.Column():
                    rec_emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
            )

        # Tab 3: About & Help
        with gr.TabItem("About & Help"):
            gr.Markdown("""
            ## About This System

            This voice emotion & tone analysis system uses deep learning models to detect emotions and analyze vocal characteristics. It is built on the HuBERT (Hidden-Unit BERT) architecture fine-tuned for speech emotion recognition.

            ### How It Works
            1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
            2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
            3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.

            ### Emotion Categories
            The system detects seven standard emotions:
            - **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
            - **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
            - **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
            - **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
            - **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
            - **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
            - **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.

            ### Tips for Best Results
            - Use clear audio with minimal background noise
            - Speak naturally at a comfortable volume
            - Record at least 10-15 seconds of speech
            - For tone analysis, longer recordings (30+ seconds) provide more accurate results

            ### Privacy Notice
            Audio is processed only to generate the analysis shown here; no recordings or analysis results are stored or shared.
            """)

            gr.Markdown("""
            ---
            ### System Information
            - **Model**: HuBERT Large for Speech Emotion Recognition
            - **Version**: 1.2.0
            - **Libraries**: PyTorch, Transformers, Librosa, Plotly

            This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
            """)

# Check that the model can load before launching the interface
print("Checking model availability...")
load_success = load_emotion_model()
if not load_success:
    print("Warning: Emotion model failed to load. Application may have limited functionality.")

# Launch the demo
if __name__ == "__main__":
    demo.launch()