# VoiceAnalysis / app.py
import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")
def install_package(package, version=None):
package_spec = f"{package}=={version}" if version else package
print(f"Installing {package_spec}...")
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
except subprocess.CalledProcessError as e:
print(f"Failed to install {package_spec}: {e}")
raise
# Required packages (add version pins if needed)
required_packages = {
"gradio": None,
"torch": None,
"torchaudio": None,
"transformers": None,
"librosa": None,
"scipy": None,
"matplotlib": None,
"pydub": None,
"plotly": None
}
installed_packages = {pkg.key for pkg in pkg_resources.working_set}
for package, version in required_packages.items():
if package not in installed_packages:
install_package(package, version)
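# Note: pkg_resources is deprecated in recent setuptools releases. On newer
# environments the same presence check can be done with the standard library,
# for example (sketch only, not wired in here):
#     from importlib.metadata import distributions
#     installed = {dist.metadata["Name"].lower() for dist in distributions()}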
# Now import necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg') # non-interactive backend for any fallback
from pydub import AudioSegment
import scipy.io.wavfile  # import the submodule explicitly; a bare "import scipy" may not expose scipy.io.wavfile
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import plotly.graph_objects as go
# Define emotion labels, tone mapping, and descriptions
EMOTION_DESCRIPTIONS = {
"angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
"disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
"fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
"happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
"neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
"sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
"surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
}
# Group individual emotions into broader tone categories (used by the emotion summary).
TONE_MAPPING = {
"positive": ["happy", "surprise"],
"neutral": ["neutral"],
"negative": ["angry", "sad", "fear", "disgust"]
}
# Global variable for the emotion classifier
audio_emotion_classifier = None
def load_emotion_model():
"""Load and cache the speech emotion classification model."""
global audio_emotion_classifier
if audio_emotion_classifier is None:
try:
print("Loading emotion classification model...")
model_name = "superb/hubert-large-superb-er"
audio_emotion_classifier = pipeline("audio-classification", model=model_name)
print("Emotion classification model loaded successfully")
return True
except Exception as e:
print(f"Error loading emotion model: {e}")
return False
return True
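# Assumption worth verifying: the SUPERB ER checkpoint above is commonly reported to emit
# four abbreviated labels ("neu", "hap", "ang", "sad") rather than the seven keys in
# EMOTION_DESCRIPTIONS. Check audio_emotion_classifier.model.config.id2label after loading;
# if the labels are abbreviated, a small alias map like this hypothetical one can be applied
# to each prediction's "label" before the plotting/summary code uses it:
# LABEL_ALIASES = {"neu": "neutral", "hap": "happy", "ang": "angry", "sad": "sad"}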
def convert_audio_to_wav(audio_file):
"""Convert uploaded audio to WAV format."""
try:
audio = AudioSegment.from_file(audio_file)
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
wav_path = temp_wav.name
audio.export(wav_path, format="wav")
return wav_path
except Exception as e:
print(f"Error converting audio: {e}")
return None
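# Note: pydub decodes non-WAV formats (MP3, M4A, ...) through an ffmpeg/avlib binary on the
# PATH; plain WAV input is read without it. The temporary WAV file created above is not
# deleted automatically (delete=False), so long-running deployments may want to remove it
# once analysis is finished.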
def analyze_voice_tone(audio_file):
"""
    Analyze the tone characteristics of the voice from acoustic measurements:
    pitch variation, energy/volume dynamics, and zero-crossing rate (a rough speech-rate proxy).
"""
try:
audio_data, sample_rate = librosa.load(audio_file, sr=16000)
# 1. Basic audio features
audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
        if audio_duration < 1.0:  # Too short for reliable analysis
            return "Audio too short for reliable tone analysis. Please provide at least a second of speech (3+ seconds recommended)."
# 2. Pitch analysis with more robust handling
f0, voiced_flag, voiced_prob = librosa.pyin(
audio_data,
fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
sr=sample_rate
)
# Filter out NaN values and get valid pitch points
valid_f0 = f0[~np.isnan(f0)]
# If no pitch detected, may be noise or silence
if len(valid_f0) < 10:
return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."
# 3. Calculate improved statistics
mean_pitch = np.mean(valid_f0)
median_pitch = np.median(valid_f0)
std_pitch = np.std(valid_f0)
pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)
# 4. Energy/volume dynamics
rms_energy = librosa.feature.rms(y=audio_data)[0]
mean_energy = np.mean(rms_energy)
std_energy = np.std(rms_energy)
energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)
# 5. Speaking rate approximation (zero-crossing rate can help estimate this)
zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
mean_zcr = np.mean(zcr)
# 6. Calculate pitch variability relative to the mean (coefficient of variation)
# This gives a better measure than raw std dev
pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0
# 7. Tone classification logic using multiple features
# Define tone characteristics based on combinations of features
tone_class = ""
tone_details = []
# Pitch-based characteristics
if pitch_cv < 5:
tone_class = "Monotone"
tone_details.append("Very little pitch variation - sounds flat and unexpressive")
elif pitch_cv < 12:
tone_class = "Steady"
tone_details.append("Moderate pitch variation - sounds controlled and measured")
elif pitch_cv < 20:
tone_class = "Expressive"
tone_details.append("Good pitch variation - sounds naturally engaging")
else:
tone_class = "Highly Dynamic"
tone_details.append("Strong pitch variation - sounds animated and emphatic")
# Pitch range classification
if mean_pitch > 180:
tone_details.append("Higher pitched voice - may convey excitement or tension")
elif mean_pitch < 120:
tone_details.append("Lower pitched voice - may convey calmness or authority")
else:
tone_details.append("Mid-range pitch - typically perceived as balanced")
# Energy/volume characteristics
energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
if energy_cv < 10:
tone_details.append("Consistent volume - sounds controlled and measured")
elif energy_cv > 30:
tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")
# Speech rate approximation
if mean_zcr > 0.1:
tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
elif mean_zcr < 0.05:
tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")
# Generate tone summary and interpretation
tone_analysis = f"### Voice Tone Analysis\n\n"
tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
tone_analysis += "**Tone characteristics:**\n"
for detail in tone_details:
tone_analysis += f"- {detail}\n"
tone_analysis += "\n**Interpretation:**\n"
# Generate interpretation based on the classified tone
if tone_class == "Monotone":
tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
"Consider adding more vocal variety to sound more engaging and authentic.")
elif tone_class == "Steady":
tone_analysis += ("Your steady tone suggests reliability and control. "
"This can be effective in professional settings or when conveying serious information.")
elif tone_class == "Expressive":
tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
"This naturally engaging quality helps convey authenticity and conviction.")
else: # Highly Dynamic
tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
"This can be powerful for storytelling and persuasion, though in some contexts "
"a more measured approach might be appropriate.")
return tone_analysis
except Exception as e:
print(f"Error in tone analysis: {e}")
return "Tone analysis unavailable due to an error processing the audio."
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
"""
Analyze speech emotions in short chunks,
building a timeline of confidence for each emotion.
    Returns a Plotly figure, summary text, and a list of detailed results.
"""
if not load_emotion_model():
return None, "Failed to load emotion classifier.", None
# Use existing WAV if possible, else convert
if audio_file.endswith(".wav"):
audio_path = audio_file
else:
audio_path = convert_audio_to_wav(audio_file)
if not audio_path:
return None, "Could not process audio file", None
try:
# Load with librosa
audio_data, sample_rate = librosa.load(audio_path, sr=16000)
duration = len(audio_data) / sample_rate
# Use shorter chunks for more granular analysis
chunk_samples = int(chunk_duration * sample_rate)
num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
all_emotions = []
time_points = []
# For each chunk, run emotion classification
for i in range(num_chunks):
progress((i + 1) / num_chunks, "Analyzing audio emotions...")
start_idx = i * chunk_samples
end_idx = min(start_idx + chunk_samples, len(audio_data))
chunk = audio_data[start_idx:end_idx]
# Skip very short chunks
if len(chunk) < 0.5 * sample_rate:
continue
# Write chunk to temp WAV
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
chunk_path = temp_chunk.name
scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
# Classify - extract top-n predictions for each chunk
raw_results = audio_emotion_classifier(chunk_path, top_k=7) # Get all 7 emotions
os.unlink(chunk_path)
all_emotions.append(raw_results)
time_points.append((start_idx / sample_rate, end_idx / sample_rate))
# Skip if no valid emotions detected
if not all_emotions:
return None, "No speech detected in the audio.", None
# Build Plotly chart with improved styling
fig = build_plotly_line_chart(all_emotions, time_points, duration)
# Build summary and detailed results
summary_text = generate_emotion_summary(all_emotions)
detailed_results = build_detailed_results(all_emotions, time_points)
return fig, summary_text, detailed_results
except Exception as e:
import traceback
traceback.print_exc()
return None, f"Error analyzing audio: {str(e)}", None
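# Illustrative data shapes from the function above (values hypothetical): for a 5 s clip with
# chunk_duration=2, time_points is roughly [(0.0, 2.0), (2.0, 4.0), (4.0, 5.0)], and each entry
# of all_emotions is the pipeline's list of {"label": ..., "score": ...} dicts for that chunk.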
def smooth_data(data, window_size=3):
"""Apply a moving average smoothing to the data"""
smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
# Add back points that were lost in the convolution
padding = len(data) - len(smoothed)
if padding > 0:
# Add padding at the beginning
padding_front = padding // 2
padding_back = padding - padding_front
# Use the first/last values for padding
front_padding = [smoothed[0]] * padding_front
back_padding = [smoothed[-1]] * padding_back
smoothed = np.concatenate([front_padding, smoothed, back_padding])
return smoothed
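# Example: smooth_data([0.0, 1.0, 0.0, 1.0, 0.0]) applies a 3-point moving average and returns
# approximately [0.33, 0.33, 0.67, 0.33, 0.33] - the same length as the input, with the first
# and last smoothed values repeated to replace the points lost to the convolution.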
def build_plotly_line_chart(all_emotions, time_points, duration):
"""
Create an improved Plotly line chart with toggles for each emotion.
Shows all emotions for each time point rather than just the top one.
"""
emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
# Custom color scheme for emotions
colors = {
"angry": "#E53935", # Red
"disgust": "#8E24AA", # Purple
"fear": "#7B1FA2", # Deep Purple
"happy": "#FFC107", # Amber/Yellow
"neutral": "#78909C", # Blue Grey
"sad": "#1E88E5", # Blue
"surprise": "#43A047" # Green
}
# Prepare data structure for all emotions
emotion_data = {label: [] for label in emotion_labels}
timeline_times = [(start + end) / 2 for start, end in time_points]
# Process emotion scores - ensure all emotions have values
for chunk_emotions in all_emotions:
# Create a mapping of label to score for this chunk
scores = {item["label"]: item["score"] for item in chunk_emotions}
# Ensure all emotion labels have a value (default to 0.0)
for label in emotion_labels:
emotion_data[label].append(scores.get(label, 0.0))
# Smooth the data
for label in emotion_labels:
if len(emotion_data[label]) > 2:
emotion_data[label] = smooth_data(emotion_data[label])
# Build the chart
fig = go.Figure()
# Add traces for each emotion
for label in emotion_labels:
fig.add_trace(
go.Scatter(
x=timeline_times,
y=emotion_data[label],
mode='lines',
name=label.capitalize(),
line=dict(
color=colors.get(label, None),
width=3,
shape='spline', # Curved lines
smoothing=1.3
),
hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
)
)
# Add markers for dominant emotion at each point
dominant_markers_x = []
dominant_markers_y = []
dominant_markers_text = []
dominant_markers_color = []
for i, time in enumerate(timeline_times):
scores = {label: emotion_data[label][i] for label in emotion_labels}
dominant = max(scores.items(), key=lambda x: x[1])
dominant_markers_x.append(time)
dominant_markers_y.append(dominant[1])
dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
dominant_markers_color.append(colors.get(dominant[0], "#000000"))
fig.add_trace(
go.Scatter(
x=dominant_markers_x,
y=dominant_markers_y,
mode='markers',
marker=dict(
size=10,
color=dominant_markers_color,
line=dict(width=2, color='white')
),
name="Dominant Emotion",
text=dominant_markers_text,
hoverinfo="text",
hovertemplate='%{text}<extra></extra>'
)
)
# Add area chart for better visualization
for label in emotion_labels:
fig.add_trace(
go.Scatter(
x=timeline_times,
y=emotion_data[label],
mode='none',
name=f"{label.capitalize()} Area",
fill='tozeroy',
fillcolor=f"rgba{tuple(list(int(colors.get(label, '#000000').lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + [0.1])}",
showlegend=False,
hoverinfo='skip'
)
)
# Improve layout
fig.update_layout(
title={
'text': "Voice Emotion Analysis Over Time",
'font': {'size': 22, 'family': 'Arial, sans-serif'}
},
xaxis_title="Time (seconds)",
yaxis_title="Confidence Score",
yaxis=dict(
range=[0, 1.0],
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
xaxis=dict(
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
plot_bgcolor='white',
legend=dict(
bordercolor='rgba(0,0,0,0.1)',
borderwidth=1,
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1
),
hovermode='closest',
height=500, # Larger size for better viewing
margin=dict(l=10, r=10, t=80, b=50)
)
return fig
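# The inline fillcolor expression above converts a hex color plus a fixed alpha of 0.1 into an
# "rgba(r, g, b, 0.1)" string. An equivalent, more readable helper would look like this (sketch):
#     def hex_to_rgba(hex_color, alpha=0.1):
#         r, g, b = (int(hex_color.lstrip('#')[i:i + 2], 16) for i in (0, 2, 4))
#         return f"rgba({r}, {g}, {b}, {alpha})"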
def generate_alternative_chart(all_emotions, time_points):
"""
Create a stacked area chart to better visualize emotion changes over time
"""
emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
# Custom color scheme for emotions - more visible/distinct
colors = {
"angry": "#F44336", # Red
"disgust": "#9C27B0", # Purple
"fear": "#673AB7", # Deep Purple
"happy": "#FFC107", # Amber
"neutral": "#607D8B", # Blue Grey
"sad": "#2196F3", # Blue
"surprise": "#4CAF50" # Green
}
# Prepare timeline points
timeline_times = [(start + end) / 2 for start, end in time_points]
# Prepare data structure for all emotions
emotion_data = {label: [] for label in emotion_labels}
# Process emotion scores - ensure all emotions have values
for chunk_emotions in all_emotions:
# Create a mapping of label to score for this chunk
scores = {item["label"]: item["score"] for item in chunk_emotions}
# Ensure all emotion labels have a value (default to 0.0)
for label in emotion_labels:
emotion_data[label].append(scores.get(label, 0.0))
# Create the stacked area chart
fig = go.Figure()
# Add each emotion as a separate trace
for label in emotion_labels:
fig.add_trace(
go.Scatter(
x=timeline_times,
y=emotion_data[label],
mode='lines',
name=label.capitalize(),
line=dict(width=0.5, color=colors.get(label, None)),
stackgroup='one', # This makes it a stacked area chart
fillcolor=colors.get(label, None),
hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
)
)
# Improve layout
fig.update_layout(
title={
'text': "Voice Emotion Distribution Over Time",
'font': {'size': 22, 'family': 'Arial, sans-serif'}
},
xaxis_title="Time (seconds)",
yaxis_title="Emotion Intensity",
yaxis=dict(
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
xaxis=dict(
showgrid=True,
gridcolor='rgba(230, 230, 230, 0.8)'
),
plot_bgcolor='white',
legend=dict(
bordercolor='rgba(0,0,0,0.1)',
borderwidth=1,
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1
),
hovermode='closest',
height=500,
margin=dict(l=10, r=10, t=80, b=50)
)
return fig
def generate_emotion_summary(all_emotions):
"""
    Produce a textual summary of the overall emotion distribution and dominant tone.
"""
if not all_emotions:
return "No emotional content detected."
emotion_counts = {}
emotion_confidence = {}
total_chunks = len(all_emotions)
for chunk_emotions in all_emotions:
top_emotion = max(chunk_emotions, key=lambda x: x['score'])
label = top_emotion["label"]
confidence = top_emotion["score"]
emotion_counts[label] = emotion_counts.get(label, 0) + 1
emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence
# Calculate average confidence for each emotion
for emotion in emotion_confidence:
if emotion_counts[emotion] > 0:
emotion_confidence[emotion] /= emotion_counts[emotion]
# Dominant emotion (highest percentage)
dominant_emotion = max(emotion_counts, key=emotion_counts.get)
dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100
# Most confident emotion (might differ from dominant)
most_confident = max(emotion_confidence, key=emotion_confidence.get)
# Tone grouping analysis
tone_group_counts = {group: 0 for group in TONE_MAPPING}
for emotion, count in emotion_counts.items():
for tone_group, emotions in TONE_MAPPING.items():
if emotion in emotions:
tone_group_counts[tone_group] += count
dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100
# Build summary with markdown formatting
summary = f"### Voice Emotion Analysis Summary\n\n"
summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"
if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
summary += f"**Most confident detection:** {most_confident.capitalize()} "
summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"
summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"
# Show emotion distribution as sorted list
summary += "**Emotion distribution:**\n"
for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
percentage = (count / total_chunks) * 100
avg_conf = emotion_confidence[emotion]
summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"
# Add interpretation based on dominant emotion
summary += f"\n**Interpretation:**\n"
if dominant_emotion == "happy":
summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
elif dominant_emotion == "neutral":
summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
elif dominant_emotion == "sad":
summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
elif dominant_emotion == "angry":
summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
elif dominant_emotion == "fear":
summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
elif dominant_emotion == "disgust":
summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
elif dominant_emotion == "surprise":
summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."
return summary
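# Worked example of the summary arithmetic: if 6 of 10 chunks have "neutral" as their top
# emotion, dominant_pct = 6/10*100 = 60.0, and the tone grouping counts those 6 chunks toward
# the "neutral" tone group via TONE_MAPPING.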
def build_detailed_results(all_emotions, time_points):
"""
Return a list of dictionaries containing chunk start-end, top emotion, confidence, description.
Suitable for Gradio DataFrame display.
"""
results_list = []
for (emotions, (start_time, end_time)) in zip(all_emotions, time_points):
top_emotion = max(emotions, key=lambda x: x['score'])
label = top_emotion["label"]
# Find second highest emotion if available
if len(emotions) > 1:
sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
second_emotion = sorted_emotions[1]["label"].capitalize()
second_score = sorted_emotions[1]["score"]
secondary = f" ({second_emotion}: {second_score:.2f})"
else:
secondary = ""
results_list.append({
"Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
"Primary Emotion": label.capitalize(),
"Confidence": f"{top_emotion['score']:.2f}{secondary}",
"Description": EMOTION_DESCRIPTIONS.get(label, "")
})
return results_list
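# Illustrative shape of one detailed-results row (values hypothetical):
# {"Time Range": "0.0s - 2.0s", "Primary Emotion": "Neutral",
#  "Confidence": "0.81 (Happy: 0.12)", "Description": EMOTION_DESCRIPTIONS["neutral"]}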
def process_audio(audio_file, progress=gr.Progress()):
"""
    Main handler for Gradio:
    1) Emotion analysis (line chart, summary text, and detailed results table).
    2) Stacked-area chart rebuilt from the detailed results.
    3) Tone analysis (descriptive text).
"""
if not audio_file:
return None, None, "No audio file provided.", None, "No tone analysis."
# 1) Analyze emotions
fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
if not fig: # Error or missing
return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."
# 2) Generate alternative chart
# Extract the necessary data from detailed_results to create time_points
time_points = []
for result in detailed_results:
time_range = result["Time Range"]
start_time = float(time_range.split("s")[0])
end_time = float(time_range.split(" - ")[1].split("s")[0])
time_points.append((start_time, end_time))
# Extract emotion data from detailed_results
all_emotions = []
for result in detailed_results:
# Parse the primary emotion and confidence
primary_emotion = result["Primary Emotion"].lower()
confidence_str = result["Confidence"].split("(")[0].strip()
primary_confidence = float(confidence_str)
# Create a list of emotion dictionaries for this time point
emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]
# Check if there's a secondary emotion
if "(" in result["Confidence"]:
secondary_part = result["Confidence"].split("(")[1].split(")")[0]
secondary_emotion = secondary_part.split(":")[0].strip().lower()
secondary_confidence = float(secondary_part.split(":")[1].strip())
emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})
# Add remaining emotions with zero confidence
for emotion in EMOTION_DESCRIPTIONS.keys():
if emotion not in [e["label"] for e in emotions_at_time]:
emotions_at_time.append({"label": emotion, "score": 0.0})
all_emotions.append(emotions_at_time)
# Now we can generate the alternative chart
alt_fig = generate_alternative_chart(all_emotions, time_points)
# 3) Analyze tone
tone_analysis = analyze_voice_tone(audio_file)
return fig, alt_fig, summary_text, detailed_results, tone_analysis
# Create Gradio interface with improved UI/UX
with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# 🎙️ Voice Emotion & Tone Analysis System
This app provides professional analysis of:
- **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
- **Tone characteristics** (based on pitch, energy, and speech patterns)
The interactive timeline shows emotion confidence scores throughout your audio.
""")
with gr.Tabs():
# Tab 1: Upload
with gr.TabItem("Upload Audio"):
with gr.Row():
with gr.Column(scale=1):
audio_input = gr.Audio(
label="Upload Audio File",
type="filepath",
sources=["upload"],
elem_id="audio_upload"
)
process_btn = gr.Button("Analyze Voice", variant="primary")
gr.Markdown("""
**Supports:** MP3, WAV, M4A, and most audio formats
**For best results:** Use a clear voice recording with minimal background noise
""")
with gr.Column(scale=2):
with gr.Tabs():
with gr.TabItem("Line Chart"):
emotion_timeline = gr.Plot(label="Emotion Timeline",
elem_id="emotion_plot",
container=True)
with gr.TabItem("Area Chart"):
emotion_area_chart = gr.Plot(label="Emotion Distribution",
elem_id="emotion_area_plot",
container=True)
with gr.Row():
with gr.Column():
emotion_summary = gr.Markdown(label="Emotion Summary")
with gr.Column():
tone_analysis_output = gr.Markdown(label="Tone Analysis")
with gr.Row():
emotion_results = gr.DataFrame(
headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
label="Detailed Emotion Analysis"
)
process_btn.click(
fn=process_audio,
inputs=[audio_input],
outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
)
# Tab 2: Record
with gr.TabItem("Record Voice"):
with gr.Row():
with gr.Column(scale=1):
record_input = gr.Audio(
label="Record Your Voice",
sources=["microphone"],
type="filepath",
elem_id="record_audio"
)
analyze_btn = gr.Button("Analyze Recording", variant="primary")
gr.Markdown("""
**Tips:**
- Speak clearly and at a normal pace
- Record at least 10-15 seconds for more accurate analysis
- Try different emotional tones to see how they're detected
""")
with gr.Column(scale=2):
with gr.Tabs():
with gr.TabItem("Line Chart"):
rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
elem_id="record_emotion_plot",
container=True)
with gr.TabItem("Area Chart"):
rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
elem_id="record_emotion_area_plot",
container=True)
with gr.Row():
with gr.Column():
rec_emotion_summary = gr.Markdown(label="Emotion Summary")
with gr.Column():
rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
with gr.Row():
rec_emotion_results = gr.DataFrame(
headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
label="Detailed Emotion Analysis"
)
analyze_btn.click(
fn=process_audio,
inputs=[record_input],
outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
)
# Tab 3: About & Help
with gr.TabItem("About & Help"):
gr.Markdown("""
## About This System
This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.
### How It Works
1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.
### Emotion Categories
The system detects seven standard emotions:
- **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
- **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
- **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
- **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
- **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
- **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
- **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.
### Tips for Best Results
- Use clear audio with minimal background noise
- Speak naturally at a comfortable volume
- Record at least 10-15 seconds of speech
- For tone analysis, longer recordings (30+ seconds) provide more accurate results
### Privacy Notice
All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
""")
gr.Markdown("""
---
### System Information
- **Model**: HuBERT Large for Speech Emotion Recognition
- **Version**: 1.2.0
- **Libraries**: PyTorch, Transformers, Librosa, Plotly
This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
""")
# Check if model can load before launching interface
print("Checking model availability...")
load_success = load_emotion_model()
if not load_success:
print("Warning: Emotion model failed to load. Application may have limited functionality.")
# Launch the demo
if __name__ == "__main__":
demo.launch()