KavyaBansal committed
Commit d329cd5 · verified · 1 Parent(s): d91a71a

Create app.py

Files changed (1):
  1. app.py +1369 -0

app.py ADDED
@@ -0,0 +1,1369 @@
# Tone Classification System
# This implementation combines text and acoustic features to detect emotions,
# including sarcasm and figures of speech

# Part 1: Install required packages with improved error handling
import sys
import os
import subprocess

# Function to install packages with error handling
def install_packages():
    packages = [
        "hf_xet", "transformers", "pytorch-lightning", "datasets",
        "numpy", "pandas", "matplotlib", "seaborn",
        "librosa", "opensmile", "torch", "torchaudio",
        "accelerate", "nltk", "scikit-learn"
    ]

    for package in packages:
        try:
            print(f"Installing {package}...")
            # Plain-Python equivalent of the notebook-style "!pip install" call
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
            print(f"Successfully installed {package}")
        except Exception as e:
            print(f"Error installing {package}: {e}")

    print("Package installation completed!")

install_packages()

# Part 2: Import libraries with error handling
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Check for CUDA availability
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Try to import libraries that might cause issues with specific error handling
try:
    import torchaudio
    print("Successfully imported torchaudio")
except Exception as e:
    print(f"Error importing torchaudio: {e}")
    print("Some audio functionality may be limited")

try:
    import librosa
    print("Successfully imported librosa")
except Exception as e:
    print(f"Error importing librosa: {e}")
    print("Audio processing capabilities will be limited")

try:
    import opensmile
    print("Successfully imported opensmile")
except Exception as e:
    print(f"Error importing opensmile: {e}")
    print("Will use fallback feature extraction methods")

# Part 3: Define constants
EMOTIONS = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised", "sarcastic"]
MODEL_CACHE_DIR = "./model_cache"

# Create cache directory if it doesn't exist
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
print(f"Using model cache directory: {MODEL_CACHE_DIR}")

# Part 4: Model Loading with Error Handling and Cache
def load_model_with_cache(model_class, model_name, cache_subdir=""):
    """Load a model with proper error handling and caching"""
    cache_path = os.path.join(MODEL_CACHE_DIR, cache_subdir)
    os.makedirs(cache_path, exist_ok=True)

    print(f"Loading model: {model_name}")
    try:
        model = model_class.from_pretrained(
            model_name,
            cache_dir=cache_path,
            local_files_only=os.path.exists(os.path.join(cache_path, model_name.replace('/', '-')))
        )
        print(f"Successfully loaded model: {model_name}")
        return model
    except KeyboardInterrupt:
        print("\nModel download interrupted. Try again or download manually.")
        return None
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        print("Will try to continue with limited functionality.")
        return None

# Part 5: Modified Whisper Transcriber with Error Handling
class WhisperTranscriber:
    def __init__(self, model_size="tiny"):  # Changed from base to tiny for faster loading
        from transformers import WhisperProcessor, WhisperForConditionalGeneration
        print("Initializing Whisper transcriber...")

        try:
            self.processor = load_model_with_cache(
                WhisperProcessor,
                f"openai/whisper-{model_size}",
                "whisper"
            )
            self.model = load_model_with_cache(
                WhisperForConditionalGeneration,
                f"openai/whisper-{model_size}",
                "whisper"
            )

            if self.model is not None:
                self.model = self.model.to(DEVICE)
                print("Whisper model loaded successfully and moved to device")
            else:
                print("Failed to load Whisper model")

        except Exception as e:
            print(f"Error initializing Whisper: {e}")
            self.processor = None
            self.model = None

    def transcribe(self, audio_path):
        if self.processor is None or self.model is None:
            print("Whisper not properly initialized. Cannot transcribe.")
            return "Error: Transcription failed."

        try:
            # Load audio
            waveform, sample_rate = librosa.load(audio_path, sr=16000)

            # Process audio
            input_features = self.processor(waveform, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)

            # Generate transcription
            with torch.no_grad():
                predicted_ids = self.model.generate(input_features, max_length=100)

            # Decode the transcription
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            return transcription

        except Exception as e:
            print(f"Error in transcription: {e}")
            return "Error: Transcription failed."

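# Illustrative usage sketch for the transcriber above. The "sample.wav" path is a
# placeholder, not a file shipped with this repo; any mono speech clip should work.
def _example_transcription(audio_path="sample.wav"):
    transcriber = WhisperTranscriber(model_size="tiny")
    return transcriber.transcribe(audio_path)
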
# Part 6: Text-based Emotion Analysis with Fallback Options
# Improved Text-based Emotion Analysis
class TextEmotionClassifier:
    def __init__(self):
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        print("Initializing text emotion classifier...")

        # Primary emotion model
        self.emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
        self.tokenizer = load_model_with_cache(
            AutoTokenizer,
            self.emotion_model_name,
            "text_emotion"
        )
        self.model = load_model_with_cache(
            AutoModelForSequenceClassification,
            self.emotion_model_name,
            "text_emotion"
        )

        if self.model is not None:
            self.model = self.model.to(DEVICE)

        # Sentiment model for sarcasm detection
        self.sentiment_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
        self.sarcasm_tokenizer = load_model_with_cache(
            AutoTokenizer,
            self.sentiment_model_name,
            "sentiment"
        )
        self.sarcasm_model = load_model_with_cache(
            AutoModelForSequenceClassification,
            self.sentiment_model_name,
            "sentiment"
        )

        if self.sarcasm_model is not None:
            self.sarcasm_model = self.sarcasm_model.to(DEVICE)

        # Enhanced keyword-based analyzer as fallback and enhancement
        self.keyword_analyzer = EnhancedKeywordEmotionAnalyzer()

    def predict_emotion(self, text):
        if self.tokenizer is None or self.model is None:
            print("Text emotion model not properly initialized.")
            # Use keyword-based analysis as primary method in this case
            return self.keyword_analyzer.analyze(text)

        try:
            # Get model predictions
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Get probabilities from model
            model_probs = F.softmax(outputs.logits, dim=1).cpu().numpy()[0]

            # Get keyword-based analysis
            keyword_probs = self.keyword_analyzer.analyze(text)

            # Combine both methods with weighting
            # If text contains strong emotional keywords, give more weight to keyword analysis
            keyword_strength = self.keyword_analyzer.get_keyword_strength(text)

            # Adaptive weighting based on keyword strength
            keyword_weight = min(0.6, keyword_strength * 0.1)  # Cap at 0.6
            model_weight = 1.0 - keyword_weight

            # Combine predictions
            combined_probs = (model_weight * model_probs) + (keyword_weight * keyword_probs)

            # Normalize to ensure sum is 1
            combined_probs = combined_probs / np.sum(combined_probs)

            return combined_probs

        except Exception as e:
            print(f"Error in text emotion prediction: {e}")
            # Fallback to keyword analysis
            return self.keyword_analyzer.analyze(text)

    def detect_sarcasm(self, text):
        if self.sarcasm_tokenizer is None or self.sarcasm_model is None:
            print("Sarcasm model not properly initialized.")
            # Use keyword-based sarcasm detection as fallback
            return self.keyword_analyzer.detect_sarcasm(text)

        try:
            inputs = self.sarcasm_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
            with torch.no_grad():
                outputs = self.sarcasm_model(**inputs)

            sentiment_probs = F.softmax(outputs.logits, dim=1).cpu().numpy()[0]

            # Enhance with keyword-based sarcasm detection
            keyword_sarcasm = self.keyword_analyzer.detect_sarcasm(text)

            # If keyword analysis strongly suggests sarcasm, blend with model prediction
            if keyword_sarcasm[2] > 0.5:  # If sarcasm probability is high from keywords
                # Give 40% weight to keyword analysis
                combined_probs = 0.6 * sentiment_probs + 0.4 * keyword_sarcasm
                return combined_probs

            return sentiment_probs

        except Exception as e:
            print(f"Error in sarcasm detection: {e}")
            # Fallback to keyword analysis
            return self.keyword_analyzer.detect_sarcasm(text)

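# Illustrative usage sketch: score a single sentence with the text classifier and
# pair each probability with its label. The sentence is arbitrary; if the
# transformer models are unavailable, the keyword fallback supplies the scores.
def _example_text_emotion(text="I am absolutely thrilled about this!"):
    classifier = TextEmotionClassifier()
    probs = np.asarray(classifier.predict_emotion(text))
    return dict(zip(EMOTIONS, probs.tolist()))
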
# Enhanced keyword-based emotion analyzer
class EnhancedKeywordEmotionAnalyzer:
    def __init__(self):
        # Enhanced emotion keywords with weights
        self.emotion_keywords = {
            "happy": [
                ("happy", 1.0), ("joy", 1.0), ("delight", 0.9), ("excited", 0.9),
                ("glad", 0.8), ("pleased", 0.8), ("cheerful", 0.9), ("smile", 0.7),
                ("enjoy", 0.8), ("wonderful", 0.8), ("great", 0.7), ("excellent", 0.8),
                ("thrilled", 1.0), ("ecstatic", 1.0), ("content", 0.7), ("satisfied", 0.7),
                ("pleasure", 0.8), ("fantastic", 0.9), ("awesome", 0.9), ("love", 0.9),
                ("amazing", 0.9), ("perfect", 0.8), ("fun", 0.8), ("delighted", 1.0)
            ],
            "sad": [
                ("sad", 1.0), ("unhappy", 0.9), ("depressed", 1.0), ("sorrow", 1.0),
                ("grief", 1.0), ("tearful", 0.9), ("miserable", 1.0), ("disappointed", 0.8),
                ("upset", 0.8), ("down", 0.7), ("heartbroken", 1.0), ("gloomy", 0.9),
                ("devastated", 1.0), ("hurt", 0.8), ("blue", 0.7), ("regret", 0.8),
                ("dejected", 0.9), ("dismal", 0.9), ("lonely", 0.8), ("terrible", 0.8),
                ("hopeless", 0.9), ("lost", 0.7), ("crying", 0.9), ("tragic", 0.9)
            ],
            "angry": [
                ("angry", 1.0), ("mad", 0.9), ("furious", 1.0), ("annoyed", 0.8),
                ("irritated", 0.8), ("enraged", 1.0), ("livid", 1.0), ("outraged", 1.0),
                ("frustrated", 0.8), ("infuriated", 1.0), ("pissed", 0.9), ("hate", 0.9),
                ("hostile", 0.9), ("bitter", 0.8), ("resentful", 0.8), ("fuming", 0.9),
                ("irate", 1.0), ("outraged", 1.0), ("seething", 1.0), ("cross", 0.7),
                ("exasperated", 0.8), ("disgusted", 0.8), ("indignant", 0.9), ("rage", 1.0)
            ],
            "fearful": [
                ("afraid", 1.0), ("scared", 1.0), ("frightened", 1.0), ("fear", 0.9),
                ("terror", 1.0), ("panic", 1.0), ("horrified", 1.0), ("worried", 0.8),
                ("anxious", 0.9), ("nervous", 0.8), ("terrified", 1.0), ("dread", 0.9),
                ("alarmed", 0.8), ("petrified", 1.0), ("threatened", 0.8), ("intimidated", 0.8),
                ("apprehensive", 0.8), ("uneasy", 0.7), ("tense", 0.7), ("stressed", 0.7),
                ("spooked", 0.9), ("paranoid", 0.9), ("freaked", 0.9), ("jumpy", 0.8)
            ],
            "disgust": [
                ("disgust", 1.0), ("gross", 0.9), ("repulsed", 1.0), ("revolted", 1.0),
                ("sick", 0.8), ("nauseous", 0.8), ("yuck", 0.9), ("ew", 0.8),
                ("nasty", 0.9), ("repugnant", 1.0), ("foul", 0.9), ("appalled", 0.9),
                ("sickened", 0.9), ("offended", 0.8), ("distaste", 0.9), ("aversion", 0.9),
                ("abhorrent", 1.0), ("odious", 1.0), ("repellent", 1.0), ("objectionable", 0.8),
                ("detestable", 1.0), ("loathsome", 1.0), ("vile", 1.0), ("horrid", 0.9)
            ],
            "surprised": [
                ("surprised", 1.0), ("shocked", 0.9), ("astonished", 1.0), ("amazed", 0.9),
                ("startled", 0.9), ("stunned", 0.9), ("speechless", 0.8), ("unexpected", 0.8),
                ("wow", 0.8), ("whoa", 0.8), ("unbelievable", 0.8), ("incredible", 0.8),
                ("dumbfounded", 1.0), ("flabbergasted", 1.0), ("staggered", 0.9), ("aghast", 0.9),
                ("astounded", 1.0), ("taken aback", 0.9), ("disbelief", 0.8), ("bewildered", 0.8),
                ("thunderstruck", 1.0), ("wonder", 0.7), ("sudden", 0.6), ("jaw-dropping", 0.9)
            ],
            "neutral": [
                ("okay", 0.7), ("fine", 0.7), ("alright", 0.7), ("normal", 0.8),
                ("calm", 0.8), ("steady", 0.8), ("balanced", 0.8), ("ordinary", 0.8),
                ("routine", 0.8), ("regular", 0.8), ("standard", 0.8), ("moderate", 0.8),
                ("usual", 0.8), ("typical", 0.8), ("average", 0.8), ("common", 0.8),
                ("so-so", 0.7), ("fair", 0.7), ("acceptable", 0.7), ("stable", 0.8),
                ("unchanged", 0.8), ("plain", 0.7), ("mild", 0.7), ("middle-of-the-road", 0.8)
            ],
            "sarcastic": [
                ("yeah right", 1.0), ("sure thing", 0.9), ("oh great", 0.9), ("how wonderful", 0.9),
                ("wow", 0.7), ("really", 0.7), ("obviously", 0.8), ("definitely", 0.7),
                ("of course", 0.7), ("totally", 0.7), ("exactly", 0.7), ("perfect", 0.7),
                ("brilliant", 0.8), ("genius", 0.8), ("whatever", 0.8), ("right", 0.7),
                ("nice job", 0.8), ("good one", 0.8), ("bravo", 0.8), ("slow clap", 1.0),
                ("im shocked", 0.9), ("never would have guessed", 0.9), ("shocking", 0.7), ("unbelievable", 0.7)
            ]
        }

        # Sarcasm indicators
        self.sarcasm_indicators = [
            "yeah right", "sure thing", "oh great", "riiiight", "suuure",
            "*slow clap*", "/s", "wow just wow", "you don't say", "no kidding",
            "what a surprise", "shocker", "congratulations", "well done", "genius",
            "oh wow", "oh really", "totally", "absolutely", "clearly", "obviously",
            "genius idea", "brilliant plan", "fantastic job", "amazing work"
        ]

        # Negation words
        self.negations = [
            "not", "no", "never", "none", "nothing", "neither", "nor", "nowhere",
            "hardly", "scarcely", "barely", "doesn't", "isn't", "wasn't", "shouldn't",
            "wouldn't", "couldn't", "won't", "can't", "don't", "didn't", "haven't"
        ]

        # Intensifiers
        self.intensifiers = [
            "very", "really", "extremely", "absolutely", "completely", "totally",
            "utterly", "quite", "particularly", "especially", "remarkably", "truly",
            "so", "too", "such", "incredibly", "exceedingly", "extraordinarily"
        ]

        # Compile patterns for more efficient matching
        import re
        self.emotion_patterns = {}
        for emotion, keywords in self.emotion_keywords.items():
            self.emotion_patterns[emotion] = [
                (re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE), weight)
                for word, weight in keywords
            ]

        self.negation_pattern = re.compile(r'\b(' + '|'.join(re.escape(n) for n in self.negations) + r')\s+(\w+)', re.IGNORECASE)
        self.intensifier_pattern = re.compile(r'\b(' + '|'.join(re.escape(i) for i in self.intensifiers) + r')\s+(\w+)', re.IGNORECASE)

    def analyze(self, text):
        """
        Analyze text for emotions using enhanced keyword matching
        Returns numpy array of emotion probabilities
        """
        # Initialize scores
        emotion_scores = {emotion: 0.0 for emotion in EMOTIONS}

        # Set base score for neutral
        emotion_scores["neutral"] = 1.0

        # Convert to lowercase for case-insensitive matching
        text_lower = text.lower()

        # Process each emotion
        for emotion, patterns in self.emotion_patterns.items():
            for pattern, weight in patterns:
                matches = pattern.findall(text_lower)
                if matches:
                    # Add score based on number of matches and their weights
                    emotion_scores[emotion] += len(matches) * weight

        # Process negations - look for "not happy" patterns
        negation_matches = self.negation_pattern.finditer(text_lower)
        for match in negation_matches:
            negation, word = match.groups()
            # Check if the negated word is in any emotion keywords
            for emotion, keywords in self.emotion_keywords.items():
                if any(word == kw[0] for kw in keywords):
                    # Reduce score for this emotion and slightly increase opposite emotions
                    emotion_scores[emotion] -= 0.7

                    # Increase opposite emotions (e.g., if "not happy", increase "sad")
                    if emotion == "happy":
                        emotion_scores["sad"] += 0.3
                    elif emotion == "sad":
                        emotion_scores["happy"] += 0.3

        # Process intensifiers - "very happy" should increase score
        intensifier_matches = self.intensifier_pattern.finditer(text_lower)
        for match in intensifier_matches:
            intensifier, word = match.groups()
            # Check if the intensified word is in any emotion keywords
            for emotion, keywords in self.emotion_keywords.items():
                if any(word == kw[0] for kw in keywords):
                    # Increase score for this emotion
                    emotion_scores[emotion] += 0.5

        # Ensure no negative scores
        for emotion in emotion_scores:
            emotion_scores[emotion] = max(0, emotion_scores[emotion])

        # Normalize to probabilities
        total = sum(emotion_scores.values())
        if total > 0:
            probs = {emotion: score / total for emotion, score in emotion_scores.items()}
        else:
            # If no emotions detected, default to neutral
            probs = {emotion: 0.0 for emotion in EMOTIONS}
            probs["neutral"] = 1.0

        # Convert to numpy array in the same order as EMOTIONS
        return np.array([probs[emotion] for emotion in EMOTIONS])

    def detect_sarcasm(self, text):
        """
        Detect sarcasm in text
        Returns [negative, neutral, positive] probability array where high "positive"
        with negative context indicates sarcasm
        """
        text_lower = text.lower()
        sarcasm_score = 0.0

        # Check for direct sarcasm indicators
        for indicator in self.sarcasm_indicators:
            if indicator in text_lower:
                sarcasm_score += 0.3

        # Check for common sarcasm patterns
        positive_words = [kw[0] for kw in self.emotion_keywords["happy"]]
        has_positive = any(word in text_lower for word in positive_words)

        negative_context = any(neg in text_lower for neg in ["terrible", "awful", "horrible", "fail", "disaster", "mess"])

        # Positive words in negative context suggests sarcasm
        if has_positive and negative_context:
            sarcasm_score += 0.4

        # Check for excessive punctuation which might indicate sarcasm
        if "!!!" in text or "?!" in text:
            sarcasm_score += 0.2

        # Cap the score
        sarcasm_score = min(1.0, sarcasm_score)

        # If sarcasm detected, return sentiment array biased toward sarcasm
        # [negative, neutral, positive] - high positive with negative context indicates sarcasm
        if sarcasm_score > 0.3:
            return np.array([0.1, 0.1, 0.8])  # High positive signal for sarcasm detection
        else:
            # Return balanced array (no strong indication of sarcasm)
            return np.array([0.33, 0.34, 0.33])

    def get_keyword_strength(self, text):
        """
        Measure the strength of emotional keywords in the text
        Returns a value between 0 and 10
        """
        text_lower = text.lower()
        total_matches = 0
        weighted_matches = 0

        # Count all matches across all emotions with their weights
        for emotion, patterns in self.emotion_patterns.items():
            for pattern, weight in patterns:
                matches = pattern.findall(text_lower)
                total_matches += len(matches)
                weighted_matches += len(matches) * weight

        # Calculate strength score on a scale of 0-10
        if total_matches > 0:
            avg_weight = weighted_matches / total_matches
            # Scale based on number of matches and their average weight
            strength = min(10, (total_matches * avg_weight) / 2)
            return strength
        else:
            return 0.0

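# Illustrative usage sketch: the keyword analyzer needs no downloaded models, so it
# is handy for a quick offline check. The sample sentence is arbitrary.
def _example_keyword_analysis(text="Oh great, another Monday. Just perfect."):
    analyzer = EnhancedKeywordEmotionAnalyzer()
    emotion_probs = dict(zip(EMOTIONS, analyzer.analyze(text)))
    sarcasm_probs = analyzer.detect_sarcasm(text)  # [negative, neutral, positive]
    return emotion_probs, sarcasm_probs
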
# Part 7: Acoustic Feature Extraction with Fallback
class AcousticFeatureExtractor:
    def __init__(self):
        self.use_opensmile = True
        try:
            import opensmile
            # Initialize OpenSMILE with the eGeMAPS feature set instead of ComParE_2016
            # eGeMAPS is specifically designed for voice analysis and emotion recognition
            self.smile = opensmile.Smile(
                feature_set=opensmile.FeatureSet.eGeMAPSv02,
                feature_level=opensmile.FeatureLevel.Functionals,
            )
            print("OpenSMILE feature extractor initialized successfully with eGeMAPS")
        except Exception as e:
            print(f"Failed to initialize OpenSMILE: {e}")
            print("Using librosa for feature extraction instead.")
            self.use_opensmile = False

    def extract_features(self, audio_path):
        try:
            if self.use_opensmile:
                # Use OpenSMILE for feature extraction
                features = self.smile.process_file(audio_path)
                return features.values
            else:
                # Fallback to improved librosa feature extraction
                return self._extract_librosa_features(audio_path)
        except Exception as e:
            print(f"Error in acoustic feature extraction: {e}")
            print("Using dummy features as fallback")
            # Return dummy features in case of error
            return np.zeros(88)  # eGeMAPS dimension

    def _extract_librosa_features(self, audio_path):
        """Improved librosa feature extraction focusing on emotion-relevant features"""
        try:
            # Load audio
            y, sr = librosa.load(audio_path, sr=22050)

            # Extract features specifically relevant to emotion detection

            # 1. Pitch features (fundamental frequency)
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            pitch_mean = np.mean(pitches[magnitudes > np.median(magnitudes)])
            pitch_std = np.std(pitches[magnitudes > np.median(magnitudes)])

            # 2. Energy/intensity features
            rms = librosa.feature.rms(y=y)[0]
            energy_mean = np.mean(rms)
            energy_std = np.std(rms)

            # 3. Tempo and rhythm features
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

            # 4. Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]

            # 5. Voice quality features
            zero_crossing_rate = librosa.feature.zero_crossing_rate(y)[0]

            # Compute statistics for each feature
            features = []
            for feature in [spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate]:
                features.extend([np.mean(feature), np.std(feature), np.min(feature), np.max(feature)])

            # Add pitch and energy features
            features.extend([pitch_mean, pitch_std, energy_mean, energy_std, tempo])

            # Add MFCCs (critical for speech emotion)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            for mfcc in mfccs:
                features.extend([np.mean(mfcc), np.std(mfcc)])

            # Convert to numpy array
            features = np.array(features)

            # Handle NaN values
            features = np.nan_to_num(features)

            # Pad or truncate to match eGeMAPS dimension (88)
            if len(features) < 88:
                features = np.pad(features, (0, 88 - len(features)))
            else:
                features = features[:88]

            return features

        except Exception as e:
            print(f"Error in librosa feature extraction: {e}")
            return np.zeros(88)  # Same dimension as eGeMAPS

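# Illustrative usage sketch: extract one fixed-length feature vector for a
# placeholder file path; both the OpenSMILE path and the librosa fallback end up
# as an 88-value array downstream.
def _example_acoustic_features(audio_path="sample.wav"):
    extractor = AcousticFeatureExtractor()
    features = np.asarray(extractor.extract_features(audio_path)).flatten()
    return features.shape, features[:5]
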
# Part 8: Acoustic Emotion Classifier
class AcousticEmotionClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_classes=len(EMOTIONS)):
        super().__init__()

        # Normalize input features
        self.batch_norm = nn.BatchNorm1d(input_dim)

        # Feature extraction layers
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Emotion classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, num_classes)
        )

        # Initialize weights properly
        self._init_weights()

    def _init_weights(self):
        """Initialize weights with Xavier initialization"""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        # Handle different input shapes
        if len(x.shape) == 1:
            x = x.unsqueeze(0)  # Add batch dimension

        # Normalize features
        x = self.batch_norm(x)

        # Extract features
        features = self.feature_extractor(x)

        # Classify emotions
        output = self.classifier(features)

        return output

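# Illustrative shape check: a forward pass on random features with the eGeMAPS
# dimensionality (88). Eval mode lets BatchNorm handle a batch of one; the model
# is untrained here, so the logits are meaningless beyond their shape.
def _example_acoustic_classifier_forward():
    model = AcousticEmotionClassifier(input_dim=88)
    model.eval()
    with torch.no_grad():
        logits = model(torch.randn(1, 88))
    return logits.shape  # expected: torch.Size([1, len(EMOTIONS)])
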
class PretrainedAudioClassifier:
    """A rule-based classifier for audio emotion detection until proper training"""

    def __init__(self):
        # Define acoustic feature thresholds for emotions based on research
        # These are simplified heuristics based on acoustic phonetics research
        self.feature_thresholds = {
            "happy": {
                "pitch_mean": (220, 400),   # Higher pitch for happiness
                "energy_mean": (0.6, 1.0),  # Higher energy
                "speech_rate": (0.8, 1.0)   # Faster speech rate
            },
            "sad": {
                "pitch_mean": (100, 220),   # Lower pitch for sadness
                "energy_mean": (0.1, 0.5),  # Lower energy
                "speech_rate": (0.3, 0.7)   # Slower speech rate
            },
            "angry": {
                "pitch_mean": (250, 400),   # Higher pitch for anger
                "energy_mean": (0.7, 1.0),  # Higher energy
                "speech_rate": (0.7, 1.0)   # Faster speech rate
            },
            "fearful": {
                "pitch_mean": (200, 350),   # Higher pitch
                "energy_mean": (0.4, 0.8),  # Medium energy
                "speech_rate": (0.6, 0.9)   # Medium-fast speech rate
            },
            "neutral": {
                "pitch_mean": (180, 240),   # Medium pitch
                "energy_mean": (0.3, 0.6),  # Medium energy
                "speech_rate": (0.4, 0.7)   # Medium speech rate
            }
        }

    def extract_key_features(self, audio_path):
        """Extract key acoustic features for rule-based classification"""
        try:
            y, sr = librosa.load(audio_path, sr=22050)

            # Extract pitch
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            pitch_mean = np.mean(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 200

            # Normalize pitch to 0-1 range (assuming human pitch range 80-400 Hz)
            pitch_mean_norm = (pitch_mean - 80) / (400 - 80)
            pitch_mean_norm = max(0, min(1, pitch_mean_norm))

            # Extract energy
            rms = librosa.feature.rms(y=y)[0]
            energy_mean = np.mean(rms)

            # Normalize energy
            energy_mean_norm = energy_mean / 0.1  # Assuming 0.1 is a reasonable max RMS
            energy_mean_norm = max(0, min(1, energy_mean_norm))

            # Estimate speech rate from onsets
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)
            if len(onsets) > 1:
                speech_rate = len(onsets) / (len(y) / sr)  # Onsets per second
                speech_rate_norm = min(1.0, speech_rate / 5.0)  # Normalize, assuming 5 onsets/sec is fast
            else:
                speech_rate_norm = 0.5  # Default to medium if can't detect

            return {
                "pitch_mean": pitch_mean_norm,
                "energy_mean": energy_mean_norm,
                "speech_rate": speech_rate_norm
            }

        except Exception as e:
            print(f"Error extracting key features: {e}")
            return {
                "pitch_mean": 0.5,  # Default to medium values
                "energy_mean": 0.5,
                "speech_rate": 0.5
            }

    def predict(self, audio_path):
        """Predict emotion based on acoustic features"""
        # Extract key features
        features = self.extract_key_features(audio_path)

        # Calculate match scores for each emotion
        emotion_scores = {}
        for emotion, thresholds in self.feature_thresholds.items():
            score = 0
            for feature, (min_val, max_val) in thresholds.items():
                # Normalize threshold to 0-1 range
                min_norm = (min_val - 80) / (400 - 80) if feature == "pitch_mean" else min_val
                max_norm = (max_val - 80) / (400 - 80) if feature == "pitch_mean" else max_val

                # Check if feature is in the emotion's range
                if min_norm <= features[feature] <= max_norm:
                    # Higher score if closer to the middle of the range
                    middle = (min_norm + max_norm) / 2
                    distance = abs(features[feature] - middle) / ((max_norm - min_norm) / 2)
                    feature_score = 1 - distance
                    score += feature_score
                else:
                    # Penalty for being outside the range
                    score -= 0.5

            emotion_scores[emotion] = max(0, score)

        # Add small values for other emotions not in our basic set
        for emotion in EMOTIONS:
            if emotion not in emotion_scores:
                emotion_scores[emotion] = 0.1

        # Normalize scores to probabilities
        total = sum(emotion_scores.values())
        if total > 0:
            probs = {emotion: score / total for emotion, score in emotion_scores.items()}
        else:
            # Default to neutral if all scores are 0
            probs = {emotion: 0.1 for emotion in EMOTIONS}
            probs["neutral"] = 0.5

        # Convert to array in the same order as EMOTIONS
        return np.array([probs[emotion] for emotion in EMOTIONS])

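# Illustrative usage sketch: the rule-based audio classifier only needs a file path
# ("sample.wav" is a placeholder). The returned vector follows the EMOTIONS order.
def _example_rule_based_audio(audio_path="sample.wav"):
    classifier = PretrainedAudioClassifier()
    key_features = classifier.extract_key_features(audio_path)  # normalized pitch/energy/rate
    probs = dict(zip(EMOTIONS, classifier.predict(audio_path)))
    return key_features, probs
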
# Part 9: Improved Fusion Model for combining text and acoustic predictions
class AdaptiveModalityFusionModel(nn.Module):
    def __init__(self, text_dim, acoustic_dim, hidden_dim=128, num_classes=len(EMOTIONS)):
        super().__init__()

        # Confidence estimators for each modality
        self.text_confidence = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

        self.acoustic_confidence = nn.Sequential(
            nn.Linear(acoustic_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

        # Feature transformation
        self.text_transform = nn.Linear(text_dim, hidden_dim)
        self.acoustic_transform = nn.Linear(acoustic_dim, hidden_dim)

        # Final classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, num_classes),
            nn.Softmax(dim=1)
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, text_features, acoustic_features):
        # Estimate confidence for each modality
        text_conf = self.text_confidence(text_features)
        acoustic_conf = self.acoustic_confidence(acoustic_features)

        # Normalize confidences to sum to 1
        total_conf = text_conf + acoustic_conf
        text_weight = text_conf / total_conf
        acoustic_weight = acoustic_conf / total_conf

        # Transform features
        text_transformed = self.text_transform(text_features)
        acoustic_transformed = self.acoustic_transform(acoustic_features)

        # Weighted combination
        combined = text_weight * text_transformed + acoustic_weight * acoustic_transformed

        # Classification
        output = self.classifier(combined)

        return output

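# Illustrative shape check for the fusion model, which is defined here but not
# wired into the pipeline below. The 8-dimensional inputs are an assumption made
# only for this sketch (one probability per entry in EMOTIONS).
def _example_fusion_forward():
    fusion = AdaptiveModalityFusionModel(text_dim=len(EMOTIONS), acoustic_dim=len(EMOTIONS))
    fusion.eval()
    with torch.no_grad():
        text_probs = torch.rand(1, len(EMOTIONS))
        acoustic_probs = torch.rand(1, len(EMOTIONS))
        fused = fusion(text_probs, acoustic_probs)
    return fused.shape  # expected: torch.Size([1, len(EMOTIONS)])
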
# Part 10: Simple Rule-based Fallback Classifier
class RuleBasedClassifier:
    """A simple rule-based classifier for fallback when models fail"""

    def predict(self, text):
        """Predict emotion based on simple word matching"""
        text = text.lower()

        # Simple emotion keywords
        emotion_keywords = {
            "happy": ["happy", "joy", "delight", "excited", "glad", "pleased", "cheerful", "smile"],
            "sad": ["sad", "unhappy", "depressed", "sorrow", "grief", "tearful", "miserable"],
            "angry": ["angry", "mad", "furious", "annoyed", "irritated", "enraged", "livid"],
            "fearful": ["afraid", "scared", "frightened", "fear", "terror", "panic", "horrified"],
            "disgust": ["disgust", "gross", "repulsed", "revolted", "sick", "nauseous"],
            "surprised": ["surprised", "shocked", "astonished", "amazed", "startled"],
            "sarcastic": ["yeah right", "sure thing", "oh great", "wow", "really", "obviously"]
        }

        # Count matches for each emotion
        emotion_scores = {emotion: 0 for emotion in EMOTIONS}
        emotion_scores["neutral"] = 1  # Default to neutral

        for emotion, keywords in emotion_keywords.items():
            for keyword in keywords:
                if keyword in text:
                    emotion_scores[emotion] += 1

        # Return the emotion with highest score
        max_emotion = max(emotion_scores, key=emotion_scores.get)

        # Convert to probabilities
        total = sum(emotion_scores.values())
        probs = {emotion: score / total for emotion, score in emotion_scores.items()}

        return max_emotion, probs

# Part 11: Complete Emotion Recognition Pipeline with Comprehensive Error Handling
class EmotionRecognitionPipeline:
    def __init__(self, acoustic_model_path=None, fusion_model_path=None):
        try:
            print("Initializing Improved Emotion Recognition Pipeline...")

            # Initialize transcriber
            self.transcriber = WhisperTranscriber()

            # Initialize text classifier
            self.text_classifier = TextEmotionClassifier()

            # Initialize feature extractor with improved features
            self.feature_extractor = AcousticFeatureExtractor()

            # Initialize rule-based audio classifier as fallback
            self.rule_based_audio = PretrainedAudioClassifier()

            # Initialize simple rule-based fallback
            self.rule_based = RuleBasedClassifier()

            # Define simple fusion strategy
            self.use_adaptive_fusion = False

            print("Improved Emotion Recognition Pipeline initialized successfully")
        except Exception as e:
            print(f"Error initializing pipeline: {e}")
            print("Some functionality may be limited")

    def predict(self, audio_path):
        results = {
            "transcription": "",
            "text_emotions": {emotion: 0.0 for emotion in EMOTIONS},
            "acoustic_emotions": {emotion: 0.0 for emotion in EMOTIONS},
            "final_emotions": {emotion: 0.0 for emotion in EMOTIONS},
            "predicted_emotion": "neutral",
            "is_sarcastic": False,
            "errors": []
        }

        # Default sarcasm indicators so the final step never sees an undefined value
        sarcasm_indicators = np.array([0.33, 0.34, 0.33])

        # Step 1: Transcribe audio
        try:
            transcription = self.transcriber.transcribe(audio_path)
            results["transcription"] = transcription
            print(f"Transcription: {transcription}")
        except Exception as e:
            error_msg = f"Failed to transcribe audio: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
            results["transcription"] = "Error: Could not transcribe audio"

        # Step 2: Analyze text emotions
        try:
            if results["transcription"].startswith("Error:"):
                # Skip text analysis if transcription failed
                text_emotions = np.ones(len(EMOTIONS)) / len(EMOTIONS)  # Equal probabilities
                sarcasm_indicators = np.array([0.33, 0.33, 0.33])

                # Try rule-based as fallback
                rule_emotion, rule_probs = self.rule_based.predict(results["transcription"])
                results["text_emotions"] = rule_probs
            else:
                text_emotions = self.text_classifier.predict_emotion(results["transcription"])
                sarcasm_indicators = self.text_classifier.detect_sarcasm(results["transcription"])

            # Format text emotions result
            results["text_emotions"] = {EMOTIONS[i]: float(text_emotions[i])
                                        for i in range(min(len(text_emotions), len(EMOTIONS)))}

            print(f"Text-based emotions: {results['text_emotions']}")
        except Exception as e:
            error_msg = f"Failed to analyze text emotions: {e}"
            print(error_msg)
            results["errors"].append(error_msg)

            # Use equal probabilities as fallback
            results["text_emotions"] = {emotion: 1.0 / len(EMOTIONS) for emotion in EMOTIONS}

        # Step 3: Use rule-based audio classifier instead of the untrained model
        try:
            # Get predictions from rule-based classifier
            audio_probs = self.rule_based_audio.predict(audio_path)

            # Format acoustic emotions result
            results["acoustic_emotions"] = {EMOTIONS[i]: float(audio_probs[i])
                                            for i in range(min(len(audio_probs), len(EMOTIONS)))}

            print(f"Acoustic-based emotions: {results['acoustic_emotions']}")
        except Exception as e:
            error_msg = f"Failed to predict acoustic emotions: {e}"
            print(error_msg)
            results["errors"].append(error_msg)

            # Use equal probabilities as fallback
            results["acoustic_emotions"] = {emotion: 1.0 / len(EMOTIONS) for emotion in EMOTIONS}
            audio_probs = np.ones(len(EMOTIONS)) / len(EMOTIONS)

        # Step 4: Improved fusion strategy - text-biased weighted average
        try:
            # Convert dictionaries to arrays
            text_array = np.array(list(results["text_emotions"].values()))
            audio_array = np.array(list(results["acoustic_emotions"].values()))

            # Calculate confidence scores
            text_confidence = 1.0 - np.std(text_array)  # Higher confidence if distribution is more certain
            audio_confidence = 1.0 - np.std(audio_array)

            # Bias toward text model since it's working better
            text_confidence *= 1.5  # Increase text confidence

            # Normalize confidences
            total_confidence = text_confidence + audio_confidence
            text_weight = text_confidence / total_confidence
            audio_weight = audio_confidence / total_confidence

            # Weighted average
            final_probs = (text_weight * text_array) + (audio_weight * audio_array)

            # Format final emotions
            results["final_emotions"] = {EMOTIONS[i]: float(final_probs[i])
                                         for i in range(len(EMOTIONS))}

            print(f"Fusion weights: Text={text_weight:.2f}, Audio={audio_weight:.2f}")
        except Exception as e:
            error_msg = f"Failed to fuse predictions: {e}"
            print(error_msg)
            results["errors"].append(error_msg)

            # Fallback to text-only predictions since they're more reliable
            results["final_emotions"] = results["text_emotions"]

        # Get predicted emotion
        try:
            emotion_values = list(results["final_emotions"].values())
            emotion_idx = np.argmax(emotion_values)
            predicted_emotion = EMOTIONS[emotion_idx]
            results["predicted_emotion"] = predicted_emotion

            # Check for sarcasm
            is_sarcastic = False
            if hasattr(sarcasm_indicators, "__len__") and len(sarcasm_indicators) > 0:
                if predicted_emotion in ["happy", "neutral"] and np.argmax(sarcasm_indicators) == 0:
                    is_sarcastic = True
                    results["predicted_emotion"] = "sarcastic"

            results["is_sarcastic"] = is_sarcastic
        except Exception as e:
            error_msg = f"Failed to determine final emotion: {e}"
            print(error_msg)
            results["errors"].append(error_msg)
            results["predicted_emotion"] = "neutral"  # Default fallback

        return results

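# Illustrative usage sketch: run the full pipeline on one local file and pull out
# the fields most callers need. The path is a placeholder; the first run downloads
# the Whisper and text models into MODEL_CACHE_DIR.
def _example_pipeline_run(audio_path="sample.wav"):
    pipeline = EmotionRecognitionPipeline()
    result = pipeline.predict(audio_path)
    return result["predicted_emotion"], result["is_sarcastic"], result["final_emotions"]
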
# Part 12: Example on sample audio (with better error handling)
def demo_on_sample_audio(pipeline, audio_path):
    if not os.path.exists(audio_path):
        print(f"Error: Audio file not found at {audio_path}")
        return

    print(f"Analyzing audio file: {audio_path}")

    try:
        # Predict emotion from audio
        result = pipeline.predict(audio_path)

        # Print results
        print("\n===== EMOTION ANALYSIS RESULTS =====")
        print(f"Transcription: {result['transcription']}")
        print(f"\nPredicted Emotion: {result['predicted_emotion'].upper()}")
        print(f"Is Sarcastic: {'Yes' if result['is_sarcastic'] else 'No'}")

        print("\nText-based Emotions:")
        for emotion, score in result['text_emotions'].items():
            print(f"  {emotion}: {score:.4f}")

        print("\nAcoustic-based Emotions:")
        for emotion, score in result['acoustic_emotions'].items():
            print(f"  {emotion}: {score:.4f}")

        print("\nFinal Fusion Emotions:")
        for emotion, score in result['final_emotions'].items():
            print(f"  {emotion}: {score:.4f}")

        if 'errors' in result and result['errors']:
            print("\nErrors encountered:")
            for error in result['errors']:
                print(f"  - {error}")

        # Plot results for visualization
        try:
            emotions = list(result['text_emotions'].keys())
            text_scores = list(result['text_emotions'].values())
            acoustic_scores = list(result['acoustic_emotions'].values())
            final_scores = list(result['final_emotions'].values())

            plt.figure(figsize=(12, 6))

            x = np.arange(len(emotions))
            width = 0.25

            plt.bar(x - width, text_scores, width, label='Text')
            plt.bar(x, acoustic_scores, width, label='Acoustic')
            plt.bar(x + width, final_scores, width, label='Final')

            plt.xlabel('Emotions')
            plt.ylabel('Probability')
            plt.title('Emotion Prediction Results')
            plt.xticks(x, emotions, rotation=45)
            plt.legend()

            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"Error creating visualization: {e}")

    except Exception as e:
        print(f"Error in demo: {e}")

# Part 13: Simplified dataset loading for RAVDESS dataset
def load_ravdess_sample():
    """
    Download a small sample from RAVDESS dataset for testing
    """
    # Create directory for sample data
    sample_dir = "./sample_data"
    os.makedirs(sample_dir, exist_ok=True)

    # Try to download a sample file
    try:
        import urllib.request

        # Example file from RAVDESS dataset (happy emotion)
        url = "https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24/Actor_01/03-01-01-01-01-01-01.wav"
        sample_path = os.path.join(sample_dir, "sample_happy.wav")

        if not os.path.exists(sample_path):
            print("Downloading sample audio file from RAVDESS dataset...")
            urllib.request.urlretrieve(url, sample_path)
            print(f"Downloaded sample to {sample_path}")
        else:
            print(f"Sample file already exists at {sample_path}")

        return sample_path
    except Exception as e:
        print(f"Error downloading RAVDESS sample: {e}")
        return None

# Part 14: Simplified main function with proper error handling
def main():
    print("Starting Tone Classification System...")

    try:
        # Create the pipeline
        pipeline = EmotionRecognitionPipeline()

        # Try to load a sample file
        sample_audio_path = load_ravdess_sample()

        if sample_audio_path and os.path.exists(sample_audio_path):
            demo_on_sample_audio(pipeline, sample_audio_path)
        else:
            print("\nNo sample audio file available.")
            print("To use the system, provide an audio file path when calling the demo_on_sample_audio function:")
            print("\ndemo_on_sample_audio(pipeline, '/path/to/your/audio.wav')")

    except Exception as e:
        print(f"Error in main execution: {e}")
        print("\nTroubleshooting tips:")
        print("1. Check if your audio file exists and is in a supported format (WAV recommended)")
        print("2. Ensure you have sufficient memory for model loading")
        print("3. Try with a smaller model size in WhisperTranscriber (tiny instead of base)")
        print("4. Make sure you have a stable internet connection for model downloading")

if __name__ == "__main__":
    main()


# Add this after the main() function definition but before the if __name__ == "__main__": line
def upload_and_analyze():
    from IPython.display import display
    import ipywidgets as widgets

    # Create upload widget
    upload_widget = widgets.FileUpload(
        accept='.wav, .mp3',
        multiple=False,
        description='Upload Audio File',
        button_style='primary'
    )
    display(upload_widget)

    # Create button to trigger analysis
    analyze_button = widgets.Button(description='Analyze Audio')
    display(analyze_button)

    # Create output area for results
    output = widgets.Output()
    display(output)

    def on_analyze_click(b):
        with output:
            output.clear_output()
            if not upload_widget.value:
                print("Please upload an audio file first.")
                return

            # Get the uploaded file
            file_data = next(iter(upload_widget.value.values()))
            file_name = next(iter(upload_widget.value.keys()))

            # Save to temp file
            temp_file = f"./temp_{file_name}"
            with open(temp_file, 'wb') as f:
                f.write(file_data['content'])

            print(f"Analyzing uploaded file: {file_name}")

            # Create pipeline and analyze
            pipeline = EmotionRecognitionPipeline()
            demo_on_sample_audio(pipeline, temp_file)

    analyze_button.on_click(on_analyze_click)

# Then modify the if __name__ == "__main__": section
if __name__ == "__main__":
    try:
        import ipywidgets
        # If ipywidgets is available, we're in a notebook
        print("Running in notebook mode - use the upload widget below:")
        upload_and_analyze()
    except ImportError:
        # Otherwise, run the standard main function
        main()

import os
import numpy as np
import torch
import matplotlib.pyplot as plt
import gradio as gr
from io import BytesIO

# Use the existing EmotionRecognitionPipeline class from your code

def analyze_audio(audio_path):
    """
    Analyze an audio file and return the emotion recognition results
    """
    if audio_path is None:
        return "Please provide an audio file.", None, None

    try:
        # Create the pipeline
        pipeline = EmotionRecognitionPipeline()

        # Predict emotion from audio
        result = pipeline.predict(audio_path)

        # Format the results for display
        transcription = result['transcription']
        predicted_emotion = result['predicted_emotion'].upper()
        is_sarcastic = 'Yes' if result['is_sarcastic'] else 'No'

        # Create text summary
        summary = f"Transcription: {transcription}\n\n"
        summary += f"Predicted Emotion: {predicted_emotion}\n"
        summary += f"Is Sarcastic: {is_sarcastic}\n\n"

        summary += "Text-based Emotions:\n"
        for emotion, score in result['text_emotions'].items():
            summary += f"  {emotion}: {score:.4f}\n"

        summary += "\nAcoustic-based Emotions:\n"
        for emotion, score in result['acoustic_emotions'].items():
            summary += f"  {emotion}: {score:.4f}\n"

        summary += "\nFinal Fusion Emotions:\n"
        for emotion, score in result['final_emotions'].items():
            summary += f"  {emotion}: {score:.4f}\n"

        if 'errors' in result and result['errors']:
            summary += "\nErrors encountered:\n"
            for error in result['errors']:
                summary += f"  - {error}\n"

        # Create visualization
        fig = create_emotion_plot(result)

        return summary, fig, result['predicted_emotion']
    except Exception as e:
        return f"Error analyzing audio: {str(e)}", None, "error"

def create_emotion_plot(result):
    """
    Create a visualization of the emotion recognition results
    """
    emotions = list(result['text_emotions'].keys())
    text_scores = list(result['text_emotions'].values())
    acoustic_scores = list(result['acoustic_emotions'].values())
    final_scores = list(result['final_emotions'].values())

    fig = plt.figure(figsize=(10, 6))

    x = np.arange(len(emotions))
    width = 0.25

    plt.bar(x - width, text_scores, width, label='Text')
    plt.bar(x, acoustic_scores, width, label='Acoustic')
    plt.bar(x + width, final_scores, width, label='Final')

    plt.xlabel('Emotions')
    plt.ylabel('Probability')
    plt.title('Emotion Recognition Results')
    plt.xticks(x, emotions, rotation=45)
    plt.legend()
    plt.tight_layout()

    return fig

# Create the Gradio interface with tabs for microphone and file upload
def create_gradio_interface():
    with gr.Blocks(title="Tone Classification System") as demo:
        gr.Markdown("# Tone Classification System")
        gr.Markdown("This system analyzes audio to detect emotions, including sarcasm and figures of speech.")

        with gr.Tabs():
            with gr.TabItem("Microphone Input"):
                with gr.Row():
                    with gr.Column():
                        audio_input = gr.Audio(
                            sources=["microphone"],
                            type="filepath",
                            label="Record your voice"
                        )
                        analyze_btn = gr.Button("Analyze Recording", variant="primary")

                    with gr.Column():
                        result_text = gr.Textbox(label="Analysis Results", lines=15)
                        emotion_plot = gr.Plot(label="Emotion Probabilities")
                        emotion_label = gr.Label(label="Detected Emotion")

                analyze_btn.click(
                    fn=analyze_audio,
                    inputs=audio_input,
                    outputs=[result_text, emotion_plot, emotion_label]
                )

            with gr.TabItem("File Upload"):
                with gr.Row():
                    with gr.Column():
                        file_input = gr.Audio(
                            sources=["upload"],
                            type="filepath",
                            label="Upload audio file (.wav, .mp3)"
                        )
                        file_analyze_btn = gr.Button("Analyze File", variant="primary")

                    with gr.Column():
                        file_result_text = gr.Textbox(label="Analysis Results", lines=15)
                        file_emotion_plot = gr.Plot(label="Emotion Probabilities")
                        file_emotion_label = gr.Label(label="Detected Emotion")

                file_analyze_btn.click(
                    fn=analyze_audio,
                    inputs=file_input,
                    outputs=[file_result_text, file_emotion_plot, file_emotion_label]
                )

        gr.Markdown("## How to Use")
        gr.Markdown("""
        1. **Microphone Input**: Record your voice and click 'Analyze Recording'
        2. **File Upload**: Upload an audio file (.wav or .mp3) and click 'Analyze File'

        The system will transcribe the speech, analyze emotions from both text and acoustic features,
        and display the results with a visualization of emotion probabilities.
        """)

        gr.Markdown("## About")
        gr.Markdown("""
        This tone classification system combines text and acoustic features to detect emotions in speech.
        It uses a multi-modal approach with:

        - Speech-to-text transcription
        - Text-based emotion analysis
        - Acoustic feature extraction
        - Fusion of both modalities for final prediction

        The system can detect: neutral, happy, sad, angry, fearful, disgust, surprised, and sarcastic tones.
        """)

    return demo

# Main function to launch the Gradio interface
def main():
    demo = create_gradio_interface()
    demo.launch()

if __name__ == "__main__":
    main()