MaroofTechSorcerer committed
Commit 9464f08 · verified · Parent(s): 64c8a12

Update app.py

Files changed (1): app.py (+38 -66)
app.py CHANGED
@@ -13,9 +13,6 @@ import time
 import numpy as np
 import librosa
 import subprocess
-import pyaudio
-import wave
-import io
 
 # Suppress warnings for a clean console
 logging.getLogger("torch").setLevel(logging.CRITICAL)
@@ -32,7 +29,7 @@ st.set_page_config(layout="wide", page_title="Advanced Voice Emotion Analyzer")
 
 # Interface design
 st.title("🎙️ Advanced Voice Emotion Analyzer")
-st.write("Analyze all emotions from audio using hybrid ML models, ensuring accurate detection across 27 emotions.")
+st.write("Analyze all 27 emotions from uploaded audio with enhanced detection to avoid neutral defaults.")
 
 # Audio Preprocessing
 def make_audio_scarier(audio_path, output_path):
@@ -83,14 +80,27 @@ def perform_audio_emotion_detection(audio_path):
         audio_emotions = ["neutral", "happy", "sad", "angry", "fearful", "surprise", "disgust"]
         emotion_dict = {emotion: float(scores[i]) for i, emotion in enumerate(audio_emotions)}
         top_emotion = audio_emotions[np.argmax(scores)]
-        # Boost emotions for audio characteristics
+        # Enhanced boosting based on audio features
         features = extract_audio_features(audio_path)
         if features.get("pitch_mean", 0) < 200 and features.get("energy_mean", 0) > 0.1 and features.get("zcr_mean", 0) > 0.1:
-            emotion_dict["fearful"] = min(1.0, emotion_dict.get("fearful", 0) + 0.3)
+            emotion_dict["fearful"] = min(1.0, emotion_dict.get("fearful", 0) + 0.4)  # Increased boost
             top_emotion = "fearful" if emotion_dict["fearful"] > emotion_dict[top_emotion] else top_emotion
-        elif features.get("energy_mean", 0) > 0.2:
-            emotion_dict["angry"] = min(1.0, emotion_dict.get("angry", 0) + 0.2)
+        elif features.get("energy_mean", 0) > 0.25:  # Stricter threshold
+            emotion_dict["angry"] = min(1.0, emotion_dict.get("angry", 0) + 0.35)
             top_emotion = "angry" if emotion_dict["angry"] > emotion_dict[top_emotion] else top_emotion
+        elif features.get("pitch_mean", 0) > 500 and features.get("energy_mean", 0) < 0.05:
+            emotion_dict["sad"] = min(1.0, emotion_dict.get("sad", 0) + 0.3)
+            top_emotion = "sad" if emotion_dict["sad"] > emotion_dict[top_emotion] else top_emotion
+        elif features.get("energy_mean", 0) > 0.15 and features.get("pitch_mean", 0) > 300:
+            emotion_dict["happy"] = min(1.0, emotion_dict.get("happy", 0) + 0.3)
+            top_emotion = "happy" if emotion_dict["happy"] > emotion_dict[top_emotion] else top_emotion
+        elif features.get("zcr_mean", 0) > 0.15 and features.get("energy_mean", 0) > 0.1:
+            emotion_dict["surprise"] = min(1.0, emotion_dict.get("surprise", 0) + 0.25)
+            top_emotion = "surprise" if emotion_dict["surprise"] > emotion_dict[top_emotion] else top_emotion
+        # Fallback to avoid neutral if score is low
+        if emotion_dict["neutral"] > 0.5 and max([v for k, v in emotion_dict.items() if k != "neutral"]) > 0.3:
+            emotion_dict["neutral"] = max(0.0, emotion_dict["neutral"] - 0.2)  # Reduce neutral weight
+            top_emotion = max(emotion_dict, key=emotion_dict.get)
         return emotion_dict, top_emotion
     except Exception as e:
         st.error(f"Audio emotion detection failed: {str(e)}")
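The thresholds in this hunk are heuristics over three features. As a standalone sketch (assuming extract_audio_features returns pitch_mean, energy_mean, and zcr_mean as floats, which the diff implies but does not show; boost_emotions is a hypothetical helper, not a function in app.py), the elif chain reads as a first-match rule table:

```python
# Hedged sketch of the feature-based boosting above. Unlike the diff, which
# conditionally updates top_emotion per rule, this recomputes the max once at
# the end; the outcome is the same when a single score is raised.
def boost_emotions(emotion_dict, features):
    rules = [
        # (condition, emotion to boost, boost amount)
        (features.get("pitch_mean", 0) < 200 and features.get("energy_mean", 0) > 0.1
         and features.get("zcr_mean", 0) > 0.1, "fearful", 0.4),
        (features.get("energy_mean", 0) > 0.25, "angry", 0.35),
        (features.get("pitch_mean", 0) > 500 and features.get("energy_mean", 0) < 0.05, "sad", 0.3),
        (features.get("energy_mean", 0) > 0.15 and features.get("pitch_mean", 0) > 300, "happy", 0.3),
        (features.get("zcr_mean", 0) > 0.15 and features.get("energy_mean", 0) > 0.1, "surprise", 0.25),
    ]
    for condition, emotion, boost in rules:
        if condition:
            emotion_dict[emotion] = min(1.0, emotion_dict.get(emotion, 0) + boost)
            break  # only the first matching rule fires, like the elif chain
    return max(emotion_dict, key=emotion_dict.get)

scores = {"neutral": 0.5, "happy": 0.08, "sad": 0.07, "angry": 0.15,
          "fearful": 0.15, "surprise": 0.03, "disgust": 0.02}
top = boost_emotions(scores, {"pitch_mean": 150.0, "energy_mean": 0.12, "zcr_mean": 0.12})
print(top, scores["fearful"])  # fearful 0.55 (0.15 + 0.4 now beats neutral at 0.5)
```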
@@ -114,6 +124,10 @@ def perform_text_emotion_detection(text):
                     "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"]
         emotions_dict = {result['label']: result['score'] for result in results if result['label'] in emotions}
         top_emotion = max(emotions_dict, key=emotions_dict.get)
+        # Reduce neutral influence if other emotions are strong
+        if emotions_dict.get("neutral", 0) > 0.5 and max([v for k, v in emotions_dict.items() if k != "neutral"]) > 0.4:
+            emotions_dict["neutral"] = max(0.0, emotions_dict["neutral"] - 0.15)
+            top_emotion = max(emotions_dict, key=emotions_dict.get)
         return emotions_dict, top_emotion
     except Exception as e:
         st.error(f"Text emotion detection failed: {str(e)}")
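This is the same neutral-damping pattern as the audio path, with a smaller step (0.15 instead of 0.2). A quick worked example with illustrative scores:

```python
# Worked example of the neutral-damping rule above (scores are made up).
emotions_dict = {"neutral": 0.55, "annoyance": 0.42, "joy": 0.03}
top_emotion = max(emotions_dict, key=emotions_dict.get)  # "neutral" before damping

# Condition: neutral dominates (> 0.5) while a non-neutral emotion is strong (> 0.4).
if emotions_dict.get("neutral", 0) > 0.5 and max(v for k, v in emotions_dict.items() if k != "neutral") > 0.4:
    emotions_dict["neutral"] = max(0.0, emotions_dict["neutral"] - 0.15)  # 0.55 -> 0.40
    top_emotion = max(emotions_dict, key=emotions_dict.get)

print(top_emotion)  # "annoyance": 0.42 now beats the damped neutral at 0.40
```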
@@ -169,50 +183,12 @@ def transcribe_audio(audio_path):
         st.error(f"Transcription failed: {str(e)}")
         return ""
 
-# Python Audio Recording
-def record_audio():
-    CHUNK = 1024
-    FORMAT = pyaudio.paInt16
-    CHANNELS = 1
-    RATE = 16000
-    RECORD_SECONDS = st.slider("Recording duration (seconds)", 1, 30, 5)
-
-    p = pyaudio.PyAudio()
-    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
-
-    if st.button("Start Recording"):
-        st.write("Recording...")
-        frames = []
-        for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
-            data = stream.read(CHUNK)
-            frames.append(data)
-        st.write("Recording finished.")
-
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        temp_file_path = os.path.join(tempfile.gettempdir(), f"recorded_audio_{int(time.time())}.wav")
-        wf = wave.open(temp_file_path, 'wb')
-        wf.setnchannels(CHANNELS)
-        wf.setsampwidth(p.get_sample_size(FORMAT))
-        wf.setframerate(RATE)
-        wf.writeframes(b''.join(frames))
-        wf.close()
-
-        return temp_file_path
-    return None
-
 # Process Audio Files
 def process_audio_file(audio_data):
     temp_dir = tempfile.gettempdir()
     temp_file_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")
     with open(temp_file_path, "wb") as f:
-        if isinstance(audio_data, str):
-            with open(audio_data, "rb") as f_audio:
-                f.write(f_audio.read())
-        else:
-            f.write(audio_data.getvalue())
+        f.write(audio_data.getvalue())
     if not validate_audio(temp_file_path):
         return None
     return temp_file_path
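With record_audio() removed, process_audio_file only ever receives a Streamlit UploadedFile, so dropping the isinstance branch is safe and getvalue() suffices. One possible hardening, sketched below (save_uploaded_audio is a hypothetical helper, not a function in app.py): tempfile.NamedTemporaryFile avoids the name collisions an audio_{int(time.time())}.wav name can hit when two uploads land in the same second.

```python
import tempfile

def save_uploaded_audio(uploaded_file) -> str:
    """Persist a Streamlit UploadedFile to disk and return the file path.

    Sketch only: NamedTemporaryFile(delete=False) gives each upload a unique
    name; the app would still run validate_audio on the returned path.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())  # UploadedFile keeps the upload in memory
        return tmp.name
```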
@@ -230,6 +206,7 @@ def display_analysis_results(audio_path):
     audio_emotions, audio_top_emotion = perform_audio_emotion_detection(processed_audio_path)
     st.subheader("Audio-Based Emotion")
     st.write(f"**Dominant Emotion:** {audio_top_emotion} (Score: {audio_emotions.get(audio_top_emotion, 0):.3f})")
+    st.write("Audio Emotions:", audio_emotions)  # Debug output
 
     # Transcription and text emotion detection
     transcribed_text = transcribe_audio(processed_audio_path)
@@ -238,6 +215,7 @@ def display_analysis_results(audio_path):
     if transcribed_text:
         text_emotions, text_top_emotion = perform_text_emotion_detection(transcribed_text)
         st.write(f"**Text-Based Dominant Emotion:** {text_top_emotion} (Score: {text_emotions.get(text_top_emotion, 0):.3f})")
+        st.write("Text Emotions:", text_emotions)  # Debug output
 
     # Combine emotions (prioritize audio, map to 27 emotions)
     emotion_map = {
@@ -256,7 +234,12 @@ def display_analysis_results(audio_path):
     for text_emotion, score in text_emotions.items():
         combined_emotions[text_emotion] = combined_emotions.get(text_emotion, 0) + score * 0.3
 
+    # Avoid neutral if other emotions are competitive
     top_emotion = max(combined_emotions, key=combined_emotions.get)
+    if combined_emotions["neutral"] > 0.5 and max([v for k, v in combined_emotions.items() if k != "neutral"]) > 0.4:
+        combined_emotions["neutral"] = max(0.0, combined_emotions["neutral"] - 0.25)  # Stronger reduction
+        top_emotion = max(combined_emotions, key=combined_emotions.get)
+
     sentiment = "POSITIVE" if top_emotion in ["admiration", "amusement", "approval", "caring", "desire", "excitement",
                                               "gratitude", "joy", "love", "optimism", "pride", "relief"] else "NEGATIVE" if top_emotion in ["anger", "annoyance", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"] else "NEUTRAL"
 
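With text scores weighted at 0.3 on top of the already-mapped audio scores, the damping step decides close calls. A worked example with illustrative numbers, mirroring this hunk:

```python
# Illustrative run of the combine-and-damp sequence above. The audio scores
# are assumed to be pre-mapped into the 27-label space via emotion_map (not
# shown in this hunk); all numbers are made up.
combined_emotions = {"fear": 0.45, "neutral": 0.55}               # from the audio path
text_emotions = {"fear": 0.30, "annoyance": 0.20, "neutral": 0.10}

for text_emotion, score in text_emotions.items():
    combined_emotions[text_emotion] = combined_emotions.get(text_emotion, 0) + score * 0.3
# fear: 0.54, annoyance: 0.06, neutral: 0.58 -> neutral leads before damping

top_emotion = max(combined_emotions, key=combined_emotions.get)
if combined_emotions["neutral"] > 0.5 and max(v for k, v in combined_emotions.items() if k != "neutral") > 0.4:
    combined_emotions["neutral"] = max(0.0, combined_emotions["neutral"] - 0.25)
    top_emotion = max(combined_emotions, key=combined_emotions.get)

print(top_emotion)  # "fear": 0.54 beats the damped neutral at 0.33
```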
@@ -287,7 +270,7 @@ def display_analysis_results(audio_path):
     st.write(f"**Audio Features:** {extract_audio_features(processed_audio_path)}")
     st.write("""
     **How it works:**
-    - Audio Emotion: Wav2Vec2 detects 7 emotions from audio.
+    - Audio Emotion: Wav2Vec2 detects 7 emotions with feature-based boosts.
     - Transcription: Whisper converts audio to text.
     - Text Emotion: RoBERTa refines 27 emotions from text.
     - Sarcasm: Analyzes text for irony.
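For reference, the text stage in this list corresponds to a standard transformers pipeline call. A minimal sketch using the model ID from the About section below (top_k=None returns scores for every GoEmotions label, 27 emotions plus neutral):

```python
from transformers import pipeline

# Minimal sketch of the text-emotion stage; the model ID comes from the
# app's About sidebar. top_k=None makes the pipeline return scores for all
# labels instead of only the top one.
classifier = pipeline("text-classification",
                      model="SamLowe/roberta-base-go_emotions", top_k=None)

results = classifier("I can't believe you did that behind my back.")[0]
emotions_dict = {r["label"]: r["score"] for r in results}
print(max(emotions_dict, key=emotions_dict.get))
```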
@@ -301,25 +284,13 @@ def display_analysis_results(audio_path):
 
 # Main App Logic
 def main():
-    tab1, tab2 = st.tabs(["📁 Upload Audio", "🎙️ Record Audio"])
-
-    with tab1:
-        st.header("Upload Audio File")
-        audio_file = st.file_uploader("Upload audio (wav, mp3, ogg)", type=["wav", "mp3", "ogg"])
-        if audio_file:
-            temp_audio_path = process_audio_file(audio_file)
-            if temp_audio_path:
-                if st.button("Analyze Upload"):
-                    with st.spinner("Analyzing..."):
-                        display_analysis_results(temp_audio_path)
-
-    with tab2:
-        st.header("Record Your Voice")
-        st.write("Record audio to analyze emotions in real-time.")
-        temp_audio_path = record_audio()
+    st.header("Upload Audio File")
+    audio_file = st.file_uploader("Upload audio (wav, mp3, ogg)", type=["wav", "mp3", "ogg"])
+    if audio_file:
+        temp_audio_path = process_audio_file(audio_file)
         if temp_audio_path:
-            if st.button("Analyze Recording"):
-                with st.spinner("Processing..."):
+            if st.button("Analyze Audio"):
+                with st.spinner("Analyzing..."):
                     display_analysis_results(temp_audio_path)
 
     st.sidebar.header("About")
@@ -329,6 +300,7 @@ def main():
     - Text: SamLowe/roberta-base-go_emotions (27 emotions)
     - Sarcasm: cardiffnlp/twitter-roberta-base-irony
    - Speech: OpenAI Whisper (large-v3)
+    **Note:** Recording is not supported on Hugging Face Spaces; use uploaded files.
     """)
 
 if __name__ == "__main__":
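If recording is wanted back without pyaudio (Spaces containers have no server-side audio device), browser capture is one route: recent Streamlit releases ship st.audio_input, which returns an UploadedFile-like object that should drop straight into process_audio_file. A sketch, assuming a Streamlit version that includes the widget (verify against the version pinned for this Space):

```python
import streamlit as st

# Hedged sketch: st.audio_input records in the browser and returns a WAV as
# an UploadedFile-like object, so no server-side audio device is needed.
# Only available in recent Streamlit releases; check before relying on it.
recorded = st.audio_input("Record your voice")
if recorded is not None:
    temp_audio_path = process_audio_file(recorded)  # same path as uploads
    if temp_audio_path and st.button("Analyze Recording"):
        with st.spinner("Analyzing..."):
            display_analysis_results(temp_audio_path)
```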
 