MaroofTechSorcerer committed on
Commit f6d1ff0 · verified · 1 Parent(s): 48302e7

Update app.py

Files changed (1)
  1. app.py +227 -446
app.py CHANGED
@@ -3,16 +3,19 @@ import streamlit as st
  import tempfile
  import torch
  import transformers
- from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
  import plotly.express as px
  import logging
  import warnings
  import whisper
  from pydub import AudioSegment
  import time
- import base64
  import io
- import streamlit.components.v1 as components

  # Suppress warnings for a clean console
  logging.getLogger("torch").setLevel(logging.CRITICAL)
@@ -25,100 +28,98 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")

  # Set Streamlit app layout
- st.set_page_config(layout="wide", page_title="Voice Based Sentiment Analysis")

  # Interface design
- st.title("🎙️ Voice Based Sentiment Analysis")
- st.write("Detect emotions, sentiment, and sarcasm from your voice with state-of-the-art accuracy using OpenAI Whisper.")

- # Emotion Detection Function
  @st.cache_resource
- def get_emotion_classifier():
      tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", use_fast=True)
      model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")
      model = model.to(device)
      return pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None, device=-1 if device.type == "cpu" else 0)

- def perform_emotion_detection(text):
      try:
-         if not text or len(text.strip()) < 3:
-             return {}, "neutral", {}, "NEUTRAL"
-
-         emotion_classifier = get_emotion_classifier()
-         emotion_results = emotion_classifier(text)[0]
-
-         emotion_map = {
-             "admiration": "🤩", "amusement": "😄", "anger": "😡", "annoyance": "😒",
-             "approval": "👍", "caring": "🤗", "confusion": "😕", "curiosity": "🧐",
-             "desire": "😍", "disappointment": "😞", "disapproval": "👎", "disgust": "🤢",
-             "embarrassment": "😳", "excitement": "🤩", "fear": "😨", "gratitude": "🙏",
-             "grief": "😢", "joy": "😊", "love": "❤️", "nervousness": "😰",
-             "optimism": "🌈", "pride": "😌", "realization": "💡", "relief": "😌",
-             "remorse": "😔", "sadness": "😭", "surprise": "😲", "neutral": "😐"
-         }
-
-         positive_emotions = ["admiration", "amusement", "approval", "caring", "desire",
-                              "excitement", "gratitude", "joy", "love", "optimism", "pride", "relief"]
-         negative_emotions = ["anger", "annoyance", "disappointment", "disapproval", "disgust",
-                              "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"]
-         neutral_emotions = ["confusion", "curiosity", "realization", "surprise", "neutral"]
-
-         # Fix 1: Create a clean emotions dictionary from results
-         emotions_dict = {}
-         for result in emotion_results:
-             emotions_dict[result['label']] = result['score']
-
-         # Fix 2: Filter out very low scores (below threshold)
-         filtered_emotions = {k: v for k, v in emotions_dict.items() if v > 0.05}
-
-         # If filtered dictionary is empty, fall back to original
-         if not filtered_emotions:
-             filtered_emotions = emotions_dict
-
-         # Fix 3: Make sure we properly find the top emotion
-         top_emotion = max(filtered_emotions, key=filtered_emotions.get)
-         top_score = filtered_emotions[top_emotion]
-
-         # Fix 4: More robust sentiment assignment
-         if top_emotion in positive_emotions:
-             sentiment = "POSITIVE"
-         elif top_emotion in negative_emotions:
-             sentiment = "NEGATIVE"
-         else:
-             # If the top emotion is neutral but there are strong competing emotions, use them
-             competing_emotions = sorted(filtered_emotions.items(), key=lambda x: x[1], reverse=True)[:3]
-
-             # Check if there's a close second non-neutral emotion
-             if len(competing_emotions) > 1:
-                 if (competing_emotions[0][0] in neutral_emotions and
-                     competing_emotions[1][0] not in neutral_emotions and
-                     competing_emotions[1][1] > 0.7 * competing_emotions[0][1]):
-                     # Use the second strongest emotion instead
-                     top_emotion = competing_emotions[1][0]
-                     if top_emotion in positive_emotions:
-                         sentiment = "POSITIVE"
-                     elif top_emotion in negative_emotions:
-                         sentiment = "NEGATIVE"
-                     else:
-                         sentiment = "NEUTRAL"
-                 else:
-                     sentiment = "NEUTRAL"
-             else:
-                 sentiment = "NEUTRAL"
-
-         # Log for debugging
-         print(f"Text: {text[:50]}...")
-         print(f"Top 3 emotions: {sorted(filtered_emotions.items(), key=lambda x: x[1], reverse=True)[:3]}")
-         print(f"Selected top emotion: {top_emotion} ({filtered_emotions.get(top_emotion, 0):.3f})")
-         print(f"Sentiment determined: {sentiment}")
-
-         return emotions_dict, top_emotion, emotion_map, sentiment
      except Exception as e:
-         st.error(f"Emotion detection failed: {str(e)}")
-         print(f"Exception in emotion detection: {str(e)}")
-         return {}, "neutral", {}, "NEUTRAL"

- # Sarcasm Detection Function
  @st.cache_resource
  def get_sarcasm_classifier():
      tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-irony", use_fast=True)
@@ -128,11 +129,8 @@ def get_sarcasm_classifier():

  def perform_sarcasm_detection(text):
      try:
-         if not text or len(text.strip()) < 3:
-             return False, 0.0
-
-         sarcasm_classifier = get_sarcasm_classifier()
-         result = sarcasm_classifier(text)[0]
          is_sarcastic = result['label'] == "LABEL_1"
          sarcasm_score = result['score'] if is_sarcastic else 1 - result['score']
          return is_sarcastic, sarcasm_score
@@ -140,415 +138,198 @@ def perform_sarcasm_detection(text):
          st.error(f"Sarcasm detection failed: {str(e)}")
          return False, 0.0

- # Validate audio quality
  def validate_audio(audio_path):
      try:
          sound = AudioSegment.from_file(audio_path)
-         if sound.dBFS < -50:
-             st.warning("Audio volume is too low. Please record or upload a louder audio.")
-             return False
-         if len(sound) < 1000: # Less than 1 second
-             st.warning("Audio is too short. Please record a longer audio.")
              return False
          return True
-     except:
          st.error("Invalid or corrupted audio file.")
          return False

  # Speech Recognition with Whisper
  @st.cache_resource
  def load_whisper_model():
-     # Use 'large-v3' for maximum accuracy
-     model = whisper.load_model("large-v3")
-     return model

- def transcribe_audio(audio_path, show_alternative=False):
      try:
-         st.write(f"Processing audio file: {audio_path}")
          sound = AudioSegment.from_file(audio_path)
-         st.write(f"Audio duration: {len(sound)/1000:.2f}s, Sample rate: {sound.frame_rate}, Channels: {sound.channels}")
-
-         # Convert to WAV format (16kHz, mono) for Whisper
          temp_wav_path = os.path.join(tempfile.gettempdir(), "temp_converted.wav")
-         sound = sound.set_frame_rate(16000)
-         sound = sound.set_channels(1)
          sound.export(temp_wav_path, format="wav")
-
-         # Load Whisper model
          model = load_whisper_model()
-
-         # Transcribe audio
          result = model.transcribe(temp_wav_path, language="en")
-         main_text = result["text"].strip()
-
-         # Clean up
-         if os.path.exists(temp_wav_path):
-             os.remove(temp_wav_path)
-
-         # Whisper doesn't provide alternatives, so return empty list
-         if show_alternative:
-             return main_text, []
-         return main_text
      except Exception as e:
          st.error(f"Transcription failed: {str(e)}")
-         return "", [] if show_alternative else ""

- # Function to handle uploaded audio files
- def process_uploaded_audio(audio_file):
-     if not audio_file:
-         return None

-     try:
-         temp_dir = tempfile.gettempdir()
-         temp_file_path = os.path.join(temp_dir, f"uploaded_audio_{int(time.time())}.wav")

-         with open(temp_file_path, "wb") as f:
-             f.write(audio_file.getvalue())
-
-         if not validate_audio(temp_file_path):
-             return None
-
          return temp_file_path
-     except Exception as e:
-         st.error(f"Error processing uploaded audio: {str(e)}")
          return None

- # Show model information
- def show_model_info():
-     st.sidebar.header("🧠 About the Models")
-
-     model_tabs = st.sidebar.tabs(["Emotion", "Sarcasm", "Speech"])
-
-     with model_tabs[0]:
-         st.markdown("""
-         **Emotion Model**: SamLowe/roberta-base-go_emotions
-         - Fine-tuned on GoEmotions dataset (58k Reddit comments, 27 emotions)
-         - Architecture: RoBERTa base
-         - Micro-F1: 0.46
-         [🔍 Model Hub](https://huggingface.co/SamLowe/roberta-base-go_emotions)
-         """)
-
-     with model_tabs[1]:
-         st.markdown("""
-         **Sarcasm Model**: cardiffnlp/twitter-roberta-base-irony
-         - Trained on SemEval-2018 Task 3 (Twitter irony dataset)
-         - Architecture: RoBERTa base
-         - F1-score: 0.705
-         [🔍 Model Hub](https://huggingface.co/cardiffnlp/twitter-roberta-base-irony)
-         """)
-
-     with model_tabs[2]:
-         st.markdown("""
-         **Speech Recognition**: OpenAI Whisper (large-v3)
-         - State-of-the-art model for speech-to-text
-         - Accuracy: ~5-10% WER on clean English audio
-         - Robust to noise, accents, and varied conditions
-         - Runs locally, no internet required
-         **Tips**: Use good mic, reduce noise, speak clearly
-         [🔍 Model Details](https://github.com/openai/whisper)
-         """)

- # Custom audio recorder using HTML/JS
- def custom_audio_recorder():
-     audio_recorder_html = """
-     <script>
-     var audioRecorder = {
-         audioBlobs: [],
-         mediaRecorder: null,
-         streamBeingCaptured: null,
-         start: function() {
-             if (!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)) {
-                 return Promise.reject(new Error('mediaDevices API or getUserMedia method is not supported in this browser.'));
-             }
-             else {
-                 return navigator.mediaDevices.getUserMedia({ audio: true })
-                     .then(stream => {
-                         audioRecorder.streamBeingCaptured = stream;
-                         audioRecorder.mediaRecorder = new MediaRecorder(stream);
-                         audioRecorder.audioBlobs = [];
-
-                         audioRecorder.mediaRecorder.addEventListener("dataavailable", event => {
-                             audioRecorder.audioBlobs.push(event.data);
-                         });
-
-                         audioRecorder.mediaRecorder.start();
-                     });
-             }
-         },
-         stop: function() {
-             return new Promise(resolve => {
-                 let mimeType = audioRecorder.mediaRecorder.mimeType;
-
-                 audioRecorder.mediaRecorder.addEventListener("stop", () => {
-                     let audioBlob = new Blob(audioRecorder.audioBlobs, { type: mimeType });
-                     resolve(audioBlob);
-                 });
-
-                 audioRecorder.mediaRecorder.stop();
-
-                 audioRecorder.stopStream();
-                 audioRecorder.resetRecordingProperties();
-             });
-         },
-         stopStream: function() {
-             audioRecorder.streamBeingCaptured.getTracks()
-                 .forEach(track => track.stop());
-         },
-         resetRecordingProperties: function() {
-             audioRecorder.mediaRecorder = null;
-             audioRecorder.streamBeingCaptured = null;
-         }
-     }
-     var isRecording = false;
-     var recordButton = document.getElementById('record-button');
-     var audioElement = document.getElementById('audio-playback');
-     var audioData = document.getElementById('audio-data');
-
-     function toggleRecording() {
-         if (!isRecording) {
-             audioRecorder.start()
-                 .then(() => {
-                     isRecording = true;
-                     recordButton.textContent = 'Stop Recording';
-                     recordButton.classList.add('recording');
-                 })
-                 .catch(error => {
-                     alert('Error starting recording: ' + error.message);
-                 });
-         } else {
-             audioRecorder.stop()
-                 .then(audioBlob => {
-                     const audioUrl = URL.createObjectURL(audioBlob);
-                     audioElement.src = audioUrl;
-
-                     const reader = new FileReader();
-                     reader.readAsDataURL(audioBlob);
-                     reader.onloadend = function() {
-                         const base64data = reader.result;
-                         audioData.value = base64data;
-                         const streamlitMessage = {type: "streamlit:setComponentValue", value: base64data};
-                         window.parent.postMessage(streamlitMessage, "*");
-                     }
-
-                     isRecording = false;
-                     recordButton.textContent = 'Start Recording';
-                     recordButton.classList.remove('recording');
-                 });
-         }
-     }
-     document.addEventListener('DOMContentLoaded', function() {
-         recordButton = document.getElementById('record-button');
-         audioElement = document.getElementById('audio-playback');
-         audioData = document.getElementById('audio-data');
-
-         recordButton.addEventListener('click', toggleRecording);
-     });
-     </script>
-     <div class="audio-recorder-container">
-         <button id="record-button" class="record-button">Start Recording</button>
-         <audio id="audio-playback" controls style="display:block; margin-top:10px;"></audio>
-         <input type="hidden" id="audio-data" name="audio-data">
-     </div>
-     <style>
-     .audio-recorder-container {
-         display: flex;
-         flex-direction: column;
-         align-items: center;
-         padding: 20px;
-     }
-     .record-button {
-         background-color: #f63366;
-         color: white;
-         border: none;
-         padding: 10px 20px;
-         border-radius: 5px;
-         cursor: pointer;
-         font-size: 16px;
-     }
-     .record-button.recording {
-         background-color: #ff0000;
-         animation: pulse 1.5s infinite;
-     }
-     @keyframes pulse {
-         0% { opacity: 1; }
-         50% { opacity: 0.7; }
-         100% { opacity: 1; }
-     }
-     </style>
-     """
-
-     return components.html(audio_recorder_html, height=150)

- # Function to display analysis results
- def display_analysis_results(transcribed_text):
-     # Fix 5: Add debugging to track what's happening
-     st.session_state.debug_info = st.session_state.get('debug_info', [])
-     st.session_state.debug_info.append(f"Processing text: {transcribed_text[:50]}...")
-
-     emotions_dict, top_emotion, emotion_map, sentiment = perform_emotion_detection(transcribed_text)
-     is_sarcastic, sarcasm_score = perform_sarcasm_detection(transcribed_text)
-
-     # Add results to debug info
-     st.session_state.debug_info.append(f"Top emotion: {top_emotion}, Sentiment: {sentiment}")
-     st.session_state.debug_info.append(f"Sarcasm: {is_sarcastic}, Score: {sarcasm_score:.3f}")

-     st.header("Transcribed Text")
-     st.text_area("Text", transcribed_text, height=150, disabled=True, help="The audio converted to text.")

-     confidence_score = min(0.95, max(0.70, len(transcribed_text.split()) / 50))
-     st.caption(f"Transcription confidence: {confidence_score:.2f}")

-     st.header("Analysis Results")
-     col1, col2 = st.columns([1, 2])

      with col1:
          st.subheader("Sentiment")
          sentiment_icon = "👍" if sentiment == "POSITIVE" else "👎" if sentiment == "NEGATIVE" else "😐"
          st.markdown(f"**{sentiment_icon} {sentiment.capitalize()}** (Based on {top_emotion})")
-         st.info("Sentiment reflects the dominant emotion's tone.")
-
          st.subheader("Sarcasm")
          sarcasm_icon = "😏" if is_sarcastic else "😐"
-         sarcasm_text = "Detected" if is_sarcastic else "Not Detected"
-         st.markdown(f"**{sarcasm_icon} {sarcasm_text}** (Score: {sarcasm_score:.3f})")
-         st.info("Score indicates sarcasm confidence (0 to 1).")

      with col2:
-         st.subheader("Emotions")
-         if emotions_dict:
-             st.markdown(f"**Dominant:** {emotion_map.get(top_emotion, '❓')} {top_emotion.capitalize()} (Score: {emotions_dict[top_emotion]:.3f})")
-             sorted_emotions = sorted(emotions_dict.items(), key=lambda x: x[1], reverse=True)
-             top_emotions = sorted_emotions[:8]
-             emotions = [e[0] for e in top_emotions]
-             scores = [e[1] for e in top_emotions]
-             fig = px.bar(x=emotions, y=scores, labels={'x': 'Emotion', 'y': 'Score'},
-                          title="Top Emotions Distribution", color=emotions,
-                          color_discrete_sequence=px.colors.qualitative.Bold)
-             fig.update_layout(yaxis_range=[0, 1], showlegend=False, title_font_size=14)
-             st.plotly_chart(fig, use_container_width=True)
-         else:
-             st.write("No emotions detected.")
-
-     # Fix 6: Add debug expander for troubleshooting
-     with st.expander("Debug Information", expanded=False):
-         st.write("Debugging information for troubleshooting:")
-         for i, debug_line in enumerate(st.session_state.debug_info[-10:]):
-             st.text(f"{i+1}. {debug_line}")
-         if emotions_dict:
-             st.write("Raw emotion scores:")
-             for emotion, score in sorted(emotions_dict.items(), key=lambda x: x[1], reverse=True):
-                 if score > 0.01: # Only show non-negligible scores
-                     st.text(f"{emotion}: {score:.4f}")
-
-     with st.expander("Analysis Details", expanded=False):
          st.write("""
-         **How this works:**
-         1. **Speech Recognition**: Audio transcribed using OpenAI Whisper (large-v3)
-         2. **Emotion Analysis**: RoBERTa model trained on GoEmotions (27 emotions)
-         3. **Sentiment Analysis**: Derived from dominant emotion
-         4. **Sarcasm Detection**: RoBERTa model for irony detection
-         **Accuracy depends on**:
-         - Audio quality
-         - Speech clarity
-         - Background noise
-         - Speech patterns
          """)

- # Process base64 audio data
- def process_base64_audio(base64_data):
-     try:
-         base64_binary = base64_data.split(',')[1]
-         binary_data = base64.b64decode(base64_binary)
-
-         temp_dir = tempfile.gettempdir()
-         temp_file_path = os.path.join(temp_dir, f"recording_{int(time.time())}.wav")
-
-         with open(temp_file_path, "wb") as f:
-             f.write(binary_data)
-
-         if not validate_audio(temp_file_path):
-             return None
-
-         return temp_file_path
-     except Exception as e:
-         st.error(f"Error processing audio data: {str(e)}")
-         return None

  # Main App Logic
  def main():
-     # Fix 7: Initialize session state for debugging
-     if 'debug_info' not in st.session_state:
-         st.session_state.debug_info = []
-
      tab1, tab2 = st.tabs(["📁 Upload Audio", "🎙️ Record Audio"])

      with tab1:
-         st.header("Upload an Audio File")
-         audio_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "ogg"],
-                                       help="Upload an audio file for analysis")
-
          if audio_file:
-             st.audio(audio_file.getvalue())
-             st.caption("🎧 Uploaded Audio Playback")
-
-             upload_button = st.button("Analyze Upload", key="analyze_upload")
-
-             if upload_button:
-                 with st.spinner('Analyzing audio with advanced precision...'):
-                     temp_audio_path = process_uploaded_audio(audio_file)
-                     if temp_audio_path:
-                         main_text, alternatives = transcribe_audio(temp_audio_path, show_alternative=True)
-
-                         if main_text:
-                             if alternatives:
-                                 with st.expander("Alternative transcriptions detected", expanded=False):
-                                     for i, alt in enumerate(alternatives[:3], 1):
-                                         st.write(f"{i}. {alt}")
-
-                             display_analysis_results(main_text)
-                         else:
-                             st.error("Could not transcribe the audio. Please try again with clearer audio.")
-
-                         if os.path.exists(temp_audio_path):
-                             os.remove(temp_audio_path)
-
      with tab2:
          st.header("Record Your Voice")
-         st.write("Use the recorder below to analyze your speech in real-time.")
-
-         st.subheader("Browser-Based Recorder")
-         st.write("Click the button below to start/stop recording.")
-
-         audio_data = custom_audio_recorder()
-
-         if audio_data:
-             analyze_rec_button = st.button("Analyze Recording", key="analyze_rec")
-
-             if analyze_rec_button:
-                 with st.spinner("Processing your recording..."):
-                     temp_audio_path = process_base64_audio(audio_data)
-
-                     if temp_audio_path:
-                         transcribed_text = transcribe_audio(temp_audio_path)
-
-                         if transcribed_text:
-                             display_analysis_results(transcribed_text)
-                         else:
-                             st.error("Could not transcribe the audio. Please try speaking more clearly.")
-
-                         if os.path.exists(temp_audio_path):
-                             os.remove(temp_audio_path)
-
-         st.subheader("Manual Text Input")
-         st.write("If recording doesn't work, you can type your text here:")
-
-         manual_text = st.text_area("Enter text to analyze:", placeholder="Type what you want to analyze...")
-         analyze_text_button = st.button("Analyze Text", key="analyze_manual")
-
-         if analyze_text_button and manual_text:
-             display_analysis_results(manual_text)
-
-     show_model_info()

  if __name__ == "__main__":
      main()
 
  import tempfile
  import torch
  import transformers
+ from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
  import plotly.express as px
  import logging
  import warnings
  import whisper
  from pydub import AudioSegment
  import time
+ import numpy as np
+ import librosa
+ import subprocess
+ import pyaudio
+ import wave
  import io

  # Suppress warnings for a clean console
  logging.getLogger("torch").setLevel(logging.CRITICAL)
  print(f"Using device: {device}")

  # Set Streamlit app layout
+ st.set_page_config(layout="wide", page_title="Advanced Voice Emotion Analyzer")

  # Interface design
+ st.title("🎙️ Advanced Voice Emotion Analyzer")
+ st.write("Analyze all emotions from audio using hybrid ML models, ensuring accurate detection across 27 emotions.")

+ # Audio Preprocessing
+ def make_audio_scarier(audio_path, output_path):
+     try:
+         commands = [
+             f"ffmpeg -i {audio_path} -af 'asetrate=44100*0.8,aresample=44100' temp1.wav",
+             f"ffmpeg -i temp1.wav -af 'reverb=0.8:0.2:0.5:0.5:0.5:0.5' temp2.wav",
+             f"ffmpeg -i temp2.wav -af 'atempo=1.2' {output_path}"
+         ]
+         for cmd in commands:
+             subprocess.run(cmd, shell=True, check=True)
+         for temp_file in ["temp1.wav", "temp2.wav"]:
+             if os.path.exists(temp_file):
+                 os.remove(temp_file)
+     except Exception as e:
+         st.error(f"Audio processing failed: {str(e)}")
+         raise
+
+ # Audio Feature Extraction
+ def extract_audio_features(audio_path):
+     try:
+         y, sr = librosa.load(audio_path, sr=16000)
+         pitch_mean = np.mean(librosa.piptrack(y=y, sr=sr)[0][librosa.piptrack(y=y, sr=sr)[0] > 0]) if np.any(librosa.piptrack(y=y, sr=sr)[0] > 0) else 0
+         energy_mean = np.mean(librosa.feature.rms(y=y))
+         zcr_mean = np.mean(librosa.feature.zero_crossing_rate(y))
+         return {"pitch_mean": pitch_mean, "energy_mean": energy_mean, "zcr_mean": zcr_mean}
+     except Exception as e:
+         st.error(f"Audio feature extraction failed: {str(e)}")
+         return {}
+
+ # Audio Emotion Classification with Wav2Vec2
+ @st.cache_resource
+ def get_audio_emotion_classifier():
+     processor = Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-er")
+     model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
+     model = model.to(device)
+     return processor, model
+
+ def perform_audio_emotion_detection(audio_path):
+     try:
+         processor, model = get_audio_emotion_classifier()
+         waveform, sample_rate = librosa.load(audio_path, sr=16000)
+         inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+         with torch.no_grad():
+             logits = model(**inputs).logits
+         scores = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]
+         audio_emotions = ["neutral", "happy", "sad", "angry", "fearful", "surprise", "disgust"]
+         emotion_dict = {emotion: float(scores[i]) for i, emotion in enumerate(audio_emotions)}
+         top_emotion = audio_emotions[np.argmax(scores)]
+         # Boost emotions for audio characteristics
+         features = extract_audio_features(audio_path)
+         if features.get("pitch_mean", 0) < 200 and features.get("energy_mean", 0) > 0.1 and features.get("zcr_mean", 0) > 0.1:
+             emotion_dict["fearful"] = min(1.0, emotion_dict.get("fearful", 0) + 0.3)
+             top_emotion = "fearful" if emotion_dict["fearful"] > emotion_dict[top_emotion] else top_emotion
+         elif features.get("energy_mean", 0) > 0.2:
+             emotion_dict["angry"] = min(1.0, emotion_dict.get("angry", 0) + 0.2)
+             top_emotion = "angry" if emotion_dict["angry"] > emotion_dict[top_emotion] else top_emotion
+         return emotion_dict, top_emotion
+     except Exception as e:
+         st.error(f"Audio emotion detection failed: {str(e)}")
+         return {}, "unknown"
+
+ # Text Emotion Classification with RoBERTa
  @st.cache_resource
+ def get_text_emotion_classifier():
      tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions", use_fast=True)
      model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")
      model = model.to(device)
      return pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None, device=-1 if device.type == "cpu" else 0)

+ def perform_text_emotion_detection(text):
      try:
+         classifier = get_text_emotion_classifier()
+         results = classifier(text)[0]
+         emotions = ["admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
+                     "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
+                     "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
+                     "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"]
+         emotions_dict = {result['label']: result['score'] for result in results if result['label'] in emotions}
+         top_emotion = max(emotions_dict, key=emotions_dict.get)
+         return emotions_dict, top_emotion
      except Exception as e:
+         st.error(f"Text emotion detection failed: {str(e)}")
+         return {}, "unknown"

+ # Sarcasm Detection
  @st.cache_resource
  def get_sarcasm_classifier():
      tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-irony", use_fast=True)

  def perform_sarcasm_detection(text):
      try:
+         classifier = get_sarcasm_classifier()
+         result = classifier(text)[0]
          is_sarcastic = result['label'] == "LABEL_1"
          sarcasm_score = result['score'] if is_sarcastic else 1 - result['score']
          return is_sarcastic, sarcasm_score
          st.error(f"Sarcasm detection failed: {str(e)}")
          return False, 0.0

+ # Validate Audio
  def validate_audio(audio_path):
      try:
          sound = AudioSegment.from_file(audio_path)
+         if sound.dBFS < -50 or len(sound) < 1000:
+             st.warning("Audio volume too low or too short. Please use a louder, longer audio.")
              return False
          return True
+     except Exception:
          st.error("Invalid or corrupted audio file.")
          return False

  # Speech Recognition with Whisper
  @st.cache_resource
  def load_whisper_model():
+     return whisper.load_model("large-v3")

+ def transcribe_audio(audio_path):
      try:
          sound = AudioSegment.from_file(audio_path)
          temp_wav_path = os.path.join(tempfile.gettempdir(), "temp_converted.wav")
+         sound = sound.set_frame_rate(16000).set_channels(1)
          sound.export(temp_wav_path, format="wav")
          model = load_whisper_model()
          result = model.transcribe(temp_wav_path, language="en")
+         os.remove(temp_wav_path)
+         return result["text"].strip()
      except Exception as e:
          st.error(f"Transcription failed: {str(e)}")
+         return ""

+ # Python Audio Recording
+ def record_audio():
+     CHUNK = 1024
+     FORMAT = pyaudio.paInt16
+     CHANNELS = 1
+     RATE = 16000
+     RECORD_SECONDS = st.slider("Recording duration (seconds)", 1, 30, 5)
+
+     p = pyaudio.PyAudio()
+     stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

+     if st.button("Start Recording"):
+         st.write("Recording...")
+         frames = []
+         for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
+             data = stream.read(CHUNK)
+             frames.append(data)
+         st.write("Recording finished.")
+
+         stream.stop_stream()
+         stream.close()
+         p.terminate()
+
+         temp_file_path = os.path.join(tempfile.gettempdir(), f"recorded_audio_{int(time.time())}.wav")
+         wf = wave.open(temp_file_path, 'wb')
+         wf.setnchannels(CHANNELS)
+         wf.setsampwidth(p.get_sample_size(FORMAT))
+         wf.setframerate(RATE)
+         wf.writeframes(b''.join(frames))
+         wf.close()

          return temp_file_path
+     return None
+
+ # Process Audio Files
+ def process_audio_file(audio_data):
+     temp_dir = tempfile.gettempdir()
+     temp_file_path = os.path.join(temp_dir, f"audio_{int(time.time())}.wav")
+     with open(temp_file_path, "wb") as f:
+         if isinstance(audio_data, str):
+             with open(audio_data, "rb") as f_audio:
+                 f.write(f_audio.read())
+         else:
+             f.write(audio_data.getvalue())
+     if not validate_audio(temp_file_path):
          return None
+     return temp_file_path

+ # Display Results
+ def display_analysis_results(audio_path):
+     st.header("Audio Analysis")
+     st.audio(audio_path)

+     # Preprocess audio
+     processed_audio_path = os.path.join(tempfile.gettempdir(), f"processed_{int(time.time())}.wav")
+     make_audio_scarier(audio_path, processed_audio_path)

+     # Audio emotion detection
+     audio_emotions, audio_top_emotion = perform_audio_emotion_detection(processed_audio_path)
+     st.subheader("Audio-Based Emotion")
+     st.write(f"**Dominant Emotion:** {audio_top_emotion} (Score: {audio_emotions.get(audio_top_emotion, 0):.3f})")

+     # Transcription and text emotion detection
+     transcribed_text = transcribe_audio(processed_audio_path)
+     st.subheader("Transcribed Text")
+     st.text_area("Text", transcribed_text, height=100, disabled=True)
+     if transcribed_text:
+         text_emotions, text_top_emotion = perform_text_emotion_detection(transcribed_text)
+         st.write(f"**Text-Based Dominant Emotion:** {text_top_emotion} (Score: {text_emotions.get(text_top_emotion, 0):.3f})")

+     # Combine emotions (prioritize audio, map to 27 emotions)
+     emotion_map = {
+         "neutral": "neutral", "happy": "joy", "sad": "sadness", "angry": "anger",
+         "fearful": "fear", "surprise": "surprise", "disgust": "disgust"
+     }
+     combined_emotions = {emotion: 0 for emotion in ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
+                                                     "confusion", "curiosity", "desire", "disappointment", "disapproval",
+                                                     "disgust", "embarrassment", "excitement", "fear", "gratitude",
+                                                     "grief", "joy", "love", "nervousness", "optimism", "pride",
+                                                     "realization", "relief", "remorse", "sadness", "surprise", "neutral"]}
+     for audio_emotion, score in audio_emotions.items():
+         mapped_emotion = emotion_map.get(audio_emotion, "neutral")
+         combined_emotions[mapped_emotion] = max(combined_emotions[mapped_emotion], score * 0.7)
+     if transcribed_text:
+         for text_emotion, score in text_emotions.items():
+             combined_emotions[text_emotion] = combined_emotions.get(text_emotion, 0) + score * 0.3

+     top_emotion = max(combined_emotions, key=combined_emotions.get)
+     sentiment = "POSITIVE" if top_emotion in ["admiration", "amusement", "approval", "caring", "desire", "excitement",
+                                               "gratitude", "joy", "love", "optimism", "pride", "relief"] else "NEGATIVE" if top_emotion in ["anger", "annoyance", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"] else "NEUTRAL"
+
+     # Sarcasm detection
+     is_sarcastic, sarcasm_score = perform_sarcasm_detection(transcribed_text) if transcribed_text else (False, 0.0)

+     # Display results
+     col1, col2 = st.columns([1, 2])
      with col1:
          st.subheader("Sentiment")
          sentiment_icon = "👍" if sentiment == "POSITIVE" else "👎" if sentiment == "NEGATIVE" else "😐"
          st.markdown(f"**{sentiment_icon} {sentiment.capitalize()}** (Based on {top_emotion})")
          st.subheader("Sarcasm")
          sarcasm_icon = "😏" if is_sarcastic else "😐"
+         st.markdown(f"**{sarcasm_icon} {'Detected' if is_sarcastic else 'Not Detected'}** (Score: {sarcasm_score:.3f})")

      with col2:
+         st.subheader("Emotion Distribution")
+         sorted_emotions = sorted(combined_emotions.items(), key=lambda x: x[1], reverse=True)[:10]
+         emotions, scores = zip(*sorted_emotions)
+         fig = px.bar(x=list(emotions), y=list(scores), labels={'x': 'Emotion', 'y': 'Score'},
+                      title="Top Emotion Scores", color=list(emotions),
+                      color_discrete_sequence=px.colors.qualitative.Bold)
+         fig.update_layout(yaxis_range=[0, 1], showlegend=False, title_font_size=14)
+         st.plotly_chart(fig, use_container_width=True)
+
+     with st.expander("Details"):
+         st.write(f"**Audio Features:** {extract_audio_features(processed_audio_path)}")
          st.write("""
+         **How it works:**
+         - Audio Emotion: Wav2Vec2 detects 7 emotions from audio.
+         - Transcription: Whisper converts audio to text.
+         - Text Emotion: RoBERTa refines 27 emotions from text.
+         - Sarcasm: Analyzes text for irony.
+         **Accuracy depends on:** Audio quality, clarity, and noise.
          """)

+     # Clean up
+     for path in [audio_path, processed_audio_path]:
+         if os.path.exists(path):
+             os.remove(path)

  # Main App Logic
  def main():
      tab1, tab2 = st.tabs(["📁 Upload Audio", "🎙️ Record Audio"])

      with tab1:
+         st.header("Upload Audio File")
+         audio_file = st.file_uploader("Upload audio (wav, mp3, ogg)", type=["wav", "mp3", "ogg"])
          if audio_file:
+             temp_audio_path = process_audio_file(audio_file)
+             if temp_audio_path:
+                 if st.button("Analyze Upload"):
+                     with st.spinner("Analyzing..."):
+                         display_analysis_results(temp_audio_path)
+
      with tab2:
          st.header("Record Your Voice")
+         st.write("Record audio to analyze emotions in real-time.")
+         temp_audio_path = record_audio()
+         if temp_audio_path:
+             if st.button("Analyze Recording"):
+                 with st.spinner("Processing..."):
+                     display_analysis_results(temp_audio_path)

+     st.sidebar.header("About")
+     st.sidebar.write("""
+     **Models Used:**
+     - Audio: superb/wav2vec2-base-superb-er (7 emotions)
+     - Text: SamLowe/roberta-base-go_emotions (27 emotions)
+     - Sarcasm: cardiffnlp/twitter-roberta-base-irony
+     - Speech: OpenAI Whisper (large-v3)
+     """)

  if __name__ == "__main__":
      main()
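
For reference, here is a minimal standalone sketch (not part of the commit) of the audio/text fusion step that the new display_analysis_results performs: the 7 Wav2Vec2 labels are mapped onto the 27 GoEmotions labels with a 0.7 weight and the text scores are added with a 0.3 weight, as in the added code above. The label lists, mapping, and weights are taken from the diff; the sample scores at the end are made up for illustration.

# Sketch of the hybrid emotion fusion from the new display_analysis_results.
# GO_EMOTIONS, AUDIO_TO_GO, and the 0.7/0.3 weights come from the diff; the inputs are invented.

GO_EMOTIONS = ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
               "confusion", "curiosity", "desire", "disappointment", "disapproval",
               "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
               "joy", "love", "nervousness", "optimism", "pride", "realization",
               "relief", "remorse", "sadness", "surprise", "neutral"]

AUDIO_TO_GO = {"neutral": "neutral", "happy": "joy", "sad": "sadness", "angry": "anger",
               "fearful": "fear", "surprise": "surprise", "disgust": "disgust"}

def fuse_emotions(audio_scores, text_scores):
    # Start every GoEmotions label at 0, take the strongest mapped audio evidence (weight 0.7),
    # then add weighted text evidence (weight 0.3), mirroring the logic in the diff.
    combined = {label: 0.0 for label in GO_EMOTIONS}
    for label, score in audio_scores.items():
        mapped = AUDIO_TO_GO.get(label, "neutral")
        combined[mapped] = max(combined[mapped], score * 0.7)
    for label, score in text_scores.items():
        combined[label] = combined.get(label, 0.0) + score * 0.3
    top = max(combined, key=combined.get)
    return top, combined

if __name__ == "__main__":
    # Made-up scores: an angry-sounding clip whose transcript reads as annoyance/anger.
    top, scores = fuse_emotions({"angry": 0.6, "neutral": 0.3, "sad": 0.1},
                                {"anger": 0.4, "annoyance": 0.5, "neutral": 0.1})
    print(top)  # -> "anger"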