Nick021402 committed on
Commit
4c08572
·
verified ·
1 Parent(s): 72324ef

Update app.py

Files changed (1)
  1. app.py +294 -67
app.py CHANGED
@@ -1,70 +1,297 @@
  import gradio as gr
  import torch
- import numpy as np
  import torchaudio
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- from transformers import pipeline
-
- # Load models once at startup
- asr_model_name = "facebook/wav2vec2-base-960h"
- emotion_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
- gen_model_name = "google/flan-t5-base"
-
- # Load ASR
- asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
- asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
-
- # Load emotion detection
- emotion_classifier = pipeline("audio-classification", model=emotion_model_name)
-
- # Load personality generation pipeline
- gen_pipeline = pipeline("text2text-generation", model=gen_model_name)
-
- # Transcription Function
- def transcribe(audio):
-     if isinstance(audio, tuple):  # When type="numpy"
-         sr, audio = 16000, audio[0]  # Handle stereo or mono
-     input_values = asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_values
-     with torch.no_grad():
-         logits = asr_model(input_values).logits
-     predicted_ids = torch.argmax(logits, dim=-1)
-     transcription = asr_processor.decode(predicted_ids[0])
-     return transcription.lower()
-
- # Personality Generation
- def generate_personality(text):
-     prompt = f"Describe the speaker's personality based on this sentence: \"{text}\""
-     response = gen_pipeline(prompt, max_new_tokens=50)[0]["generated_text"]
-     return response.strip()
-
- # Emotion Detection
- def detect_emotion(audio):
-     if isinstance(audio, tuple):
-         audio = audio[0]  # Extract numpy array from (array, sample_rate)
-     results = emotion_classifier(audio, top_k=1)
-     return results[0]["label"]
-
- # Main Pipeline
- def analyze(audio):
-     transcription = transcribe(audio)
-     emotion = detect_emotion(audio)
-     personality = generate_personality(transcription)
-     return transcription, emotion, personality
-
- # Gradio UI
- with gr.Blocks() as app:
-     gr.Markdown("# Voice2Persona AI\nUpload or record your voice to reveal your mood and hidden persona!")
-
-     with gr.Row():
-         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
-         submit_btn = gr.Button("Analyze")
-
-     with gr.Column():
-         transcript_output = gr.Textbox(label="Transcription")
-         emotion_output = gr.Textbox(label="Detected Emotion")
-         personality_output = gr.Textbox(label="AI-Generated Personality")
-
-     submit_btn.click(fn=analyze, inputs=audio_input,
-                      outputs=[transcript_output, emotion_output, personality_output])
-
- app.launch()
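
A note on the removed transcribe(): it treated the Gradio type="numpy" payload as (data, sample_rate), but Gradio supplies (sample_rate, data), so sr, audio = 16000, audio[0] handed the integer sample rate to the model as audio and hardcoded 16 kHz regardless of the actual recording rate; the removed detect_emotion() made the same inverted assumption. The rewrite below sidesteps this by switching the Audio component to type="filepath" and loading from disk. For reference, a minimal sketch of correct in-memory handling; the helper name to_model_input is illustrative, not part of this commit:

import numpy as np
import torch
import torchaudio

def to_model_input(audio, target_sr=16000):
    # Illustrative helper, not part of this commit.
    sr, data = audio                           # Gradio numpy audio is (sample_rate, np.ndarray)
    if data.dtype == np.int16:                 # microphone capture arrives as 16-bit PCM
        data = data.astype(np.float32) / 32768.0
    else:
        data = data.astype(np.float32)
    if data.ndim == 2:                         # stereo -> mono by averaging channels
        data = data.mean(axis=1)
    if sr != target_sr:                        # resample to the model's expected rate
        data = torchaudio.functional.resample(torch.from_numpy(data), sr, target_sr).numpy()
    return data
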
  import gradio as gr
  import torch
  import torchaudio
+ import numpy as np
+ from transformers import (
+     Wav2Vec2ForCTC,
+     Wav2Vec2Tokenizer,
+     Wav2Vec2FeatureExtractor,
+     AutoModelForAudioClassification,
+     AutoFeatureExtractor,
+     T5ForConditionalGeneration,
+     T5Tokenizer
+ )
+ import librosa
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ # Initialize models and tokenizers
+ print("Loading models...")
+
+ # Speech-to-Text Model
+ stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+ stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+ stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+
+ # Emotion Recognition Model
+ emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
+ emotion_model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
+
+ # Personality Generation Model
+ personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
+ personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
+
+ print("Models loaded successfully!")
+
+ # Emotion labels mapping
+ EMOTION_LABELS = {
+     0: "angry",
+     1: "disgust",
+     2: "fear",
+     3: "happy",
+     4: "neutral",
+     5: "sad",
+     6: "surprise"
+ }
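
A caveat on the EMOTION_LABELS table added here: it hardcodes an index order that the checkpoint is not guaranteed to share. Transformers classification checkpoints carry their own mapping in the model config, so deriving the table at load time keeps it in sync with the classification head. A one-line sketch, assuming the standard id2label layout:

# Sketch, not part of this commit: read the labels from the checkpoint config.
EMOTION_LABELS = {int(i): label for i, label in emotion_model.config.id2label.items()}
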
+
+ def preprocess_audio(audio_path, target_sr=16000):
+     """Load and preprocess audio for model input"""
+     try:
+         # Load audio file
+         audio, sr = librosa.load(audio_path, sr=target_sr)
+
+         # Ensure audio is not too short
+         if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
+             audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
+
+         return audio, sr
+     except Exception as e:
+         print(f"Error preprocessing audio: {e}")
+         return None, None
+
+ def transcribe_audio(audio_path):
+     """Convert speech to text using Wav2Vec2"""
+     try:
+         audio, sr = preprocess_audio(audio_path)
+         if audio is None:
+             return "Error: Could not process audio file"
+
+         # Extract features
+         inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+
+         # Get model predictions
+         with torch.no_grad():
+             logits = stt_model(inputs.input_values).logits
+
+         # Decode predictions
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = stt_tokenizer.batch_decode(predicted_ids)[0]
+
+         return transcription.strip()
+     except Exception as e:
+         return f"Transcription error: {str(e)}"
+
+ def detect_emotion(audio_path):
+     """Detect emotion from audio using specialized model"""
+     try:
+         audio, sr = preprocess_audio(audio_path)
+         if audio is None:
+             return "Error: Could not process audio file", 0.0
+
+         # Extract features for emotion model
+         inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+
+         # Get emotion predictions
+         with torch.no_grad():
+             outputs = emotion_model(**inputs)
+             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+
+         # Get the most likely emotion
+         emotion_id = torch.argmax(predictions, dim=-1).item()
+         confidence = torch.max(predictions).item()
+
+         emotion_label = EMOTION_LABELS.get(emotion_id, "unknown")
+
+         return emotion_label, confidence
+     except Exception as e:
+         return f"Emotion detection error: {str(e)}", 0.0
+
+ def generate_personality(transcription, emotion, confidence):
+     """Generate personality description using FLAN-T5"""
+     try:
+         # Create a comprehensive prompt for personality analysis
+         prompt = f"""Analyze this person's personality based on their speech:
+
+ Speech content: "{transcription}"
+ Detected emotion: {emotion} (confidence: {confidence:.2f})
+
+ Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""
+
+         # Tokenize and generate
+         inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
+
+         with torch.no_grad():
+             outputs = personality_model.generate(
+                 inputs,
+                 max_length=200,
+                 min_length=50,
+                 temperature=0.7,
+                 do_sample=True,
+                 top_p=0.9,
+                 pad_token_id=personality_tokenizer.eos_token_id
+             )
+
+         personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         return personality_description
+     except Exception as e:
+         return f"Personality generation error: {str(e)}"
+
+ def create_confidence_bar(emotion, confidence):
+     """Create a visual representation of emotion confidence"""
+     bar_length = int(confidence * 20)  # Scale to 20 characters
+     bar = "█" * bar_length + "░" * (20 - bar_length)
+     return f"{emotion.upper()} {bar} {confidence:.1%}"
+
+ def analyze_voice(audio_file):
+     """Main function that orchestrates the entire analysis pipeline"""
+     if audio_file is None:
+         return "Please upload or record an audio file.", "", "", ""
+
+     try:
+         # Step 1: Transcribe speech
+         transcription = transcribe_audio(audio_file)
+
+         # Step 2: Detect emotion
+         emotion, confidence = detect_emotion(audio_file)
+
+         # Step 3: Generate personality description
+         personality = generate_personality(transcription, emotion, confidence)
+
+         # Create formatted output
+         confidence_display = create_confidence_bar(emotion, confidence)
+
+         # Format results
+         results_summary = f"""
+ 🎯 **VOICE ANALYSIS COMPLETE**
+
+ **What they said:** {transcription}
+
+ **How they felt:** {confidence_display}
+
+ **Who they might be:** {personality}
+ """
+
+         return transcription, confidence_display, personality, results_summary
+
+     except Exception as e:
+         error_msg = f"Analysis failed: {str(e)}"
+         return error_msg, "", "", error_msg
+
+ # Create the Gradio interface
+ def create_interface():
+     with gr.Blocks(
+         theme=gr.themes.Soft(),
+         title="Voice2Persona AI",
+         css="""
+         .main-header {
+             text-align: center;
+             background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+             -webkit-background-clip: text;
+             -webkit-text-fill-color: transparent;
+             font-size: 2.5em;
+             font-weight: bold;
+             margin-bottom: 0.5em;
+         }
+         .description {
+             text-align: center;
+             font-size: 1.1em;
+             color: #666;
+             margin-bottom: 2em;
+         }
+         .result-box {
+             border-radius: 10px;
+             padding: 20px;
+             margin: 10px 0;
+         }
+         """
+     ) as interface:
+
+         gr.HTML("""
+         <div class="main-header">🎙️ Voice2Persona AI</div>
+         <div class="description">
+             Discover your voice's hidden story! Upload or record audio to uncover what you said,
+             how you felt, and insights into your personality.
+         </div>
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 🎵 Audio Input")
+                 audio_input = gr.Audio(
+                     label="Record or Upload Audio",
+                     type="filepath",
+                     sources=["microphone", "upload"]
+                 )
+
+                 analyze_btn = gr.Button(
+                     "🔍 Analyze Voice",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 gr.Markdown("""
+                 **Tips for best results:**
+                 - Speak clearly for 3-10 seconds
+                 - Use a quiet environment
+                 - Express yourself naturally
+                 """)
+
+             with gr.Column(scale=2):
+                 gr.Markdown("### 📊 Analysis Results")
+
+                 with gr.Tab("📝 Complete Analysis"):
+                     results_display = gr.Markdown(
+                         label="Full Analysis",
+                         value="Upload audio to see your voice analysis here..."
+                     )
+
+                 with gr.Tab("🔍 Detailed Breakdown"):
+                     transcription_output = gr.Textbox(
+                         label="💬 Speech Content (What you said)",
+                         placeholder="Transcription will appear here...",
+                         lines=3
+                     )
+
+                     emotion_output = gr.Textbox(
+                         label="😊 Emotional State (How you felt)",
+                         placeholder="Emotion analysis will appear here...",
+                         lines=2
+                     )
+
+                     personality_output = gr.Textbox(
+                         label="🧠 Personality Insights (Who you might be)",
+                         placeholder="Personality analysis will appear here...",
+                         lines=5
+                     )
+
+         # Connect the analyze button to the main function
+         analyze_btn.click(
+             fn=analyze_voice,
+             inputs=[audio_input],
+             outputs=[transcription_output, emotion_output, personality_output, results_display]
+         )
+
+         gr.Markdown("""
+         ---
+         ### About Voice2Persona AI
+
+         This AI system combines three powerful models:
+         - **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription
+         - **Emotion Detection**: Specialized model for voice emotion recognition
+         - **Personality Analysis**: Google's FLAN-T5 for generating personality insights
+
+         *Built with ❤️ using Hugging Face Transformers and Gradio*
+         """)
+
+     return interface
+
+ # Launch the app
+ if __name__ == "__main__":
+     app = create_interface()
+     app.launch(
+         share=True,
+         show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )
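
Two closing observations on the launch block. On Hugging Face Spaces, share=True is typically unnecessary, since Spaces already serves the app at a public URL and recent Gradio versions warn that share links are not used there; server_name="0.0.0.0" and server_port=7860 match the Spaces defaults. The pipeline can also be exercised without the UI by calling analyze_voice directly; sample.wav below is a placeholder path, not a file shipped with this commit:

# Placeholder path; substitute any short speech recording.
transcript, emotion_bar, persona, summary = analyze_voice("sample.wav")
print(summary)
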