import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Tokenizer,
    Wav2Vec2FeatureExtractor,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Wav2Vec2ForSequenceClassification
)
import librosa
import warnings
warnings.filterwarnings("ignore")
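# NOTE: all checkpoints below are downloaded from the Hugging Face Hub on first
# run and are executed on CPU by default; call .to("cuda") on the models (and
# move the input tensors accordingly) if a GPU is available.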
# Initialize models and tokenizers
print("Loading models...")
# Speech-to-Text Model
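# wav2vec2-base-960h is a CTC model fine-tuned on 16 kHz LibriSpeech audio,
# which is why preprocess_audio() below resamples every input to 16 kHz.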
stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
# Emotion Recognition Model - using a more reliable model
try:
    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
except Exception:
    # Fallback to a simpler approach using audio features
    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    emotion_model = None
    print("Using fallback emotion detection method")
# Personality Generation Model
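# FLAN-T5 is an instruction-tuned seq2seq model; it is used here purely as a
# text generator that turns the transcript and detected emotion into a short
# personality write-up (see generate_personality below).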
personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
print("Models loaded successfully!")
# Emotion labels mapping (updated for broader coverage)
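# Local fallback table: detect_emotion() prefers the emotion checkpoint's own
# id2label mapping when it is available and only falls back to this dict (or to
# the audio-feature heuristics) otherwise.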
EMOTION_LABELS = {
0: "angry",
1: "happy",
2: "sad",
3: "neutral",
4: "excited",
5: "calm",
6: "surprised"
}
def preprocess_audio(audio_path, target_sr=16000):
"""Load and preprocess audio for model input"""
try:
# Load audio file
audio, sr = librosa.load(audio_path, sr=target_sr)
# Ensure audio is not too short
if len(audio) < target_sr * 0.5: # Less than 0.5 seconds
audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
return audio, sr
except Exception as e:
print(f"Error preprocessing audio: {e}")
return None, None
def transcribe_audio(audio_path):
"""Convert speech to text using Wav2Vec2"""
try:
audio, sr = preprocess_audio(audio_path)
if audio is None:
return "Error: Could not process audio file"
# Extract features
inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
# Get model predictions
with torch.no_grad():
logits = stt_model(inputs.input_values).logits
# Decode predictions
predicted_ids = torch.argmax(logits, dim=-1)
transcription = stt_tokenizer.batch_decode(predicted_ids)[0]
return transcription.strip()
except Exception as e:
return f"Transcription error: {str(e)}"
def detect_emotion(audio_path):
"""Detect emotion from audio using audio features analysis"""
try:
audio, sr = preprocess_audio(audio_path)
if audio is None:
return "Error: Could not process audio file", 0.0
if emotion_model is not None:
# Use the wav2vec2 emotion model if available
inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
with torch.no_grad():
outputs = emotion_model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
emotion_id = torch.argmax(predictions, dim=-1).item()
confidence = torch.max(predictions).item()
emotion_label = EMOTION_LABELS.get(emotion_id, "neutral")
else:
# Fallback: Simple audio feature-based emotion detection
# Analyze audio characteristics
rms_energy = np.sqrt(np.mean(audio**2))
zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio)[0])
spectral_centroid = np.mean(librosa.feature.spectral_centroid(audio, sr=sr)[0])
# Simple heuristic-based emotion classification
if rms_energy > 0.02 and zero_crossing_rate > 0.1:
emotion_label = "excited"
confidence = 0.75
elif rms_energy < 0.005:
emotion_label = "calm"
confidence = 0.70
elif spectral_centroid > 2000:
emotion_label = "happy"
confidence = 0.65
else:
emotion_label = "neutral"
confidence = 0.60
return emotion_label, confidence
except Exception as e:
return "neutral", 0.50 # Default fallback
def generate_personality(transcription, emotion, confidence):
"""Generate personality description using FLAN-T5"""
try:
# Create a comprehensive prompt for personality analysis
prompt = f"""Analyze this person's personality based on their speech:
Speech content: "{transcription}"
Detected emotion: {emotion} (confidence: {confidence:.2f})
Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""
# Tokenize and generate
inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
with torch.no_grad():
outputs = personality_model.generate(
inputs,
max_length=200,
min_length=50,
temperature=0.7,
do_sample=True,
top_p=0.9,
pad_token_id=personality_tokenizer.eos_token_id
)
personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
return personality_description
except Exception as e:
return f"Personality generation error: {str(e)}"
def create_confidence_bar(emotion, confidence):
"""Create a visual representation of emotion confidence"""
bar_length = int(confidence * 20) # Scale to 20 characters
bar = "β" * bar_length + "β" * (20 - bar_length)
return f"{emotion.upper()} {bar} {confidence:.1%}"
def analyze_voice(audio_file):
"""Main function that orchestrates the entire analysis pipeline"""
if audio_file is None:
return "Please upload or record an audio file.", "", "", ""
try:
# Step 1: Transcribe speech
transcription = transcribe_audio(audio_file)
# Step 2: Detect emotion
emotion, confidence = detect_emotion(audio_file)
# Step 3: Generate personality description
personality = generate_personality(transcription, emotion, confidence)
# Create formatted output
confidence_display = create_confidence_bar(emotion, confidence)
# Format results
results_summary = f"""
π― **VOICE ANALYSIS COMPLETE**
**What they said:** {transcription}
**How they felt:** {confidence_display}
**Who they might be:** {personality}
"""
return transcription, confidence_display, personality, results_summary
except Exception as e:
error_msg = f"Analysis failed: {str(e)}"
return error_msg, "", "", error_msg
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Voice2Persona AI",
        css="""
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 0.5em;
        }
        .description {
            text-align: center;
            font-size: 1.1em;
            color: #666;
            margin-bottom: 2em;
        }
        .result-box {
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
        }
        """
    ) as interface:
gr.HTML("""
<div class="main-header">ποΈ Voice2Persona AI</div>
<div class="description">
Discover your voice's hidden story! Upload or record audio to uncover what you said,
how you felt, and insights into your personality.
</div>
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### π΅ Audio Input")
audio_input = gr.Audio(
label="Record or Upload Audio",
type="filepath",
sources=["microphone", "upload"]
)
analyze_btn = gr.Button(
"π Analyze Voice",
variant="primary",
size="lg"
)
gr.Markdown("""
**Tips for best results:**
- Speak clearly for 3-10 seconds
- Use a quiet environment
- Express yourself naturally
""")
            with gr.Column(scale=2):
                gr.Markdown("### 📊 Analysis Results")
                with gr.Tab("📋 Complete Analysis"):
                    results_display = gr.Markdown(
                        label="Full Analysis",
                        value="Upload audio to see your voice analysis here..."
                    )
                with gr.Tab("🔍 Detailed Breakdown"):
                    transcription_output = gr.Textbox(
                        label="💬 Speech Content (What you said)",
                        placeholder="Transcription will appear here...",
                        lines=3
                    )
                    emotion_output = gr.Textbox(
                        label="😊 Emotional State (How you felt)",
                        placeholder="Emotion analysis will appear here...",
                        lines=2
                    )
                    personality_output = gr.Textbox(
                        label="🧠 Personality Insights (Who you might be)",
                        placeholder="Personality analysis will appear here...",
                        lines=5
                    )
        # Connect the analyze button to the main function
        analyze_btn.click(
            fn=analyze_voice,
            inputs=[audio_input],
            outputs=[transcription_output, emotion_output, personality_output, results_display]
        )
gr.Markdown("""
---
### About Voice2Persona AI
This AI system combines three powerful models:
- **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription
- **Emotion Detection**: Specialized model for voice emotion recognition
- **Personality Analysis**: Google's FLAN-T5 for generating personality insights
*Built with β€οΈ using Hugging Face Transformers and Gradio*
""")
return interface
# Launch the app
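# server_name="0.0.0.0" and port 7860 are what Hugging Face Spaces expects;
# share=True only matters when running locally and wanting a temporary public link.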
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )