# Voice2Persona AI - app.py
import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Tokenizer,
    Wav2Vec2FeatureExtractor,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Wav2Vec2ForSequenceClassification,
)
import librosa
import warnings
warnings.filterwarnings("ignore")
# Initialize models and tokenizers
print("Loading models...")
# Speech-to-Text Model
stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
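# Note: recent transformers releases flag Wav2Vec2Tokenizer as deprecated in
# favour of the combined Wav2Vec2Processor (feature extractor + CTC tokenizer),
# e.g. Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h").
# The separate tokenizer / feature-extractor pair above still works, so it is
# left unchanged here.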
# Emotion Recognition Model - using a more reliable model
try:
    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
except Exception:
    # Fallback to a simpler approach using audio features
    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    emotion_model = None
    print("Using fallback emotion detection method")
# Personality Generation Model
personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
print("Models loaded successfully!")
# Emotion labels used by the heuristic fallback path (see the note below)
EMOTION_LABELS = {
    0: "angry",
    1: "happy",
    2: "sad",
    3: "neutral",
    4: "excited",
    5: "calm",
    6: "surprised"
}
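# Note: the superb/wav2vec2-base-superb-er checkpoint itself predicts only four
# classes (its config.id2label is roughly neu/hap/ang/sad), so detect_emotion()
# reads label names from emotion_model.config.id2label when that model is
# loaded; the table above only backs the heuristic fallback path.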
def preprocess_audio(audio_path, target_sr=16000):
    """Load and preprocess audio for model input"""
    try:
        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr)
        # Ensure audio is not too short
        if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
            audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
        return audio, sr
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None
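# Example (with a hypothetical file): preprocess_audio("clip.wav") returns a
# mono float32 numpy array resampled to 16 kHz together with the sample rate,
# or (None, None) if loading fails.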
def transcribe_audio(audio_path):
    """Convert speech to text using Wav2Vec2"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file"
        # Extract features
        inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
        # Get model predictions
        with torch.no_grad():
            logits = stt_model(inputs.input_values).logits
        # Decode predictions
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_tokenizer.batch_decode(predicted_ids)[0]
        return transcription.strip()
    except Exception as e:
        return f"Transcription error: {str(e)}"
def detect_emotion(audio_path):
    """Detect emotion from audio using audio feature analysis"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file", 0.0
        if emotion_model is not None:
            # Use the wav2vec2 emotion model if available
            inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
            with torch.no_grad():
                outputs = emotion_model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                emotion_id = torch.argmax(predictions, dim=-1).item()
                confidence = torch.max(predictions).item()
            # Label order is model-specific, so read it from the model config and
            # expand the short SUPERB codes (neu/hap/ang/sad) to readable names
            raw_label = emotion_model.config.id2label.get(emotion_id, "neutral")
            emotion_label = {
                "neu": "neutral", "hap": "happy", "ang": "angry", "sad": "sad"
            }.get(raw_label, raw_label)
        else:
            # Fallback: simple audio feature-based emotion detection
            # Analyze audio characteristics
            rms_energy = np.sqrt(np.mean(audio**2))
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=audio)[0])
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0])
            # Simple heuristic-based emotion classification
            if rms_energy > 0.02 and zero_crossing_rate > 0.1:
                emotion_label = "excited"
                confidence = 0.75
            elif rms_energy < 0.005:
                emotion_label = "calm"
                confidence = 0.70
            elif spectral_centroid > 2000:
                emotion_label = "happy"
                confidence = 0.65
            else:
                emotion_label = "neutral"
                confidence = 0.60
        return emotion_label, confidence
    except Exception:
        return "neutral", 0.50  # Default fallback
def generate_personality(transcription, emotion, confidence):
    """Generate personality description using FLAN-T5"""
    try:
        # Create a comprehensive prompt for personality analysis
        prompt = f"""Analyze this person's personality based on their speech:
Speech content: "{transcription}"
Detected emotion: {emotion} (confidence: {confidence:.2f})
Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""
        # Tokenize and generate
        inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():
            outputs = personality_model.generate(
                inputs,
                max_length=200,
                min_length=50,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=personality_tokenizer.eos_token_id
            )
        personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return personality_description
    except Exception as e:
        return f"Personality generation error: {str(e)}"
def create_confidence_bar(emotion, confidence):
    """Create a visual representation of emotion confidence"""
    bar_length = int(confidence * 20)  # Scale to 20 characters
    bar = "█" * bar_length + "░" * (20 - bar_length)
    return f"{emotion.upper()} {bar} {confidence:.1%}"
def analyze_voice(audio_file):
    """Main function that orchestrates the entire analysis pipeline"""
    if audio_file is None:
        return "Please upload or record an audio file.", "", "", ""
    try:
        # Step 1: Transcribe speech
        transcription = transcribe_audio(audio_file)
        # Step 2: Detect emotion
        emotion, confidence = detect_emotion(audio_file)
        # Step 3: Generate personality description
        personality = generate_personality(transcription, emotion, confidence)
        # Create formatted output
        confidence_display = create_confidence_bar(emotion, confidence)
        # Format results
        results_summary = f"""
🎯 **VOICE ANALYSIS COMPLETE**

**What they said:** {transcription}

**How they felt:** {confidence_display}

**Who they might be:** {personality}
"""
        return transcription, confidence_display, personality, results_summary
    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        return error_msg, "", "", error_msg
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Voice2Persona AI",
        css="""
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 0.5em;
        }
        .description {
            text-align: center;
            font-size: 1.1em;
            color: #666;
            margin-bottom: 2em;
        }
        .result-box {
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
        }
        """
    ) as interface:
        gr.HTML("""
        <div class="main-header">🎙️ Voice2Persona AI</div>
        <div class="description">
            Discover your voice's hidden story! Upload or record audio to uncover what you said,
            how you felt, and insights into your personality.
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎡 Audio Input")
                audio_input = gr.Audio(
                    label="Record or Upload Audio",
                    type="filepath",
                    sources=["microphone", "upload"]
                )
                analyze_btn = gr.Button(
                    "🔍 Analyze Voice",
                    variant="primary",
                    size="lg"
                )
                gr.Markdown("""
                **Tips for best results:**
                - Speak clearly for 3-10 seconds
                - Use a quiet environment
                - Express yourself naturally
                """)
            with gr.Column(scale=2):
                gr.Markdown("### 📊 Analysis Results")
                with gr.Tab("📝 Complete Analysis"):
                    results_display = gr.Markdown(
                        label="Full Analysis",
                        value="Upload audio to see your voice analysis here..."
                    )
                with gr.Tab("🔍 Detailed Breakdown"):
                    transcription_output = gr.Textbox(
                        label="💬 Speech Content (What you said)",
                        placeholder="Transcription will appear here...",
                        lines=3
                    )
                    emotion_output = gr.Textbox(
                        label="😊 Emotional State (How you felt)",
                        placeholder="Emotion analysis will appear here...",
                        lines=2
                    )
                    personality_output = gr.Textbox(
                        label="🧠 Personality Insights (Who you might be)",
                        placeholder="Personality analysis will appear here...",
                        lines=5
                    )
        # Connect the analyze button to the main function
        analyze_btn.click(
            fn=analyze_voice,
            inputs=[audio_input],
            outputs=[transcription_output, emotion_output, personality_output, results_display]
        )
        gr.Markdown("""
        ---
        ### About Voice2Persona AI
        This AI system combines three powerful models:
        - **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription
        - **Emotion Detection**: Specialized model for voice emotion recognition
        - **Personality Analysis**: Google's FLAN-T5 for generating personality insights

        *Built with ❤️ using Hugging Face Transformers and Gradio*
        """)
    return interface
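# Note: on Hugging Face Spaces, Gradio ignores share=True (Spaces already
# expose a public URL) and logs a warning; server_name="0.0.0.0" and port 7860
# below match the Spaces defaults, so the same launch call also works locally.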
# Launch the app
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )