import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Tokenizer,
    Wav2Vec2FeatureExtractor,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Wav2Vec2ForSequenceClassification
)
import librosa
import warnings

warnings.filterwarnings("ignore")
# Initialize models and tokenizers
print("Loading models...")

# Speech-to-Text Model
stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Emotion Recognition Model
try:
    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
except Exception:
    # Fallback to a simpler approach using audio features
    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    emotion_model = None
    print("Using fallback emotion detection method")

# Personality Generation Model
personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

print("Models loaded successfully!")
# Fallback emotion labels (used only if the loaded emotion model does not expose its own id2label mapping)
EMOTION_LABELS = {
    0: "angry",
    1: "happy",
    2: "sad",
    3: "neutral",
    4: "excited",
    5: "calm",
    6: "surprised"
}
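# Note: superb/wav2vec2-base-superb-er is trained on the 4-class IEMOCAP setup
# (neutral / happy / angry / sad), so when that checkpoint loads, the label comes
# from its config.id2label rather than from the broader list above.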
def preprocess_audio(audio_path, target_sr=16000):
    """Load and preprocess audio for model input"""
    try:
        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr)
        # Ensure audio is not too short
        if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
            audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
        return audio, sr
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None
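# librosa.load defaults to mono float32 output and resamples to target_sr, so every
# downstream model sees a single-channel 16 kHz waveform regardless of the input file.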
def transcribe_audio(audio_path):
    """Convert speech to text using Wav2Vec2"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file"

        # Extract features
        inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

        # Get model predictions
        with torch.no_grad():
            logits = stt_model(inputs.input_values).logits

        # Decode predictions
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_tokenizer.batch_decode(predicted_ids)[0]

        return transcription.strip()
    except Exception as e:
        return f"Transcription error: {str(e)}"
def detect_emotion(audio_path):
    """Detect emotion from audio, via the emotion model or an audio-feature heuristic"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file", 0.0

        if emotion_model is not None:
            # Use the wav2vec2 emotion model if available
            inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
            with torch.no_grad():
                outputs = emotion_model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            emotion_id = torch.argmax(predictions, dim=-1).item()
            confidence = torch.max(predictions).item()
            # Prefer the model's own label mapping; fall back to EMOTION_LABELS
            id2label = getattr(emotion_model.config, "id2label", None) or EMOTION_LABELS
            emotion_label = id2label.get(emotion_id, "neutral")
        else:
            # Fallback: Simple audio feature-based emotion detection
            # Analyze audio characteristics
            rms_energy = np.sqrt(np.mean(audio ** 2))
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio)[0])
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0])

            # Simple heuristic-based emotion classification
            if rms_energy > 0.02 and zero_crossing_rate > 0.1:
                emotion_label = "excited"
                confidence = 0.75
            elif rms_energy < 0.005:
                emotion_label = "calm"
                confidence = 0.70
            elif spectral_centroid > 2000:
                emotion_label = "happy"
                confidence = 0.65
            else:
                emotion_label = "neutral"
                confidence = 0.60

        return emotion_label, confidence
    except Exception:
        return "neutral", 0.50  # Default fallback
def generate_personality(transcription, emotion, confidence):
    """Generate personality description using FLAN-T5"""
    try:
        # Create a comprehensive prompt for personality analysis
        prompt = f"""Analyze this person's personality based on their speech:
Speech content: "{transcription}"
Detected emotion: {emotion} (confidence: {confidence:.2f})
Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""

        # Tokenize and generate
        inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

        with torch.no_grad():
            outputs = personality_model.generate(
                inputs,
                max_length=200,
                min_length=50,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=personality_tokenizer.pad_token_id
            )

        personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return personality_description
    except Exception as e:
        return f"Personality generation error: {str(e)}"
def create_confidence_bar(emotion, confidence):
    """Create a visual representation of emotion confidence"""
    bar_length = int(confidence * 20)  # Scale to 20 characters
    bar = "█" * bar_length + "░" * (20 - bar_length)
    return f"{emotion.upper()} {bar} {confidence:.1%}"
def analyze_voice(audio_file):
    """Main function that orchestrates the entire analysis pipeline"""
    if audio_file is None:
        return "Please upload or record an audio file.", "", "", ""

    try:
        # Step 1: Transcribe speech
        transcription = transcribe_audio(audio_file)

        # Step 2: Detect emotion
        emotion, confidence = detect_emotion(audio_file)

        # Step 3: Generate personality description
        personality = generate_personality(transcription, emotion, confidence)

        # Create formatted output
        confidence_display = create_confidence_bar(emotion, confidence)

        # Format results
        results_summary = f"""
🎯 **VOICE ANALYSIS COMPLETE**

**What they said:** {transcription}

**How they felt:** {confidence_display}

**Who they might be:** {personality}
"""

        return transcription, confidence_display, personality, results_summary
    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        return error_msg, "", "", error_msg
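# The four return values map, in order, onto the Gradio outputs wired up in
# create_interface(): transcription, emotion bar, personality text, and the summary tab.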
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Voice2Persona AI",
        css="""
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 0.5em;
        }
        .description {
            text-align: center;
            font-size: 1.1em;
            color: #666;
            margin-bottom: 2em;
        }
        .result-box {
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
        }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">🎙️ Voice2Persona AI</div>
        <div class="description">
            Discover your voice's hidden story! Upload or record audio to uncover what you said,
            how you felt, and insights into your personality.
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 Audio Input")
                audio_input = gr.Audio(
                    label="Record or Upload Audio",
                    type="filepath",
                    sources=["microphone", "upload"]
                )

                analyze_btn = gr.Button(
                    "🔍 Analyze Voice",
                    variant="primary",
                    size="lg"
                )

                gr.Markdown("""
                **Tips for best results:**
                - Speak clearly for 3-10 seconds
                - Use a quiet environment
                - Express yourself naturally
                """)

            with gr.Column(scale=2):
                gr.Markdown("### 📊 Analysis Results")

                with gr.Tab("📋 Complete Analysis"):
                    results_display = gr.Markdown(
                        label="Full Analysis",
                        value="Upload audio to see your voice analysis here..."
                    )

                with gr.Tab("🔍 Detailed Breakdown"):
                    transcription_output = gr.Textbox(
                        label="💬 Speech Content (What you said)",
                        placeholder="Transcription will appear here...",
                        lines=3
                    )
                    emotion_output = gr.Textbox(
                        label="😊 Emotional State (How you felt)",
                        placeholder="Emotion analysis will appear here...",
                        lines=2
                    )
                    personality_output = gr.Textbox(
                        label="🧠 Personality Insights (Who you might be)",
                        placeholder="Personality analysis will appear here...",
                        lines=5
                    )

        # Connect the analyze button to the main function
        analyze_btn.click(
            fn=analyze_voice,
            inputs=[audio_input],
            outputs=[transcription_output, emotion_output, personality_output, results_display]
        )
gr.Markdown(""" | |
--- | |
### About Voice2Persona AI | |
This AI system combines three powerful models: | |
- **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription | |
- **Emotion Detection**: Specialized model for voice emotion recognition | |
- **Personality Analysis**: Google's FLAN-T5 for generating personality insights | |
*Built with β€οΈ using Hugging Face Transformers and Gradio* | |
""") | |
return interface | |
# Launch the app
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )
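# Note: server_name="0.0.0.0" and port 7860 match the Hugging Face Spaces defaults;
# share=True only matters for local runs, where it requests a temporary public link,
# since a hosted Space is already served on its own public URL.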