import gradio as gr
import torch
import numpy as np
import librosa
import warnings
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Tokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForSequenceClassification,
    AutoFeatureExtractor,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

warnings.filterwarnings("ignore")

# Initialize models and tokenizers
print("Loading models...")

# Speech-to-Text Model
stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Emotion Recognition Model - using a more reliable model
try:
    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
except Exception:
    # Fallback to a simpler approach using audio features
    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    emotion_model = None
    print("Using fallback emotion detection method")

# Personality Generation Model
personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

print("Models loaded successfully!")

# Emotion labels mapping (fallback table; the emotion model's own id2label
# mapping is preferred when the model is available)
EMOTION_LABELS = {
    0: "angry",
    1: "happy",
    2: "sad",
    3: "neutral",
    4: "excited",
    5: "calm",
    6: "surprised",
}


def preprocess_audio(audio_path, target_sr=16000):
    """Load and preprocess audio for model input."""
    try:
        # Load audio file and resample to the target sampling rate
        audio, sr = librosa.load(audio_path, sr=target_sr)

        # Ensure audio is not too short
        if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
            audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode="constant")

        return audio, sr
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None


def transcribe_audio(audio_path):
    """Convert speech to text using Wav2Vec2."""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file"

        # Extract features
        inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

        # Get model predictions
        with torch.no_grad():
            logits = stt_model(inputs.input_values).logits

        # Decode predictions
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_tokenizer.batch_decode(predicted_ids)[0]

        return transcription.strip()
    except Exception as e:
        return f"Transcription error: {str(e)}"


def detect_emotion(audio_path):
    """Detect emotion from audio using a classifier or audio-feature analysis."""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file", 0.0

        if emotion_model is not None:
            # Use the wav2vec2 emotion model if available
            inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

            with torch.no_grad():
                outputs = emotion_model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            emotion_id = torch.argmax(predictions, dim=-1).item()
            confidence = torch.max(predictions).item()
            # Prefer the model's own label mapping; fall back to the static table
            emotion_label = emotion_model.config.id2label.get(
                emotion_id, EMOTION_LABELS.get(emotion_id, "neutral")
            )
        else:
            # Fallback: Simple audio feature-based emotion detection
            # Analyze audio characteristics
            rms_energy = np.sqrt(np.mean(audio ** 2))
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio)[0])
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0])

            # Simple heuristic-based emotion classification
            if rms_energy > 0.02 and zero_crossing_rate > 0.1:
                emotion_label = "excited"
                confidence = 0.75
            elif rms_energy < 0.005:
                emotion_label = "calm"
                confidence = 0.70
            elif spectral_centroid > 2000:
                emotion_label = "happy"
                confidence = 0.65
            else:
                emotion_label = "neutral"
                confidence = 0.60

        return emotion_label, confidence
    except Exception:
        return "neutral", 0.50  # Default fallback


def generate_personality(transcription, emotion, confidence):
    """Generate a personality description using FLAN-T5."""
    try:
        # Create a comprehensive prompt for personality analysis
        prompt = f"""Analyze this person's personality based on their speech:

Speech content: "{transcription}"
Detected emotion: {emotion} (confidence: {confidence:.2f})

Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""

        # Tokenize and generate
        inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

        with torch.no_grad():
            outputs = personality_model.generate(
                inputs,
                max_length=200,
                min_length=50,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=personality_tokenizer.eos_token_id,
            )

        personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return personality_description
    except Exception as e:
        return f"Personality generation error: {str(e)}"


def create_confidence_bar(emotion, confidence):
    """Create a visual representation of emotion confidence."""
    bar_length = int(confidence * 20)  # Scale to 20 characters
    bar = "█" * bar_length + "░" * (20 - bar_length)
    return f"{emotion.upper()} {bar} {confidence:.1%}"


def analyze_voice(audio_file):
    """Main function that orchestrates the entire analysis pipeline."""
    if audio_file is None:
        return "Please upload or record an audio file.", "", "", ""

    try:
        # Step 1: Transcribe speech
        transcription = transcribe_audio(audio_file)

        # Step 2: Detect emotion
        emotion, confidence = detect_emotion(audio_file)

        # Step 3: Generate personality description
        personality = generate_personality(transcription, emotion, confidence)

        # Create formatted output
        confidence_display = create_confidence_bar(emotion, confidence)

        # Format results
        results_summary = f"""
🎯 **VOICE ANALYSIS COMPLETE**

**What they said:** {transcription}

**How they felt:** {confidence_display}

**Who they might be:** {personality}
"""

        return transcription, confidence_display, personality, results_summary
    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        return error_msg, "", "", error_msg


# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Voice2Persona AI",
        css="""
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 0.5em;
        }
        .description {
            text-align: center;
            font-size: 1.1em;
            color: #666;
            margin-bottom: 2em;
        }
        .result-box {
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
        }
        """,
    ) as interface:
        gr.HTML("""
            <div class="main-header">🎙️ Voice2Persona AI</div>
            <div class="description">
                Discover your voice's hidden story! Upload or record audio to uncover
                what you said, how you felt, and insights into your personality.
            </div>
""") with gr.Row(): with gr.Column(scale=1): gr.Markdown("### 🎵 Audio Input") audio_input = gr.Audio( label="Record or Upload Audio", type="filepath", sources=["microphone", "upload"] ) analyze_btn = gr.Button( "🔍 Analyze Voice", variant="primary", size="lg" ) gr.Markdown(""" **Tips for best results:** - Speak clearly for 3-10 seconds - Use a quiet environment - Express yourself naturally """) with gr.Column(scale=2): gr.Markdown("### 📊 Analysis Results") with gr.Tab("📝 Complete Analysis"): results_display = gr.Markdown( label="Full Analysis", value="Upload audio to see your voice analysis here..." ) with gr.Tab("🔍 Detailed Breakdown"): transcription_output = gr.Textbox( label="💬 Speech Content (What you said)", placeholder="Transcription will appear here...", lines=3 ) emotion_output = gr.Textbox( label="😊 Emotional State (How you felt)", placeholder="Emotion analysis will appear here...", lines=2 ) personality_output = gr.Textbox( label="🧠 Personality Insights (Who you might be)", placeholder="Personality analysis will appear here...", lines=5 ) # Connect the analyze button to the main function analyze_btn.click( fn=analyze_voice, inputs=[audio_input], outputs=[transcription_output, emotion_output, personality_output, results_display] ) gr.Markdown(""" --- ### About Voice2Persona AI This AI system combines three powerful models: - **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription - **Emotion Detection**: Specialized model for voice emotion recognition - **Personality Analysis**: Google's FLAN-T5 for generating personality insights *Built with ❤️ using Hugging Face Transformers and Gradio* """) return interface # Launch the app if __name__ == "__main__": app = create_interface() app.launch( share=True, show_error=True, server_name="0.0.0.0", server_port=7860 )