import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import (
    Wav2Vec2ForCTC, 
    Wav2Vec2Tokenizer,
    Wav2Vec2FeatureExtractor,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Wav2Vec2ForSequenceClassification
)
import librosa
import warnings
warnings.filterwarnings("ignore")

# Initialize models and tokenizers
print("Loading models...")

# Speech-to-Text Model
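# facebook/wav2vec2-base-960h is trained on 16 kHz speech; preprocess_audio() below resamples inputs to match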
stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Emotion Recognition Model - wav2vec2 fine-tuned for emotion recognition (SUPERB ER), with a heuristic fallback
try:
    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
except Exception as e:
    # Fallback to a simpler approach using audio features
    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    emotion_model = None
    print(f"Could not load the emotion model ({e}); using fallback emotion detection method")

# Personality Generation Model
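# flan-t5-base is instruction-tuned, so the plain-text prompt in generate_personality() works without any fine-tuning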
personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

print("Models loaded successfully!")

# Emotion labels mapping (fallback set used by the heuristic path and when the model provides no labels)
EMOTION_LABELS = {
    0: "angry",
    1: "happy", 
    2: "sad",
    3: "neutral",
    4: "excited",
    5: "calm",
    6: "surprised"
}

def preprocess_audio(audio_path, target_sr=16000):
    """Load and preprocess audio for model input"""
    try:
        # Load audio file
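        # librosa.load resamples to target_sr and returns mono float32 samples in [-1, 1]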
        audio, sr = librosa.load(audio_path, sr=target_sr)
        
        # Ensure audio is not too short
        if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
            audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
        
        return audio, sr
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None

def transcribe_audio(audio_path):
    """Convert speech to text using Wav2Vec2"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file"
        
        # Extract features
        inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
        
        # Get model predictions
        with torch.no_grad():
            logits = stt_model(inputs.input_values).logits
        
        # Decode predictions
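        # Greedy CTC decoding: argmax picks the most likely token per frame,
        # and batch_decode collapses repeats and removes blank tokens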
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_tokenizer.batch_decode(predicted_ids)[0]
        
        return transcription.strip()
    except Exception as e:
        return f"Transcription error: {str(e)}"

def detect_emotion(audio_path):
    """Detect emotion from audio using audio features analysis"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file", 0.0
        
        if emotion_model is not None:
            # Use the wav2vec2 emotion model if available
            inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
            
            with torch.no_grad():
                outputs = emotion_model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            
            emotion_id = torch.argmax(predictions, dim=-1).item()
            confidence = torch.max(predictions).item()
            # Prefer the model's own label names when its config provides them
            # (superb/wav2vec2-base-superb-er predicts four IEMOCAP classes);
            # otherwise fall back to the local EMOTION_LABELS mapping
            id2label = getattr(emotion_model.config, "id2label", None) or EMOTION_LABELS
            emotion_label = id2label.get(emotion_id, "neutral")
        else:
            # Fallback: Simple audio feature-based emotion detection
            # Analyze audio characteristics
            rms_energy = np.sqrt(np.mean(audio**2))  # overall loudness
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=audio)[0])  # noisiness / voicing
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0])  # spectral "brightness"
            
            # Simple heuristic-based emotion classification
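            # NOTE: these thresholds are rough hand-picked values, not calibrated against
            # any dataset, so treat the fallback labels as a best-effort guess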
            if rms_energy > 0.02 and zero_crossing_rate > 0.1:
                emotion_label = "excited"
                confidence = 0.75
            elif rms_energy < 0.005:
                emotion_label = "calm"
                confidence = 0.70
            elif spectral_centroid > 2000:
                emotion_label = "happy"
                confidence = 0.65
            else:
                emotion_label = "neutral"
                confidence = 0.60
        
        return emotion_label, confidence
    except Exception as e:
        print(f"Emotion detection error: {e}")
        return "neutral", 0.50  # Default fallback

def generate_personality(transcription, emotion, confidence):
    """Generate personality description using FLAN-T5"""
    try:
        # Create a comprehensive prompt for personality analysis
        prompt = f"""Analyze this person's personality based on their speech:

Speech content: "{transcription}"
Detected emotion: {emotion} (confidence: {confidence:.2f})

Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""

        # Tokenize and generate
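        # Sampling (do_sample with temperature/top_p nucleus sampling) keeps the generated
        # profile varied; lower the temperature for more deterministic output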
        inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
        
        with torch.no_grad():
            outputs = personality_model.generate(
                inputs, 
                max_length=200,
                min_length=50,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=personality_tokenizer.pad_token_id
            )
        
        personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return personality_description
    except Exception as e:
        return f"Personality generation error: {str(e)}"

def create_confidence_bar(emotion, confidence):
    """Create a visual representation of emotion confidence"""
    bar_length = int(confidence * 20)  # Scale to 20 characters
    bar = "β–ˆ" * bar_length + "β–‘" * (20 - bar_length)
    return f"{emotion.upper()} {bar} {confidence:.1%}"

def analyze_voice(audio_file):
    """Main function that orchestrates the entire analysis pipeline"""
    if audio_file is None:
        return "Please upload or record an audio file.", "", "", ""
    
    try:
        # Step 1: Transcribe speech
        transcription = transcribe_audio(audio_file)
        
        # Step 2: Detect emotion
        emotion, confidence = detect_emotion(audio_file)
        
        # Step 3: Generate personality description
        personality = generate_personality(transcription, emotion, confidence)
        
        # Create formatted output
        confidence_display = create_confidence_bar(emotion, confidence)
        
        # Format results
        results_summary = f"""
🎯 **VOICE ANALYSIS COMPLETE**

**What they said:** {transcription}

**How they felt:** {confidence_display}

**Who they might be:** {personality}
        """
        
        return transcription, confidence_display, personality, results_summary
        
    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        return error_msg, "", "", error_msg

# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Voice2Persona AI",
        css="""
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 0.5em;
        }
        .description {
            text-align: center;
            font-size: 1.1em;
            color: #666;
            margin-bottom: 2em;
        }
        .result-box {
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
        }
        """
    ) as interface:
        
        gr.HTML("""
        <div class="main-header">πŸŽ™οΈ Voice2Persona AI</div>
        <div class="description">
            Discover your voice's hidden story! Upload or record audio to uncover what you said, 
            how you felt, and insights into your personality.
        </div>
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎡 Audio Input")
                audio_input = gr.Audio(
                    label="Record or Upload Audio",
                    type="filepath",
                    sources=["microphone", "upload"]
                )
                
                analyze_btn = gr.Button(
                    "πŸ” Analyze Voice",
                    variant="primary",
                    size="lg"
                )
                
                gr.Markdown("""
                **Tips for best results:**
                - Speak clearly for 3-10 seconds
                - Use a quiet environment
                - Express yourself naturally
                """)
            
            with gr.Column(scale=2):
                gr.Markdown("### πŸ“Š Analysis Results")
                
                with gr.Tab("πŸ“ Complete Analysis"):
                    results_display = gr.Markdown(
                        label="Full Analysis",
                        value="Upload audio to see your voice analysis here..."
                    )
                
                with gr.Tab("πŸ” Detailed Breakdown"):
                    transcription_output = gr.Textbox(
                        label="πŸ’¬ Speech Content (What you said)",
                        placeholder="Transcription will appear here...",
                        lines=3
                    )
                    
                    emotion_output = gr.Textbox(
                        label="😊 Emotional State (How you felt)",
                        placeholder="Emotion analysis will appear here...",
                        lines=2
                    )
                    
                    personality_output = gr.Textbox(
                        label="🧠 Personality Insights (Who you might be)",
                        placeholder="Personality analysis will appear here...",
                        lines=5
                    )
        
        # Connect the analyze button to the main function
        analyze_btn.click(
            fn=analyze_voice,
            inputs=[audio_input],
            outputs=[transcription_output, emotion_output, personality_output, results_display]
        )
        
        gr.Markdown("""
        ---
        ### About Voice2Persona AI
        
        This AI system combines three powerful models:
        - **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription
        - **Emotion Detection**: Specialized model for voice emotion recognition  
        - **Personality Analysis**: Google's FLAN-T5 for generating personality insights
        
        *Built with ❀️ using Hugging Face Transformers and Gradio*
        """)
    
    return interface

# Launch the app
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )