Nick021402 committed on
Commit
4c08572
·
verified ·
1 Parent(s): 72324ef

Update app.py

Files changed (1)
  1. app.py +294 -67
app.py CHANGED
@@ -1,70 +1,297 @@
  import gradio as gr
  import torch
- import numpy as np
  import torchaudio
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- from transformers import pipeline
-
- # Load models once at startup
- asr_model_name = "facebook/wav2vec2-base-960h"
- emotion_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
- gen_model_name = "google/flan-t5-base"
-
- # Load ASR
- asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
- asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
-
- # Load emotion detection
- emotion_classifier = pipeline("audio-classification", model=emotion_model_name)
-
- # Load personality generation pipeline
- gen_pipeline = pipeline("text2text-generation", model=gen_model_name)
-
- # Transcription Function
- def transcribe(audio):
-     if isinstance(audio, tuple):  # When type="numpy"
-         sr, audio = 16000, audio[0]  # Handle stereo or mono
-     input_values = asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_values
-     with torch.no_grad():
-         logits = asr_model(input_values).logits
-     predicted_ids = torch.argmax(logits, dim=-1)
-     transcription = asr_processor.decode(predicted_ids[0])
-     return transcription.lower()
-
- # Personality Generation
- def generate_personality(text):
-     prompt = f"Describe the speaker's personality based on this sentence: \"{text}\""
-     response = gen_pipeline(prompt, max_new_tokens=50)[0]["generated_text"]
-     return response.strip()
-
- # Emotion Detection
- def detect_emotion(audio):
-     if isinstance(audio, tuple):
-         audio = audio[0]  # Extract numpy array from (array, sample_rate)
-     results = emotion_classifier(audio, top_k=1)
-     return results[0]["label"]
-
- # Main Pipeline
- def analyze(audio):
-     transcription = transcribe(audio)
-     emotion = detect_emotion(audio)
-     personality = generate_personality(transcription)
-     return transcription, emotion, personality
-
- # Gradio UI
- with gr.Blocks() as app:
-     gr.Markdown("# Voice2Persona AI\nUpload or record your voice to reveal your mood and hidden persona!")
-
-     with gr.Row():
-         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
-         submit_btn = gr.Button("Analyze")
-
-     with gr.Column():
-         transcript_output = gr.Textbox(label="Transcription")
-         emotion_output = gr.Textbox(label="Detected Emotion")
-         personality_output = gr.Textbox(label="AI-Generated Personality")
-
-     submit_btn.click(fn=analyze, inputs=audio_input,
-                      outputs=[transcript_output, emotion_output, personality_output])
-
- app.launch()
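
A note on the removed transcribe(): it treated the Gradio type="numpy" payload as (data, sample_rate), but Gradio supplies (sample_rate, data), so sr, audio = 16000, audio[0] handed the integer sample rate to the model as audio and hardcoded 16 kHz regardless of the actual recording rate; the removed detect_emotion() made the same inverted assumption. The rewrite below sidesteps this by switching the Audio component to type="filepath" and loading from disk. For reference, a minimal sketch of correct in-memory handling; the helper name to_model_input is illustrative, not part of this commit:

import numpy as np
import torch
import torchaudio

def to_model_input(audio, target_sr=16000):
    # Illustrative helper, not part of this commit.
    sr, data = audio                           # Gradio numpy audio is (sample_rate, np.ndarray)
    if data.dtype == np.int16:                 # microphone capture arrives as 16-bit PCM
        data = data.astype(np.float32) / 32768.0
    else:
        data = data.astype(np.float32)
    if data.ndim == 2:                         # stereo -> mono by averaging channels
        data = data.mean(axis=1)
    if sr != target_sr:                        # resample to the model's expected rate
        data = torchaudio.functional.resample(torch.from_numpy(data), sr, target_sr).numpy()
    return data
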
  import gradio as gr
  import torch
  import torchaudio
+ import numpy as np
+ from transformers import (
+     Wav2Vec2ForCTC,
+     Wav2Vec2Tokenizer,
+     Wav2Vec2FeatureExtractor,
+     AutoModelForAudioClassification,
+     AutoFeatureExtractor,
+     T5ForConditionalGeneration,
+     T5Tokenizer
+ )
+ import librosa
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ # Initialize models and tokenizers
+ print("Loading models...")
+
+ # Speech-to-Text Model
+ stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+ stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+ stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+
+ # Emotion Recognition Model
+ emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
+ emotion_model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
+
+ # Personality Generation Model
+ personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
+ personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
+
+ print("Models loaded successfully!")
+
+ # Emotion labels mapping
+ EMOTION_LABELS = {
+     0: "angry",
+     1: "disgust",
+     2: "fear",
+     3: "happy",
+     4: "neutral",
+     5: "sad",
+     6: "surprise"
+ }
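
A caveat on the EMOTION_LABELS table added here: it hardcodes an index order that the checkpoint is not guaranteed to share. Transformers classification checkpoints carry their own mapping in the model config, so deriving the table at load time keeps it in sync with the classification head. A one-line sketch, assuming the standard id2label layout:

# Sketch, not part of this commit: read the labels from the checkpoint config.
EMOTION_LABELS = {int(i): label for i, label in emotion_model.config.id2label.items()}
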
+
+ def preprocess_audio(audio_path, target_sr=16000):
+     """Load and preprocess audio for model input"""
+     try:
+         # Load audio file
+         audio, sr = librosa.load(audio_path, sr=target_sr)
+
+         # Ensure audio is not too short
+         if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
+             audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
+
+         return audio, sr
+     except Exception as e:
+         print(f"Error preprocessing audio: {e}")
+         return None, None
+
+ def transcribe_audio(audio_path):
+     """Convert speech to text using Wav2Vec2"""
+     try:
+         audio, sr = preprocess_audio(audio_path)
+         if audio is None:
+             return "Error: Could not process audio file"
+
+         # Extract features
+         inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+
+         # Get model predictions
+         with torch.no_grad():
+             logits = stt_model(inputs.input_values).logits
+
+         # Decode predictions
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = stt_tokenizer.batch_decode(predicted_ids)[0]
+
+         return transcription.strip()
+     except Exception as e:
+         return f"Transcription error: {str(e)}"
+
+ def detect_emotion(audio_path):
+     """Detect emotion from audio using specialized model"""
+     try:
+         audio, sr = preprocess_audio(audio_path)
+         if audio is None:
+             return "Error: Could not process audio file", 0.0
+
+         # Extract features for emotion model
+         inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+
+         # Get emotion predictions
+         with torch.no_grad():
+             outputs = emotion_model(**inputs)
+             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+
+         # Get the most likely emotion
+         emotion_id = torch.argmax(predictions, dim=-1).item()
+         confidence = torch.max(predictions).item()
+
+         emotion_label = EMOTION_LABELS.get(emotion_id, "unknown")
+
+         return emotion_label, confidence
+     except Exception as e:
+         return f"Emotion detection error: {str(e)}", 0.0
+
+ def generate_personality(transcription, emotion, confidence):
+     """Generate personality description using FLAN-T5"""
+     try:
+         # Create a comprehensive prompt for personality analysis
+         prompt = f"""Analyze this person's personality based on their speech:
+
+ Speech content: "{transcription}"
+ Detected emotion: {emotion} (confidence: {confidence:.2f})
+
+ Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""
+
+         # Tokenize and generate
+         inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
+
+         with torch.no_grad():
+             outputs = personality_model.generate(
+                 inputs,
+                 max_length=200,
+                 min_length=50,
+                 temperature=0.7,
+                 do_sample=True,
+                 top_p=0.9,
+                 pad_token_id=personality_tokenizer.eos_token_id
+             )
+
+         personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         return personality_description
+     except Exception as e:
+         return f"Personality generation error: {str(e)}"
+
+ def create_confidence_bar(emotion, confidence):
+     """Create a visual representation of emotion confidence"""
+     bar_length = int(confidence * 20)  # Scale to 20 characters
+     bar = "█" * bar_length + "░" * (20 - bar_length)
+     return f"{emotion.upper()} {bar} {confidence:.1%}"
+
+ def analyze_voice(audio_file):
+     """Main function that orchestrates the entire analysis pipeline"""
+     if audio_file is None:
+         return "Please upload or record an audio file.", "", "", ""
+
+     try:
+         # Step 1: Transcribe speech
+         transcription = transcribe_audio(audio_file)
+
+         # Step 2: Detect emotion
+         emotion, confidence = detect_emotion(audio_file)
+
+         # Step 3: Generate personality description
+         personality = generate_personality(transcription, emotion, confidence)
+
+         # Create formatted output
+         confidence_display = create_confidence_bar(emotion, confidence)
+
+         # Format results
+         results_summary = f"""
+ 🎯 **VOICE ANALYSIS COMPLETE**
+
+ **What they said:** {transcription}
+
+ **How they felt:** {confidence_display}
+
+ **Who they might be:** {personality}
+ """
+
+         return transcription, confidence_display, personality, results_summary
+
+     except Exception as e:
+         error_msg = f"Analysis failed: {str(e)}"
+         return error_msg, "", "", error_msg
+
+ # Create the Gradio interface
+ def create_interface():
+     with gr.Blocks(
+         theme=gr.themes.Soft(),
+         title="Voice2Persona AI",
+         css="""
+         .main-header {
+             text-align: center;
+             background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+             -webkit-background-clip: text;
+             -webkit-text-fill-color: transparent;
+             font-size: 2.5em;
+             font-weight: bold;
+             margin-bottom: 0.5em;
+         }
+         .description {
+             text-align: center;
+             font-size: 1.1em;
+             color: #666;
+             margin-bottom: 2em;
+         }
+         .result-box {
+             border-radius: 10px;
+             padding: 20px;
+             margin: 10px 0;
+         }
+         """
+     ) as interface:
+
+         gr.HTML("""
+         <div class="main-header">🎙️ Voice2Persona AI</div>
+         <div class="description">
+             Discover your voice's hidden story! Upload or record audio to uncover what you said,
+             how you felt, and insights into your personality.
+         </div>
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 🎵 Audio Input")
+                 audio_input = gr.Audio(
+                     label="Record or Upload Audio",
+                     type="filepath",
+                     sources=["microphone", "upload"]
+                 )
+
+                 analyze_btn = gr.Button(
+                     "🔍 Analyze Voice",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 gr.Markdown("""
+                 **Tips for best results:**
+                 - Speak clearly for 3-10 seconds
+                 - Use a quiet environment
+                 - Express yourself naturally
+                 """)
+
+             with gr.Column(scale=2):
+                 gr.Markdown("### 📊 Analysis Results")
+
+                 with gr.Tab("📝 Complete Analysis"):
+                     results_display = gr.Markdown(
+                         label="Full Analysis",
+                         value="Upload audio to see your voice analysis here..."
+                     )
+
+                 with gr.Tab("🔍 Detailed Breakdown"):
+                     transcription_output = gr.Textbox(
+                         label="💬 Speech Content (What you said)",
+                         placeholder="Transcription will appear here...",
+                         lines=3
+                     )
+
+                     emotion_output = gr.Textbox(
+                         label="😊 Emotional State (How you felt)",
+                         placeholder="Emotion analysis will appear here...",
+                         lines=2
+                     )
+
+                     personality_output = gr.Textbox(
+                         label="🧠 Personality Insights (Who you might be)",
+                         placeholder="Personality analysis will appear here...",
+                         lines=5
+                     )
+
+         # Connect the analyze button to the main function
+         analyze_btn.click(
+             fn=analyze_voice,
+             inputs=[audio_input],
+             outputs=[transcription_output, emotion_output, personality_output, results_display]
+         )
+
+         gr.Markdown("""
+         ---
+         ### About Voice2Persona AI
+
+         This AI system combines three powerful models:
+         - **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription
+         - **Emotion Detection**: Specialized model for voice emotion recognition
+         - **Personality Analysis**: Google's FLAN-T5 for generating personality insights
+
+         *Built with ❤️ using Hugging Face Transformers and Gradio*
+         """)
+
+     return interface
+
+ # Launch the app
+ if __name__ == "__main__":
+     app = create_interface()
+     app.launch(
+         share=True,
+         show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )
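
Two closing observations on the launch block. On Hugging Face Spaces, share=True is typically unnecessary, since Spaces already serves the app at a public URL and recent Gradio versions warn that share links are not used there; server_name="0.0.0.0" and server_port=7860 match the Spaces defaults. The pipeline can also be exercised without the UI by calling analyze_voice directly; sample.wav below is a placeholder path, not a file shipped with this commit:

# Placeholder path; substitute any short speech recording.
transcript, emotion_bar, persona, summary = analyze_voice("sample.wav")
print(summary)
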