import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Tokenizer,
    Wav2Vec2FeatureExtractor,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Wav2Vec2ForSequenceClassification
)
import librosa
import warnings

warnings.filterwarnings("ignore")
# Initialize models and tokenizers
print("Loading models...")

# Speech-to-Text Model
stt_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
stt_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Emotion Recognition Model
try:
    emotion_feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
    emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
except Exception:
    # Fallback to a simpler approach using audio features
    emotion_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    emotion_model = None
    print("Using fallback emotion detection method")

# Personality Generation Model
personality_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
personality_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

print("Models loaded successfully!")
# Fallback emotion labels (used only if the loaded emotion model does not expose its own id2label mapping)
EMOTION_LABELS = {
    0: "angry",
    1: "happy",
    2: "sad",
    3: "neutral",
    4: "excited",
    5: "calm",
    6: "surprised"
}
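# Note: superb/wav2vec2-base-superb-er is trained on the 4-class IEMOCAP setup
# (neutral / happy / angry / sad), so when that checkpoint loads, the label comes
# from its config.id2label rather than from the broader list above.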
def preprocess_audio(audio_path, target_sr=16000):
    """Load and preprocess audio for model input"""
    try:
        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr)
        # Ensure audio is not too short
        if len(audio) < target_sr * 0.5:  # Less than 0.5 seconds
            audio = np.pad(audio, (0, int(target_sr * 0.5) - len(audio)), mode='constant')
        return audio, sr
    except Exception as e:
        print(f"Error preprocessing audio: {e}")
        return None, None
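# librosa.load defaults to mono float32 output and resamples to target_sr, so every
# downstream model sees a single-channel 16 kHz waveform regardless of the input file.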
def transcribe_audio(audio_path):
    """Convert speech to text using Wav2Vec2"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file"

        # Extract features
        inputs = stt_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

        # Get model predictions
        with torch.no_grad():
            logits = stt_model(inputs.input_values).logits

        # Decode predictions
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_tokenizer.batch_decode(predicted_ids)[0]

        return transcription.strip()
    except Exception as e:
        return f"Transcription error: {str(e)}"
def detect_emotion(audio_path):
    """Detect emotion from audio, via the emotion model or an audio-feature heuristic"""
    try:
        audio, sr = preprocess_audio(audio_path)
        if audio is None:
            return "Error: Could not process audio file", 0.0

        if emotion_model is not None:
            # Use the wav2vec2 emotion model if available
            inputs = emotion_feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
            with torch.no_grad():
                outputs = emotion_model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            emotion_id = torch.argmax(predictions, dim=-1).item()
            confidence = torch.max(predictions).item()
            # Prefer the model's own label mapping; fall back to EMOTION_LABELS
            id2label = getattr(emotion_model.config, "id2label", None) or EMOTION_LABELS
            emotion_label = id2label.get(emotion_id, "neutral")
        else:
            # Fallback: Simple audio feature-based emotion detection
            # Analyze audio characteristics
            rms_energy = np.sqrt(np.mean(audio ** 2))
            zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio)[0])
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0])

            # Simple heuristic-based emotion classification
            if rms_energy > 0.02 and zero_crossing_rate > 0.1:
                emotion_label = "excited"
                confidence = 0.75
            elif rms_energy < 0.005:
                emotion_label = "calm"
                confidence = 0.70
            elif spectral_centroid > 2000:
                emotion_label = "happy"
                confidence = 0.65
            else:
                emotion_label = "neutral"
                confidence = 0.60

        return emotion_label, confidence
    except Exception:
        return "neutral", 0.50  # Default fallback
def generate_personality(transcription, emotion, confidence):
    """Generate personality description using FLAN-T5"""
    try:
        # Create a comprehensive prompt for personality analysis
        prompt = f"""Analyze this person's personality based on their speech:
Speech content: "{transcription}"
Detected emotion: {emotion} (confidence: {confidence:.2f})
Based on the way they speak, their word choice, emotional tone, and overall communication style, provide a detailed personality analysis. Consider their potential traits, communication style, emotional intelligence, and social characteristics. Write this as a natural, engaging personality profile in 3-4 sentences."""

        # Tokenize and generate
        inputs = personality_tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

        with torch.no_grad():
            outputs = personality_model.generate(
                inputs,
                max_length=200,
                min_length=50,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=personality_tokenizer.pad_token_id
            )

        personality_description = personality_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return personality_description
    except Exception as e:
        return f"Personality generation error: {str(e)}"
def create_confidence_bar(emotion, confidence):
    """Create a visual representation of emotion confidence"""
    bar_length = int(confidence * 20)  # Scale to 20 characters
    bar = "█" * bar_length + "░" * (20 - bar_length)
    return f"{emotion.upper()} {bar} {confidence:.1%}"
def analyze_voice(audio_file):
    """Main function that orchestrates the entire analysis pipeline"""
    if audio_file is None:
        return "Please upload or record an audio file.", "", "", ""

    try:
        # Step 1: Transcribe speech
        transcription = transcribe_audio(audio_file)

        # Step 2: Detect emotion
        emotion, confidence = detect_emotion(audio_file)

        # Step 3: Generate personality description
        personality = generate_personality(transcription, emotion, confidence)

        # Create formatted output
        confidence_display = create_confidence_bar(emotion, confidence)

        # Format results
        results_summary = f"""
🎯 **VOICE ANALYSIS COMPLETE**

**What they said:** {transcription}

**How they felt:** {confidence_display}

**Who they might be:** {personality}
"""

        return transcription, confidence_display, personality, results_summary
    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        return error_msg, "", "", error_msg
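# The four return values map, in order, onto the Gradio outputs wired up in
# create_interface(): transcription, emotion bar, personality text, and the summary tab.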
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Voice2Persona AI",
        css="""
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            font-size: 2.5em;
            font-weight: bold;
            margin-bottom: 0.5em;
        }
        .description {
            text-align: center;
            font-size: 1.1em;
            color: #666;
            margin-bottom: 2em;
        }
        .result-box {
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
        }
        """
    ) as interface:

        gr.HTML("""
        <div class="main-header">🎙️ Voice2Persona AI</div>
        <div class="description">
            Discover your voice's hidden story! Upload or record audio to uncover what you said,
            how you felt, and insights into your personality.
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🎵 Audio Input")
                audio_input = gr.Audio(
                    label="Record or Upload Audio",
                    type="filepath",
                    sources=["microphone", "upload"]
                )

                analyze_btn = gr.Button(
                    "🔍 Analyze Voice",
                    variant="primary",
                    size="lg"
                )

                gr.Markdown("""
                **Tips for best results:**
                - Speak clearly for 3-10 seconds
                - Use a quiet environment
                - Express yourself naturally
                """)

            with gr.Column(scale=2):
                gr.Markdown("### 📊 Analysis Results")

                with gr.Tab("📋 Complete Analysis"):
                    results_display = gr.Markdown(
                        label="Full Analysis",
                        value="Upload audio to see your voice analysis here..."
                    )

                with gr.Tab("🔍 Detailed Breakdown"):
                    transcription_output = gr.Textbox(
                        label="💬 Speech Content (What you said)",
                        placeholder="Transcription will appear here...",
                        lines=3
                    )
                    emotion_output = gr.Textbox(
                        label="😊 Emotional State (How you felt)",
                        placeholder="Emotion analysis will appear here...",
                        lines=2
                    )
                    personality_output = gr.Textbox(
                        label="🧠 Personality Insights (Who you might be)",
                        placeholder="Personality analysis will appear here...",
                        lines=5
                    )

        # Connect the analyze button to the main function
        analyze_btn.click(
            fn=analyze_voice,
            inputs=[audio_input],
            outputs=[transcription_output, emotion_output, personality_output, results_display]
        )
gr.Markdown(""" | |
--- | |
### About Voice2Persona AI | |
This AI system combines three powerful models: | |
- **Speech-to-Text**: Facebook's Wav2Vec2 for accurate transcription | |
- **Emotion Detection**: Specialized model for voice emotion recognition | |
- **Personality Analysis**: Google's FLAN-T5 for generating personality insights | |
*Built with β€οΈ using Hugging Face Transformers and Gradio* | |
""") | |
return interface | |
# Launch the app
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )
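# Note: server_name="0.0.0.0" and port 7860 match the Hugging Face Spaces defaults;
# share=True only matters for local runs, where it requests a temporary public link,
# since a hosted Space is already served on its own public URL.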