# Voice2PersonaAI / app.py
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
import gradio as gr
import torch
import torchaudio
import numpy as np
# Load models: Wav2Vec2 for speech-to-text, BART-MNLI for zero-shot persona classification
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cpu")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
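# Candidate personas; the zero-shot classifier scores the transcript against each
# label without any task-specific fine-tuning.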
labels = [
    "secret agent", "lecturer", "motivational speaker", "comedian",
    "philosopher", "calm leader", "emotional artist", "enthusiastic teacher",
    "strict officer", "mysterious thinker"
]
# Transcription + personality analysis
def transcribe_and_analyze(audio):
    if audio is None:
        return "No audio provided", "No analysis"
    # gr.Audio(type="numpy") returns a (sample_rate, samples) tuple
    sample_rate, samples = audio
    # Gradio delivers integer PCM; normalize to float32 in [-1, 1]
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    # Downmix stereo to mono
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    # Wav2Vec2 was trained on 16 kHz audio; resample if the input rate differs
    if sample_rate != 16000:
        samples = torchaudio.functional.resample(
            torch.from_numpy(samples), sample_rate, 16000
        ).numpy()
    inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()
    if not transcription.strip():
        return transcription, "No speech detected"
    result = classifier(transcription, candidate_labels=labels)
    top_label = result["labels"][0]
    score = result["scores"][0]
    return transcription, f"You sound like a **{top_label}** (confidence: {score:.2f})"
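# Quick sanity check (hypothetical input): a second of silence should decode to an
# empty transcription, so the function returns "No speech detected".
# transcribe_and_analyze((16000, np.zeros(16000, dtype=np.int16)))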
# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# Voice2PersonaAI (Free AI-Based Personality Engine)")
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
        transcribe_button = gr.Button("Transcribe & Analyze")
    with gr.Row():
        transcribed_text = gr.Textbox(label="Transcription")
        persona_text = gr.Textbox(label="AI-Predicted Persona")
    transcribe_button.click(transcribe_and_analyze, inputs=audio_input, outputs=[transcribed_text, persona_text])
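# launch() serves locally by default; pass share=True for a temporary public link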
app.launch()