# Voice2PersonaAI — Hugging Face Space (page status banner: "Running")
import gradio as gr
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
# Load models once at import time (CPU inference).
# wav2vec2-base-960h is an English ASR model; it expects 16 kHz mono float audio.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cpu")
model.eval()  # inference only — disable dropout

# Zero-shot classifier that maps a transcript onto one of the persona labels.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate persona labels for the zero-shot classification step.
labels = [
    "secret agent", "lecturer", "motivational speaker", "comedian",
    "philosopher", "calm leader", "emotional artist", "enthusiastic teacher",
    "strict officer", "mysterious thinker",
]
# Transcription + personality analysis
def transcribe_and_analyze(audio):
    """Transcribe spoken audio and guess a persona from the transcript.

    Args:
        audio: Gradio numpy audio input. ``gr.Audio(type="numpy")`` delivers a
            ``(sample_rate, data)`` tuple; a bare ndarray is also accepted for
            robustness. ``None`` means no recording was provided.

    Returns:
        A ``(transcription, persona_message)`` pair of strings.
    """
    if audio is None:
        return "No audio provided", "No analysis"

    # gr.Audio(type="numpy") yields (sample_rate, ndarray) — unpack it.
    sample_rate = 16000
    if isinstance(audio, tuple):
        sample_rate, audio = audio
    audio = np.asarray(audio)

    # Down-mix stereo to mono.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Wav2Vec2 expects float waveforms; microphone input is usually int16 PCM.
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        audio = audio.astype(np.float32)

    # Naive linear resample to the model's required 16 kHz when needed.
    if sample_rate != 16000 and audio.size > 0:
        target_len = int(round(audio.size * 16000 / sample_rate))
        audio = np.interp(
            np.linspace(0.0, audio.size - 1, target_len),
            np.arange(audio.size),
            audio,
        )

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()

    # Don't classify an empty transcript — the result would be meaningless.
    if not transcription.strip():
        return transcription, "No analysis"

    result = classifier(transcription, candidate_labels=labels)
    top_label = result["labels"][0]
    score = result["scores"][0]
    return transcription, f"You sound like a **{top_label}** (confidence: {score:.2f})"
# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# Voice2PersonaAI (Free AI-Based Personality Engine)")
    with gr.Row():
        # NOTE(review): original label text was mojibake ("π€") — presumed to be
        # the microphone emoji; confirm against the deployed Space.
        audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
        transcribe_button = gr.Button("Transcribe & Analyze")
    with gr.Row():
        transcribed_text = gr.Textbox(label="Transcription")
        persona_text = gr.Textbox(label="AI-Predicted Persona")
    # Wire the button to the analysis function: one audio input, two text outputs.
    transcribe_button.click(
        transcribe_and_analyze,
        inputs=audio_input,
        outputs=[transcribed_text, persona_text],
    )

if __name__ == "__main__":
    app.launch()