Spaces:

Nick021402
/

Voice2PersonaAI

Sleeping

App Files Files Community

Nick021402 committed on May 21

Commit

1449709

verified ·

1 Parent(s): 6861cf1

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -28

app.py CHANGED Viewed

@@ -1,54 +1,87 @@
-from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
 import gradio as gr
 import torch
 import numpy as np
-# Load models
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cpu")
-classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
-labels = [
-    "secret agent", "lecturer", "motivational speaker", "comedian",
-    "philosopher", "calm leader", "emotional artist", "enthusiastic teacher",
-    "strict officer", "mysterious thinker"
 ]
-# Transcription + personality
 def transcribe_and_analyze(audio):
     if audio is None:
-        return "No audio provided", "No analysis"
-    # Mono audio
     if len(audio.shape) > 1:
-        audio = np.mean(audio, axis=1)
-    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
-    input_values = inputs.input_values
     with torch.no_grad():
-        logits = model(input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.decode(predicted_ids[0]).lower()
-    result = classifier(transcription, candidate_labels=labels)
-    top_label = result["labels"][0]
-    score = result["scores"][0]
-    return transcription, f"You sound like a **{top_label}** (confidence: {score:.2f})"
 # Gradio UI
-with gr.Blocks(theme=gr.themes.Soft()) as app:
-    gr.Markdown("# Voice2PersonaAI (Free AI-Based Personality Engine)")
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
-    transcribe_button = gr.Button("Transcribe & Analyze")
     with gr.Row():
-        transcribed_text = gr.Textbox(label="Transcription")
-        persona_text = gr.Textbox(label="AI-Predicted Persona")
-    transcribe_button.click(transcribe_and_analyze, inputs=audio_input, outputs=[transcribed_text, persona_text])
-app.launch()

 import gradio as gr
 import torch
 import numpy as np
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
+# Load the Wav2Vec2 CTC speech-to-text model and its matching processor (same checkpoint for both)
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+# Zero-shot classifier used to score a transcript against the persona labels defined below
+persona_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+# Emotion classifier; return_all_scores=True requests a score for every emotion label, not only the best one
+emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
+# Candidate persona labels handed to the zero-shot classifier
+persona_labels = [
+    "secret agent",
+    "radio host",
+    "teacher",
+    "comedian",
+    "motivational speaker",
+    "villain",
+    "storyteller",
+    "detective",
+    "rapper",
+    "romantic poet",
+    "angry customer"
 ]
+# Define the analysis function
 def transcribe_and_analyze(audio):
+    """Transcribe a recording and describe the speaker's persona and emotion.
+
+    Args:
+        audio: Value delivered by ``gr.Audio(type="numpy")``: ``None`` when
+            nothing was recorded, otherwise a ``(sample_rate, samples)``
+            tuple. A bare sample array is also accepted and is assumed to
+            be sampled at 16 kHz already.
+
+    Returns:
+        A ``(transcription, persona_message, emotion_message)`` tuple of
+        strings, in the order the UI outputs expect them.
+    """
     if audio is None:
+        return "No audio provided", "No persona detected", "No emotion detected"
+    # Gradio hands numpy audio over as (sample_rate, data) -- rate first.
+    if isinstance(audio, tuple):
+        sr, audio = audio
+    else:
+        sr = 16000  # bare array: assume it is already at the model rate
+    audio = np.asarray(audio)
+    if audio.size == 0:
+        return "No audio provided", "No persona detected", "No emotion detected"
+    # Integer PCM has to be scaled to floats in [-1, 1] before feature extraction.
+    if np.issubdtype(audio.dtype, np.integer):
+        full_scale = np.iinfo(audio.dtype).max
+        audio = audio.astype(np.float32) / full_scale
+    else:
+        audio = audio.astype(np.float32)
     if len(audio.shape) > 1:
+        audio = np.mean(audio, axis=1)  # convert to mono
+    # The processor only accepts the rate the checkpoint was trained on, so
+    # resample first (plain linear interpolation, no extra dependency).
+    target_sr = 16000
+    if sr != target_sr:
+        n_out = max(1, int(round(audio.size * target_sr / sr)))
+        audio = np.interp(
+            np.linspace(0.0, audio.size - 1, n_out),
+            np.arange(audio.size),
+            audio,
+        ).astype(np.float32)
+        sr = target_sr
+    # Transcribe
+    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
     with torch.no_grad():
+        logits = model(inputs.input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = processor.decode(predicted_ids[0]).lower()
+    if not transcription.strip():
+        # Classifying an empty transcript would only yield a meaningless label.
+        return "No speech detected", "No persona detected", "No emotion detected"
+    # Persona classification
+    persona_result = persona_classifier(transcription, candidate_labels=persona_labels)
+    persona = persona_result["labels"][0]
+    confidence = persona_result["scores"][0]
+    # Emotion classification: keep the highest-scoring label
+    emotion_result = emotion_classifier(transcription)
+    emotion_scores = emotion_result[0]
+    top_emotion = max(emotion_scores, key=lambda x: x['score'])
+    # Format results
+    persona_output = f"You sound like a **{persona}** (confidence: {confidence:.2f})"
+    emotion_output = f"Emotion detected: **{top_emotion['label']}** (score: {top_emotion['score']:.2f})"
+    return transcription, persona_output, emotion_output
 # Gradio UI: one audio input, an Analyze button, and three text boxes for the results
+with gr.Blocks() as demo:
+    gr.Markdown("## Voice2Persona AI (Free & Fully Local with Hugging Face)")
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
+    analyze_btn = gr.Button("Analyze")
     with gr.Row():
+        transcript_output = gr.Textbox(label="Transcription")
+    with gr.Row():
+        persona_output = gr.Textbox(label="Persona Detected")
+    with gr.Row():
+        emotion_output = gr.Textbox(label="Emotion Detected")
+    analyze_btn.click(transcribe_and_analyze, inputs=[audio_input], outputs=[transcript_output, persona_output, emotion_output])  # the three returned strings fill the outputs in this order
+demo.launch()