Spaces:

Nick021402
/

Voice2PersonaAI

Running

App Files Files Community

Nick021402 committed on May 21

Commit

72324ef

verified ·

1 Parent(s): 1449709

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -70

app.py CHANGED Viewed

@@ -1,87 +1,70 @@
 import gradio as gr
 import torch
 import numpy as np
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
-# Load the Wav2Vec2 CTC model and its processor, used for speech-to-text
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-# Load the zero-shot classifier that ranks the persona labels against the transcript
-persona_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
-# Load the text emotion classifier; return_all_scores=True asks for a score per emotion label
-emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
-# Candidate persona labels handed to the zero-shot classifier
-persona_labels = [
-    "secret agent",
-    "radio host",
-    "teacher",
-    "comedian",
-    "motivational speaker",
-    "villain",
-    "storyteller",
-    "detective",
-    "rapper",
-    "romantic poet",
-    "angry customer"
-]
-# Define the analysis function
-def transcribe_and_analyze(audio):
-    """Transcribe the audio, then classify persona and emotion from the transcript.
-
-    Returns a (transcription, persona message, emotion message) tuple of strings.
-    When `audio` is None, three placeholder strings are returned instead.
-    """
-    if audio is None:
-        return "No audio provided", "No persona detected", "No emotion detected"
-    # Handle uploaded vs mic
-    if isinstance(audio, tuple):
-        # NOTE(review): Gradio documents numpy audio as (sample_rate, data);
-        # this unpack order looks reversed -- confirm.
-        audio, sr = audio
-    else:
-        sr = 16000  # default
-    if len(audio.shape) > 1:
-        audio = np.mean(audio, axis=1)  # convert to mono
-    # Transcribe
-    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
     with torch.no_grad():
-        logits = model(inputs.input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0]).lower()
-    # Persona classification: take the top-ranked label and its score
-    persona_result = persona_classifier(transcription, candidate_labels=persona_labels)
-    persona = persona_result["labels"][0]
-    confidence = persona_result["scores"][0]
-    # Emotion classification: sort the per-label scores and keep the highest
-    emotion_result = emotion_classifier(transcription)
-    emotion_scores = emotion_result[0]
-    sorted_emotions = sorted(emotion_scores, key=lambda x: x['score'], reverse=True)
-    top_emotion = sorted_emotions[0]
-    # Format results
-    persona_output = f"You sound like a **{persona}** (confidence: {confidence:.2f})"
-    emotion_output = f"Emotion detected: **{top_emotion['label']}** (score: {top_emotion['score']:.2f})"
-    return transcription, persona_output, emotion_output
 # Gradio UI
-# Build the interface: audio input, an Analyze button, and one textbox per result.
-with gr.Blocks() as demo:
-    gr.Markdown("## Voice2Persona AI (Free & Fully Local with Hugging Face)")
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
-    analyze_btn = gr.Button("Analyze")
-    with gr.Row():
         transcript_output = gr.Textbox(label="Transcription")
-    with gr.Row():
-        persona_output = gr.Textbox(label="Persona Detected")
-    with gr.Row():
-        emotion_output = gr.Textbox(label="Emotion Detected")
-    # The three return values of transcribe_and_analyze map onto the outputs in this order.
-    analyze_btn.click(transcribe_and_analyze, inputs=[audio_input], outputs=[transcript_output, persona_output, emotion_output])
-# Start the Gradio app.
-demo.launch()

 import gradio as gr
 import torch
 import numpy as np
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from transformers import pipeline
+# Load models once at startup (module import time), so every request reuses them
+asr_model_name = "facebook/wav2vec2-base-960h"
+emotion_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+gen_model_name = "google/flan-t5-base"
+# Load ASR: the processor prepares model inputs and decodes predicted ids back to text
+asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
+asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
+# Load emotion detection (an audio-classification pipeline, so it takes audio rather than text)
+emotion_classifier = pipeline("audio-classification", model=emotion_model_name)
+# Load personality generation pipeline (text-to-text generation driven by a prompt)
+gen_pipeline = pipeline("text2text-generation", model=gen_model_name)
+# Audio preprocessing
+# Sample rate (Hz) the wav2vec2 models are fed; input at any other rate is resampled to it.
+TARGET_SAMPLE_RATE = 16000
+def _prepare_audio(audio):
+    """Return `audio` as a mono float32 waveform at TARGET_SAMPLE_RATE, or None if there is none."""
+    if audio is None:
+        return None
+    if isinstance(audio, tuple):
+        # gr.Audio(type="numpy") yields (sample_rate, data) -- the rate comes first.
+        sample_rate, waveform = audio
+    else:
+        # A bare array carries no rate; assume it is already at the target rate.
+        sample_rate, waveform = TARGET_SAMPLE_RATE, audio
+    waveform = np.asarray(waveform)
+    if waveform.size == 0:
+        return None
+    if np.issubdtype(waveform.dtype, np.integer):
+        # Integer PCM (Gradio records int16): scale into roughly [-1, 1].
+        waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
+    else:
+        waveform = waveform.astype(np.float32)
+    if waveform.ndim > 1:
+        # Multi-channel data is laid out (samples, channels); average down to mono.
+        waveform = waveform.mean(axis=1)
+    if sample_rate != TARGET_SAMPLE_RATE:
+        resampled = torchaudio.functional.resample(
+            torch.from_numpy(waveform), sample_rate, TARGET_SAMPLE_RATE
+        )
+        waveform = resampled.numpy()
+    return waveform
+# Transcription Function
+def transcribe(audio):
+    """Return the lowercased transcript of `audio`.
+
+    `audio` is either the (sample_rate, data) tuple from Gradio or a bare
+    waveform array assumed to be at TARGET_SAMPLE_RATE.
+    Raises ValueError when no audio is supplied.
+    """
+    waveform = _prepare_audio(audio)
+    if waveform is None:
+        raise ValueError("No audio provided")
+    input_values = asr_processor(waveform, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt").input_values
     with torch.no_grad():
+        logits = asr_model(input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = asr_processor.decode(predicted_ids[0])
+    return transcription.lower()
+# Personality Generation
+def generate_personality(text):
+    """Ask the text2text model for a short personality description based on `text`."""
+    prompt = f"Describe the speaker's personality based on this sentence: \"{text}\""
+    response = gen_pipeline(prompt, max_new_tokens=50)[0]["generated_text"]
+    return response.strip()
+# Emotion Detection
+def detect_emotion(audio):
+    """Return the top emotion label predicted from the audio signal itself.
+
+    Accepts the same inputs as transcribe(). Raises ValueError when no audio
+    is supplied.
+    """
+    waveform = _prepare_audio(audio)
+    if waveform is None:
+        raise ValueError("No audio provided")
+    # The waveform is already mono float32 at the target rate, so it is passed as a raw array.
+    results = emotion_classifier(waveform, top_k=1)
+    return results[0]["label"]
+# Main Pipeline
+def analyze(audio):
+    """Run the full pipeline and return (transcription, emotion, personality).
+
+    Placeholder strings are returned when no audio was recorded or uploaded,
+    so the UI shows a message instead of an error.
+    """
+    waveform = _prepare_audio(audio)
+    if waveform is None:
+        return "No audio provided", "No emotion detected", "No persona detected"
+    # Preprocess once and hand the prepared waveform to both models.
+    transcription = transcribe(waveform)
+    emotion = detect_emotion(waveform)
+    if not transcription.strip():
+        # Nothing was recognised; do not prompt the generator with an empty sentence.
+        return transcription, emotion, "No persona detected"
+    personality = generate_personality(transcription)
+    return transcription, emotion, personality
 # Gradio UI
+# Build the interface: audio input with the Analyze button beside it, then the three result boxes.
+with gr.Blocks() as app:
+    gr.Markdown("# Voice2Persona AI\nUpload or record your voice to reveal your mood and hidden persona!")
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
+        submit_btn = gr.Button("Analyze")
+    with gr.Column():
         transcript_output = gr.Textbox(label="Transcription")
+        emotion_output = gr.Textbox(label="Detected Emotion")
+        personality_output = gr.Textbox(label="AI-Generated Personality")
+    # analyze() returns (transcription, emotion, personality), matching the outputs order below.
+    submit_btn.click(fn=analyze, inputs=audio_input,
+                     outputs=[transcript_output, emotion_output, personality_output])
+# Start the Gradio app.
+app.launch()