Nick021402 committed
Commit 72324ef · verified · 1 Parent(s): 1449709

Update app.py

Files changed (1)
  1. app.py +53 -70
app.py CHANGED
@@ -1,87 +1,70 @@
  import gradio as gr
  import torch
  import numpy as np
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
-
- # Load Wav2Vec2 model and processor
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-
- # Load Zero-Shot classifier for persona detection
- persona_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
-
- # Load Emotion classifier
- emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
-
- # Define possible personality types
- persona_labels = [
-     "secret agent",
-     "radio host",
-     "teacher",
-     "comedian",
-     "motivational speaker",
-     "villain",
-     "storyteller",
-     "detective",
-     "rapper",
-     "romantic poet",
-     "angry customer"
- ]
-
- # Define the analysis function
- def transcribe_and_analyze(audio):
-     if audio is None:
-         return "No audio provided", "No persona detected", "No emotion detected"
-
-     # Handle uploaded vs mic
-     if isinstance(audio, tuple):
-         audio, sr = audio
-     else:
-         sr = 16000  # default
-
-     if len(audio.shape) > 1:
-         audio = np.mean(audio, axis=1)  # convert to mono
-
-     # Transcribe
-     inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
      with torch.no_grad():
-         logits = model(inputs.input_values).logits
      predicted_ids = torch.argmax(logits, dim=-1)
-     transcription = processor.decode(predicted_ids[0]).lower()
-
-     # Persona classification
-     persona_result = persona_classifier(transcription, candidate_labels=persona_labels)
-     persona = persona_result["labels"][0]
-     confidence = persona_result["scores"][0]
-
-     # Emotion classification
-     emotion_result = emotion_classifier(transcription)
-     emotion_scores = emotion_result[0]
-     sorted_emotions = sorted(emotion_scores, key=lambda x: x['score'], reverse=True)
-     top_emotion = sorted_emotions[0]
-
-     # Format results
-     persona_output = f"You sound like a **{persona}** (confidence: {confidence:.2f})"
-     emotion_output = f"Emotion detected: **{top_emotion['label']}** (score: {top_emotion['score']:.2f})"
-
-     return transcription, persona_output, emotion_output

  # Gradio UI
- with gr.Blocks() as demo:
-     gr.Markdown("## Voice2Persona AI (Free & Fully Local with Hugging Face)")

      with gr.Row():
          audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
-
-     analyze_btn = gr.Button("Analyze")

-     with gr.Row():
          transcript_output = gr.Textbox(label="Transcription")
-     with gr.Row():
-         persona_output = gr.Textbox(label="Persona Detected")
-     with gr.Row():
-         emotion_output = gr.Textbox(label="Emotion Detected")

-     analyze_btn.click(transcribe_and_analyze, inputs=[audio_input], outputs=[transcript_output, persona_output, emotion_output])

- demo.launch()
  import gradio as gr
  import torch
  import numpy as np
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from transformers import pipeline
+
+ # Load models once at startup
+ asr_model_name = "facebook/wav2vec2-base-960h"
+ emotion_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+ gen_model_name = "google/flan-t5-base"
+
+ # Load ASR
+ asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
+ asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
+
+ # Load emotion detection
+ emotion_classifier = pipeline("audio-classification", model=emotion_model_name)
+
+ # Load personality generation pipeline
+ gen_pipeline = pipeline("text2text-generation", model=gen_model_name)
+
+ # Transcription Function
+ def transcribe(audio):
+     if isinstance(audio, tuple):  # When type="numpy"
+         sr, audio = 16000, audio[0]  # Handle stereo or mono
+     input_values = asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_values
      with torch.no_grad():
+         logits = asr_model(input_values).logits
      predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = asr_processor.decode(predicted_ids[0])
+     return transcription.lower()

+ # Personality Generation
+ def generate_personality(text):
+     prompt = f"Describe the speaker's personality based on this sentence: \"{text}\""
+     response = gen_pipeline(prompt, max_new_tokens=50)[0]["generated_text"]
+     return response.strip()

+ # Emotion Detection
+ def detect_emotion(audio):
+     if isinstance(audio, tuple):
+         audio = audio[0]  # Extract numpy array from (array, sample_rate)
+     results = emotion_classifier(audio, top_k=1)
+     return results[0]["label"]

+ # Main Pipeline
+ def analyze(audio):
+     transcription = transcribe(audio)
+     emotion = detect_emotion(audio)
+     personality = generate_personality(transcription)
+     return transcription, emotion, personality

  # Gradio UI
+ with gr.Blocks() as app:
+     gr.Markdown("# Voice2Persona AI\nUpload or record your voice to reveal your mood and hidden persona!")

      with gr.Row():
          audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
+         submit_btn = gr.Button("Analyze")

+     with gr.Column():
          transcript_output = gr.Textbox(label="Transcription")
+         emotion_output = gr.Textbox(label="Detected Emotion")
+         personality_output = gr.Textbox(label="AI-Generated Personality")

+     submit_btn.click(fn=analyze, inputs=audio_input,
+                      outputs=[transcript_output, emotion_output, personality_output])

+ app.launch()
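
A note on the rewritten audio handling: with type="numpy", Gradio's gr.Audio returns a (sample_rate, data) tuple with the sample rate first and the samples as int16, so audio[0] is the sample rate rather than the waveform, and the recording is not guaranteed to be 16 kHz mono float32 as the wav2vec2 ASR and emotion models expect. The sketch below shows one way to normalize the input; it assumes the otherwise unused torchaudio import was intended for resampling, and prepare_audio is a hypothetical helper that is not part of this commit.

import numpy as np
import torch
import torchaudio

# Sketch (not part of the commit): normalize Gradio audio for the ASR and emotion models.
def prepare_audio(audio, target_sr=16000):
    if isinstance(audio, tuple):
        sr, data = audio              # gr.Audio(type="numpy") yields (sample_rate, samples)
    else:
        sr, data = target_sr, audio   # assume already at the target rate
    data = np.asarray(data)
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / 32768.0   # int16 PCM -> [-1.0, 1.0]
    else:
        data = data.astype(np.float32)
    if data.ndim > 1:                 # stereo (samples, channels) -> mono
        data = data.mean(axis=1)
    if sr != target_sr:               # resample so wav2vec2 sees 16 kHz
        data = torchaudio.functional.resample(
            torch.from_numpy(data), orig_freq=sr, new_freq=target_sr
        ).numpy()
    return data

With such a helper, transcribe could call asr_processor(prepare_audio(audio), sampling_rate=16000, return_tensors="pt"), and detect_emotion could pass {"raw": prepare_audio(audio), "sampling_rate": 16000} to the audio-classification pipeline instead of the raw tuple element.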