Nick021402 committed on
Commit
1449709
·
verified ·
1 Parent(s): 6861cf1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -28
app.py CHANGED
@@ -1,54 +1,87 @@
1
- from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
2
  import gradio as gr
3
  import torch
4
  import numpy as np
 
5
 
6
- # Load models
7
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
8
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cpu")
9
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
10
 
11
- labels = [
12
- "secret agent", "lecturer", "motivational speaker", "comedian",
13
- "philosopher", "calm leader", "emotional artist", "enthusiastic teacher",
14
- "strict officer", "mysterious thinker"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ]
16
 
17
# Transcription + personality
def transcribe_and_analyze(audio):
    """Transcribe a voice clip and guess the speaker's persona.

    Args:
        audio: Value produced by the ``gr.Audio(type="numpy")`` input. Gradio
            delivers it as a ``(sample_rate, samples)`` tuple; a bare sample
            array is also accepted and is assumed to be 16 kHz. ``None``
            means nothing was recorded or uploaded.

    Returns:
        A ``(transcription, persona_message)`` pair of strings.
    """
    if audio is None:
        return "No audio provided", "No analysis"

    # Split the Gradio tuple; tolerate either element order.
    if isinstance(audio, tuple):
        first, second = audio
        if isinstance(first, np.ndarray):
            audio, sr = first, second
        else:
            sr, audio = first, second
    else:
        sr = 16000

    audio = np.asarray(audio)
    if np.issubdtype(audio.dtype, np.integer):
        # Integer PCM -> float samples in roughly [-1, 1].
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        audio = audio.astype(np.float32)

    # Mono audio
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=1)

    if audio.size == 0:
        return "No audio provided", "No analysis"

    # The speech model is fed 16 kHz audio, so bring other rates to 16 kHz
    # first (plain linear interpolation; no extra dependency needed).
    if sr != 16000:
        new_len = max(1, int(round(audio.shape[0] * 16000 / sr)))
        old_t = np.arange(audio.shape[0]) / sr
        new_t = np.arange(new_len) / 16000
        audio = np.interp(new_t, old_t, audio).astype(np.float32)

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()

    # Nothing recognisable was said: skip the classifier instead of
    # handing it an empty sequence.
    if not transcription.strip():
        return transcription, "No analysis"

    result = classifier(transcription, candidate_labels=labels)
    top_label = result["labels"][0]
    score = result["scores"][0]

    return transcription, f"You sound like a **{top_label}** (confidence: {score:.2f})"
 
 
 
 
38
 
39
  # Gradio UI
40
- with gr.Blocks(theme=gr.themes.Soft()) as app:
41
- gr.Markdown("# Voice2PersonaAI (Free AI-Based Personality Engine)")
42
 
43
  with gr.Row():
44
  audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
 
 
45
 
46
- transcribe_button = gr.Button("Transcribe & Analyze")
47
-
48
  with gr.Row():
49
- transcribed_text = gr.Textbox(label="Transcription")
50
- persona_text = gr.Textbox(label="AI-Predicted Persona")
 
 
 
51
 
52
- transcribe_button.click(transcribe_and_analyze, inputs=audio_input, outputs=[transcribed_text, persona_text])
53
 
54
- app.launch()
 
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
4
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
5
 
6
+ # Load Wav2Vec2 model and processor
7
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
8
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 
9
 
10
+ # Load Zero-Shot classifier for persona detection
11
+ persona_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
12
+
13
+ # Load Emotion classifier
14
+ emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
15
+
16
+ # Define possible personality types
17
+ persona_labels = [
18
+ "secret agent",
19
+ "radio host",
20
+ "teacher",
21
+ "comedian",
22
+ "motivational speaker",
23
+ "villain",
24
+ "storyteller",
25
+ "detective",
26
+ "rapper",
27
+ "romantic poet",
28
+ "angry customer"
29
  ]
30
 
31
# Define the analysis function
_TARGET_SAMPLE_RATE = 16000  # rate handed to the Wav2Vec2 processor


def _prepare_waveform(audio, sr, target_sr=_TARGET_SAMPLE_RATE):
    """Return *audio* as a mono float32 waveform sampled at *target_sr*."""
    waveform = np.asarray(audio)
    if np.issubdtype(waveform.dtype, np.integer):
        # Integer PCM -> float samples in roughly [-1, 1].
        scale = np.iinfo(waveform.dtype).max
        waveform = waveform.astype(np.float32) / scale
    else:
        waveform = waveform.astype(np.float32)

    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)  # convert to mono

    if sr != target_sr and waveform.size:
        # Linear-interpolation resample: keeps the block dependency-free.
        new_len = max(1, int(round(waveform.shape[0] * target_sr / sr)))
        old_t = np.arange(waveform.shape[0]) / sr
        new_t = np.arange(new_len) / target_sr
        waveform = np.interp(new_t, old_t, waveform).astype(np.float32)

    return waveform


def transcribe_and_analyze(audio):
    """Transcribe a voice clip, then classify its persona and emotion.

    Args:
        audio: Value produced by the ``gr.Audio(type="numpy")`` input. Gradio
            delivers it as a ``(sample_rate, samples)`` tuple; the reversed
            ``(samples, sample_rate)`` order and a bare sample array (assumed
            to be 16 kHz) are accepted as well. ``None`` means nothing was
            recorded or uploaded.

    Returns:
        A ``(transcription, persona_message, emotion_message)`` triple of
        strings, matching the three output textboxes.
    """
    no_audio = ("No audio provided", "No persona detected", "No emotion detected")
    if audio is None:
        return no_audio

    # Handle uploaded vs mic: split the tuple whichever way round it is.
    if isinstance(audio, tuple):
        first, second = audio
        if isinstance(first, np.ndarray):
            audio, sr = first, second
        else:
            sr, audio = first, second
    else:
        sr = _TARGET_SAMPLE_RATE  # default

    waveform = _prepare_waveform(audio, sr)
    if waveform.size == 0:
        return no_audio

    # Transcribe
    inputs = processor(
        waveform, sampling_rate=_TARGET_SAMPLE_RATE, return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()

    # Nothing recognisable was said: skip both classifiers instead of
    # handing them an empty sequence.
    if not transcription.strip():
        return transcription, "No persona detected", "No emotion detected"

    # Persona classification
    persona_result = persona_classifier(transcription, candidate_labels=persona_labels)
    persona = persona_result["labels"][0]
    confidence = persona_result["scores"][0]

    # Emotion classification. Depending on the pipeline options/version the
    # per-label scores come back either nested ([[{...}, ...]]) or flat.
    emotion_result = emotion_classifier(transcription)
    if isinstance(emotion_result[0], list):
        emotion_scores = emotion_result[0]
    else:
        emotion_scores = emotion_result
    top_emotion = max(emotion_scores, key=lambda item: item["score"])

    # Format results
    persona_output = f"You sound like a **{persona}** (confidence: {confidence:.2f})"
    emotion_output = f"Emotion detected: **{top_emotion['label']}** (score: {top_emotion['score']:.2f})"

    return transcription, persona_output, emotion_output
68
 
69
  # Gradio UI
70
+ with gr.Blocks() as demo:
71
+ gr.Markdown("## Voice2Persona AI (Free & Fully Local with Hugging Face)")
72
 
73
  with gr.Row():
74
  audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
75
+
76
+ analyze_btn = gr.Button("Analyze")
77
 
 
 
78
  with gr.Row():
79
+ transcript_output = gr.Textbox(label="Transcription")
80
+ with gr.Row():
81
+ persona_output = gr.Textbox(label="Persona Detected")
82
+ with gr.Row():
83
+ emotion_output = gr.Textbox(label="Emotion Detected")
84
 
85
+ analyze_btn.click(transcribe_and_analyze, inputs=[audio_input], outputs=[transcript_output, persona_output, emotion_output])
86
 
87
+ demo.launch()