Nick021402 committed on
Commit
6861cf1
·
verified ·
1 Parent(s): 6566df7

Update app.py

Files changed (1)
  1. app.py +33 -45
app.py CHANGED
@@ -1,66 +1,54 @@
 
  import gradio as gr
  import torch
  import numpy as np
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

- # Set device
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load Wav2Vec2 model and processor for speech recognition
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)

- # Load FLAN-T5 model for personality generation
- gen_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
- gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)

- # Function to transcribe audio to text
- def transcribe(audio):
      if audio is None:
-         return "Please upload or record an audio file."
-
-     if isinstance(audio, tuple):
-         audio_np = audio[1]
-     else:
-         audio_np = audio

-     if isinstance(audio_np, np.ndarray) and audio_np.ndim > 1:
-         audio_np = np.mean(audio_np, axis=1)
-
-     input_values = processor(audio_np, sampling_rate=16000, return_tensors="pt").input_values.to(device)
      with torch.no_grad():
          logits = model(input_values).logits
      predicted_ids = torch.argmax(logits, dim=-1)
-     transcription = processor.decode(predicted_ids[0])
-     return transcription.lower()

- # Function to generate personality from transcription
- def generate_persona_from_text(transcription):
-     prompt = f"Describe the speaker's personality and role as if they are a fictional character, based on this message:\n\"{transcription}\""
-     inputs = gen_tokenizer(prompt, return_tensors="pt").to(device)
-     output_ids = gen_model.generate(**inputs, max_length=100)
-     return gen_tokenizer.decode(output_ids[0], skip_special_tokens=True)

- # Complete function for Gradio
- def analyze_speaker(audio):
-     transcription = transcribe(audio)
-     if "please upload" in transcription:
-         return transcription, ""
-     persona = generate_persona_from_text(transcription)
-     return transcription, persona

- # Gradio Interface
  with gr.Blocks(theme=gr.themes.Soft()) as app:
-     gr.Markdown("# Voice2Persona AI")
-     gr.Markdown("Upload or record your voice. We'll transcribe it and guess your fictional personality.")

      with gr.Row():
-         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
-         transcribed_text = gr.Textbox(label="📝 Transcription")
-         persona_output = gr.Textbox(label="🧠 Persona Analysis")
-     analyze_button = gr.Button("Analyze")

-     analyze_button.click(fn=analyze_speaker, inputs=audio_input, outputs=[transcribed_text, persona_output])

  app.launch()
 
+ from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
  import gradio as gr
  import torch
  import numpy as np

+ # Load models
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cpu")
+ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

+ labels = [
+     "secret agent", "lecturer", "motivational speaker", "comedian",
+     "philosopher", "calm leader", "emotional artist", "enthusiastic teacher",
+     "strict officer", "mysterious thinker"
+ ]

+ # Transcription + personality
+ def transcribe_and_analyze(audio):
      if audio is None:
+         return "No audio provided", "No analysis"

+     # Mono audio
+     if len(audio.shape) > 1:
+         audio = np.mean(audio, axis=1)
+
+     inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+     input_values = inputs.input_values
      with torch.no_grad():
          logits = model(input_values).logits
      predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.decode(predicted_ids[0]).lower()

+     result = classifier(transcription, candidate_labels=labels)
+     top_label = result["labels"][0]
+     score = result["scores"][0]

+     return transcription, f"You sound like a **{top_label}** (confidence: {score:.2f})"

+ # Gradio UI
  with gr.Blocks(theme=gr.themes.Soft()) as app:
+     gr.Markdown("# Voice2PersonaAI (Free AI-Based Personality Engine)")
+
+     with gr.Row():
+         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Your Voice")
+
+     transcribe_button = gr.Button("Transcribe & Analyze")

      with gr.Row():
+         transcribed_text = gr.Textbox(label="Transcription")
+         persona_text = gr.Textbox(label="AI-Predicted Persona")

+     transcribe_button.click(transcribe_and_analyze, inputs=audio_input, outputs=[transcribed_text, persona_text])

  app.launch()
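
A note on the audio input: with gr.Audio(type="numpy"), Gradio normally passes the callback a (sample_rate, samples) tuple of integer PCM at the recording's native rate. The removed transcribe unpacked audio[1], while the new transcribe_and_analyze indexes audio.shape directly, and both versions tell the processor sampling_rate=16000, so a small preprocessing helper may be worth keeping in mind. A minimal sketch, assuming scipy is available in the Space; prepare_audio and TARGET_SR are illustrative names, not part of this commit:

# Sketch only: normalize whatever gr.Audio(type="numpy") hands over before it
# reaches the Wav2Vec2 processor. prepare_audio/TARGET_SR are illustrative names.
import numpy as np
from scipy.signal import resample_poly  # assumption: scipy is installed in the Space

TARGET_SR = 16000  # sampling rate facebook/wav2vec2-base-960h expects

def prepare_audio(audio):
    """Return 16 kHz mono float32 samples from Gradio's audio payload."""
    if isinstance(audio, tuple):       # usual case: (sample_rate, np.ndarray)
        sr, samples = audio
    else:                              # bare array; assume it is already 16 kHz
        sr, samples = TARGET_SR, audio

    samples = np.asarray(samples, dtype=np.float32)
    if samples.ndim > 1:               # stereo -> mono
        samples = samples.mean(axis=1)
    if np.abs(samples).max() > 1.0:    # int16-range PCM -> [-1.0, 1.0]
        samples = samples / 32768.0
    if sr != TARGET_SR:                # resample to the rate the processor is told
        samples = resample_poly(samples, TARGET_SR, sr).astype(np.float32)
    return samples

Calling audio = prepare_audio(audio) at the top of transcribe_and_analyze would leave the rest of the function unchanged.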
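
For reference on the new persona step: the zero-shot-classification pipeline returns a dict whose "labels" and "scores" lists are sorted by descending confidence, which is why taking index 0 yields the top persona. A standalone sketch of that call path, with an invented transcription used only to show the output shape:

from transformers import pipeline

# Same checkpoint the commit switches to for persona prediction.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

candidate_labels = ["secret agent", "lecturer", "motivational speaker", "comedian"]

# Invented example text; any transcription string works here.
result = classifier("today we will cover the basics of thermodynamics",
                    candidate_labels=candidate_labels)

top_label, score = result["labels"][0], result["scores"][0]
print(f"You sound like a **{top_label}** (confidence: {score:.2f})")
# result also contains "sequence" (the input text) plus the full sorted label/score lists.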