File size: 7,252 Bytes
c2962bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

import gradio as gr
from transformers import pipeline
import whisper
from collections import Counter
import matplotlib.pyplot as plt

# Load models once at import time; both are heavyweight and shared by every request.
# HuBERT fine-tuned for speech emotion recognition — emits short label codes
# (hap/sad/neu/ang/fea/dis/sur, per the maps used below).
emotion_classifier = pipeline("audio-classification", model="superb/hubert-large-superb-er")
# OpenAI Whisper "base" checkpoint for speech-to-text transcription.
whisper_model = whisper.load_model("base")

def create_emotion_chart(labels, scores):
    """Render a horizontal bar chart of emotion confidence scores.

    Args:
        labels: raw classifier label codes (e.g. "hap", "sad"), parallel to scores.
        scores: confidence values, expected in [0, 1].

    Returns:
        A matplotlib Figure styled to match the app's light theme.
    """
    # Display names and bar colors keyed by the classifier's short codes.
    pretty_names = {
        "hap": "๐Ÿ˜Š Happy", "sad": "๐Ÿ˜” Sad", "neu": "๐Ÿ˜ Neutral",
        "ang": "๐Ÿ˜  Angry", "fea": "๐Ÿ˜จ Fear", "dis": "๐Ÿคข Disgust", "sur": "๐Ÿ˜ฎ Surprise"
    }
    palette = {
        "hap": "#facc15", "sad": "#60a5fa", "neu": "#a1a1aa",
        "ang": "#ef4444", "fea": "#818cf8", "dis": "#14b8a6", "sur": "#f472b6"
    }
    names = [pretty_names.get(code, code) for code in labels]
    bar_colors = [palette.get(code, "#60a5fa") for code in labels]

    fig, ax = plt.subplots(figsize=(5, 3.5))
    bars = ax.barh(names, scores, color=bar_colors, edgecolor="black", height=0.5)

    # Annotate each bar with its numeric score just past the bar's end.
    for rect, value in zip(bars, scores):
        y_center = rect.get_y() + rect.get_height() / 2
        ax.text(rect.get_width() + 0.02, y_center, f"{value:.2f}", va='center', fontsize=10)

    ax.set_xlim(0, 1)
    ax.set_title("๐ŸŽญ Emotion Confidence Scores", fontsize=13, pad=10)
    ax.invert_yaxis()  # keep the first (highest-ranked) label on top

    # Match the app's light background and strip the frame for a flat look.
    ax.set_facecolor("#f9fafb")
    fig.patch.set_facecolor("#f9fafb")
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.tick_params(axis='x', colors='gray')
    ax.tick_params(axis='y', colors='gray')
    return fig

def generate_next_moves(dominant_emotion, conf_score, transcript=""):
    suggestions = []
    harsh_words = ["bad", "ugly", "terrible", "hate", "worst"]
    positive_tone_negative_words = any(word in transcript.lower() for word in harsh_words) if "happiness" in dominant_emotion else False
    if 'sadness' in dominant_emotion:
        suggestions.append("Your tone feels low โ€” try lifting the pitch slightly to bring more warmth.")
        suggestions.append("Even if the words are positive, a brighter tone helps convey enthusiasm.")
    elif 'happiness' in dominant_emotion and conf_score >= 80:
        suggestions.append("Nice energy! Try modulating your tone even more for emphasis in key moments.")
        suggestions.append("Experiment with subtle emotional shifts as you speak for more depth.")
    elif 'neutral' in dominant_emotion:
        suggestions.append("Add inflection to break a monotone pattern โ€” especially at the ends of sentences.")
        suggestions.append("Highlight your message by stressing emotionally important words.")
    elif conf_score < 50:
        suggestions.append("Try exaggerating vocal ups and downs when reading to unlock more expression.")
        suggestions.append("Slow down slightly and stretch certain words to vary your delivery.")
    else:
        suggestions.append("Keep practicing tone variation โ€” youโ€™re building a solid base.")
    if positive_tone_negative_words:
        suggestions.append("Your tone was upbeat, but the word choices were harsh โ€” aim to align both for better impact.")
    return "\n- " + "\n- ".join(suggestions)

def generate_personacoach_report(emotions, transcript):
    """Compose the markdown PersonaCoach report from emotion scores and transcript.

    Args:
        emotions: list of ``{'label': str, 'score': float}`` dicts from the
            audio classifier; assumed sorted by descending score (the first
            meaningful entry is treated as the dominant emotion). The input
            list and its dicts are NOT modified.
        transcript: speech-to-text transcript of the analyzed clip.

    Returns:
        A markdown string with transcript, tone-strength, delivery,
        filler-word, praise, and next-moves sections.
    """
    report = "## ๐Ÿ“ **Your PersonaCoach Report**\n---\n\n"
    report += "### ๐Ÿ—’๏ธ **What You Said:**\n"
    report += f"> _{transcript.strip()}_\n\n"
    # Map the classifier's short codes to display names.
    label_map = {
        'hap': '๐Ÿ˜Š happiness', 'sad': '๐Ÿ˜” sadness', 'neu': '๐Ÿ˜ neutral',
        'ang': '๐Ÿ˜  anger', 'fea': '๐Ÿ˜จ fear', 'dis': '๐Ÿคข disgust', 'sur': '๐Ÿ˜ฎ surprise'
    }
    # Fix: work on enriched copies instead of writing an 'emotion' key back
    # into the caller's dicts (the previous version mutated its input).
    enriched = [{**e, 'emotion': label_map.get(e['label'], e['label'])} for e in emotions]
    scores = [e['score'] for e in enriched]
    # Fix: default=0.0 guards against an empty classifier result (max([]) raises).
    top_score = max(scores, default=0.0)
    conf_score = int(top_score * 100)
    # Only surface emotions with at least 20% confidence.
    meaningful_emotions = [(e['emotion'], e['score']) for e in enriched if e['score'] >= 0.2]
    emotion_labels = [name for name, _ in meaningful_emotions]
    dominant_emotion = emotion_labels[0] if emotion_labels else "neutral"

    report += f"### ๐ŸŽฏ **Tone Strength:**\n- Your tone scored **{conf_score}/100** in clarity.\n\n"
    report += "### ๐Ÿ—ฃ๏ธ **Emotion & Delivery:**\n"
    if meaningful_emotions:
        emotions_str = ", ".join([f"**{label}** ({score:.2f})" for label, score in meaningful_emotions])
        report += f"- Emotionally, your voice showed: {emotions_str}\n"
    else:
        report += "- Your tone wasnโ€™t clearly expressive. Try reading with a bit more emphasis or emotion.\n"

    # Hesitation-word analysis on the lowercased transcript.
    filler_words = ["um", "uh", "like", "you know", "so", "actually", "basically", "literally"]
    lowered = transcript.lower()
    words = lowered.split()
    total_words = len(words)
    # Fix: single-word fillers are matched as whole tokens; multi-word fillers
    # ("you know") are counted as substrings — token counting could never
    # match them, so they were silently ignored before.
    filler_count = sum(words.count(fw) for fw in filler_words if " " not in fw)
    filler_count += sum(lowered.count(fw) for fw in filler_words if " " in fw)
    filler_ratio = filler_count / total_words if total_words > 0 else 0

    report += "\n### ๐Ÿ’ฌ **Pausing Style (e.g., 'um', 'like', 'you know'):**\n"
    report += f"- You used **{filler_count}** hesitation phrases out of **{total_words}** words.\n"
    if filler_ratio > 0.06:
        report += "- Try pausing instead of using fillers โ€” it builds stronger presence.\n"
    elif filler_ratio > 0.03:
        report += "- A few slipped in. Practice holding space with silence instead.\n"
    else:
        report += "- Great fluency โ€” you stayed focused and controlled.\n"

    report += "\n### โœ… **What You're Doing Well:**\n"
    if top_score >= 0.75 and filler_ratio < 0.03:
        report += "- Confident tone and smooth delivery โ€” keep it up!\n"
    else:
        report += "- Youโ€™re on track. Keep refining tone and pacing.\n"

    report += "\n### ๐Ÿงญ **Next Moves:**\n"
    report += generate_next_moves(dominant_emotion, conf_score, transcript) + "\n"
    return report

def analyze_audio(audio_path):
    """Run the full analysis pipeline on one uploaded clip.

    Args:
        audio_path: filesystem path to the recording (from the Gradio widget).

    Returns:
        (transcript, figure, report): the Whisper transcription text, the
        emotion bar-chart Figure, and the markdown coaching report.
    """
    transcription = whisper_model.transcribe(audio_path)
    text = transcription['text']
    predictions = emotion_classifier(audio_path)
    chart = create_emotion_chart(
        [p['label'] for p in predictions],
        [p['score'] for p in predictions],
    )
    summary = generate_personacoach_report(predictions, text)
    return text, chart, summary

# --- UI layout: everything below must stay inside the Blocks context ---------
with gr.Blocks(title="SPEAK: PersonaCoach", theme=gr.themes.Soft()) as app:
    # Header banner (raw HTML inside markdown for centered styling).
    gr.Markdown("""
    <div style="text-align:center; margin-bottom: 1rem;">
        <h1 style="font-size: 2.2rem; margin-bottom: 0.2rem;">๐ŸŽค SPEAK: PersonaCoach</h1>
        <p style="color: gray;">Your smart voice reflection tool โ€” assess tone, confidence, and delivery</p>
    </div>
    """, elem_id="header")

    # Input row: wide upload widget + narrow trigger button.
    with gr.Row():
        with gr.Column(scale=4):
            audio_input = gr.Audio(type="filepath", label="๐ŸŽง Upload Your Voice (.wav)", elem_id="upload-audio")
        with gr.Column(scale=1, min_width=120):
            analyze_btn = gr.Button("๐Ÿ” Analyze", size="sm", elem_id="analyze-btn")

    gr.Markdown("## ๐Ÿง  Results", elem_id="results-header")

    # Results row: markdown report (2/3 width) next to the emotion chart (1/3).
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            feedback_output = gr.Markdown(label="๐Ÿ“„ PersonaCoach Feedback", elem_id="report-section")
        with gr.Column(scale=1):
            emotion_plot = gr.Plot(label="๐ŸŽญ Emotion Chart", elem_id="chart")

    # analyze_audio returns (transcript, fig, report); the transcript is
    # routed to a hidden Textbox because it already appears inside the report.
    analyze_btn.click(
        fn=analyze_audio,
        inputs=audio_input,
        outputs=[gr.Textbox(visible=False), emotion_plot, feedback_output]
    )

# Start the Gradio server (blocking call).
app.launch()