
import gradio as gr
from transformers import pipeline
import whisper
from collections import Counter
import matplotlib.pyplot as plt

# Load models
# Emotion recognition: HuBERT fine-tuned on the SUPERB emotion-recognition task;
# it returns abbreviated labels ("hap", "sad", "neu", "ang", ...) that the
# label/emoji maps below translate for display. Downloads weights on first run.
emotion_classifier = pipeline("audio-classification", model="superb/hubert-large-superb-er")
# Speech-to-text: OpenAI Whisper "base" checkpoint (also downloaded on first run).
whisper_model = whisper.load_model("base")

# ==== Chart Logic ====
def create_emotion_chart(labels, scores):
    """Render a horizontal bar chart of per-emotion confidence scores.

    Args:
        labels: raw classifier labels (e.g. "hap", "sad", "neu").
        scores: matching confidence values in [0, 1].

    Returns:
        A matplotlib Figure styled for the Gradio Plot output.
    """
    # Display names and bar colors keyed by the classifier's raw labels.
    pretty_names = {
        "hap": "๐Ÿ˜Š Happy", "sad": "๐Ÿ˜” Sad", "neu": "๐Ÿ˜ Neutral",
        "ang": "๐Ÿ˜  Angry", "fea": "๐Ÿ˜จ Fear", "dis": "๐Ÿคข Disgust", "sur": "๐Ÿ˜ฎ Surprise"
    }
    palette = {
        "hap": "#facc15", "sad": "#60a5fa", "neu": "#a1a1aa",
        "ang": "#ef4444", "fea": "#818cf8", "dis": "#14b8a6", "sur": "#f472b6"
    }
    y_labels = [pretty_names.get(lbl, lbl) for lbl in labels]
    bar_colors = [palette.get(lbl, "#60a5fa") for lbl in labels]

    fig, ax = plt.subplots(figsize=(5, 3.5))
    bars = ax.barh(y_labels, scores, color=bar_colors, edgecolor="black", height=0.5)
    # Annotate each bar with its numeric score just past the bar's end.
    for rect, value in zip(bars, scores):
        ax.text(rect.get_width() + 0.02, rect.get_y() + rect.get_height() / 2,
                f"{value:.2f}", va='center', fontsize=10)
    ax.set_xlim(0, 1)
    ax.set_title("๐ŸŽญ Emotion Confidence Scores", fontsize=13, pad=10)
    ax.invert_yaxis()  # top-ranked emotion appears at the top
    # Soft light background and frameless axes for a card-like look.
    ax.set_facecolor("#f9fafb")
    fig.patch.set_facecolor("#f9fafb")
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.tick_params(axis='x', colors='gray')
    ax.tick_params(axis='y', colors='gray')
    return fig

# ==== Feedback Generation ====
def generate_next_moves(dominant_emotion, conf_score, transcript=""):
    """Compose coaching suggestions based on tone and word choice.

    Args:
        dominant_emotion: display label of the top emotion (e.g. "happiness").
        conf_score: top-emotion confidence as an integer percentage (0-100).
        transcript: optional transcript text, scanned for harsh vocabulary.

    Returns:
        A markdown bullet list, each suggestion prefixed with a newline dash.
    """
    tips = []
    harsh_words = ["bad", "ugly", "terrible", "hate", "worst"]
    # Detect a tone/word mismatch: an upbeat delivery paired with harsh words.
    tone_word_mismatch = False
    if "happiness" in dominant_emotion:
        lowered = transcript.lower()
        tone_word_mismatch = any(word in lowered for word in harsh_words)

    if 'sadness' in dominant_emotion:
        tips.append("Your tone feels low โ€” try lifting the pitch slightly to bring more warmth.")
        tips.append("Even if the words are positive, a brighter tone helps convey enthusiasm.")
    elif 'happiness' in dominant_emotion and conf_score >= 80:
        tips.append("Nice energy! Try modulating your tone even more for emphasis in key moments.")
        tips.append("Experiment with subtle emotional shifts as you speak for more depth.")
    elif 'neutral' in dominant_emotion:
        tips.append("Add inflection to break a monotone pattern โ€” especially at the ends of sentences.")
        tips.append("Highlight your message by stressing emotionally important words.")
    elif conf_score < 50:
        tips.append("Try exaggerating vocal ups and downs when reading to unlock more expression.")
        tips.append("Slow down slightly and stretch certain words to vary your delivery.")
    else:
        tips.append("Keep practicing tone variation โ€” youโ€™re building a solid base.")

    if tone_word_mismatch:
        tips.append("Your tone was upbeat, but the word choices were harsh โ€” aim to align both for better impact.")
    return "\n- " + "\n- ".join(tips)

def generate_personacoach_report(emotions, transcript):
    """Build the full markdown coaching report from classifier output.

    Args:
        emotions: list of dicts from the audio classifier, each with a
            'label' key (abbreviated, e.g. 'hap') and a 'score' float in
            [0, 1]. NOTE(review): each dict is mutated in place (an
            'emotion' display-name key is added); raises ValueError via
            max() if the list is empty.
        transcript: Whisper transcription of the audio.

    Returns:
        A markdown string with tone-strength, emotion, filler-word, praise
        and next-step sections.
    """
    report = "## ๐Ÿ“ **Your PersonaCoach Report**\n---\n\n"
    report += "### ๐Ÿ—’๏ธ **What You Said:**\n"
    report += f"> _{transcript.strip()}_\n\n"
    # Translate abbreviated classifier labels into emoji display names.
    label_map = {
        'hap': '๐Ÿ˜Š happiness', 'sad': '๐Ÿ˜” sadness', 'neu': '๐Ÿ˜ neutral',
        'ang': '๐Ÿ˜  anger', 'fea': '๐Ÿ˜จ fear', 'dis': '๐Ÿคข disgust', 'sur': '๐Ÿ˜ฎ surprise'
    }
    for e in emotions:
        # In-place mutation: adds the display name to the caller's dicts.
        e['emotion'] = label_map.get(e['label'], e['label'])
    scores = [s['score'] for s in emotions]
    top_score = max(scores)
    conf_score = int(top_score * 100)  # top confidence as a 0-100 integer
    # Keep only emotions the model is reasonably confident about (>= 0.2).
    meaningful_emotions = [(e['emotion'], e['score']) for e in emotions if e['score'] >= 0.2]
    emotion_labels = [e[0] for e in meaningful_emotions]
    # Picks the first meaningful emotion as dominant; assumes the classifier
    # returns results sorted by score, highest first -- TODO confirm.
    dominant_emotion = emotion_labels[0] if emotion_labels else "neutral"

    report += "### ๐ŸŽฏ **Tone Strength:**\n"
    report += f"- Your tone scored **{conf_score}/100** in clarity.\n\n"
    report += "### ๐Ÿ—ฃ๏ธ **Emotion & Delivery:**\n"
    if meaningful_emotions:
        emotions_str = ", ".join([f"**{label}** ({score:.2f})" for label, score in meaningful_emotions])
        report += f"- Emotionally, your voice showed: {emotions_str}\n"
    else:
        report += "- Your tone wasnโ€™t clearly expressive. Try reading with a bit more emphasis or emotion.\n"
    # Count hesitation/filler words via naive whitespace tokenization; note
    # that multi-word fillers like "you know" can never match a single token.
    filler_words = ["um", "uh", "like", "you know", "so", "actually", "basically", "literally"]
    words = transcript.lower().split()
    total_words = len(words)
    filler_count = sum(words.count(fw) for fw in filler_words)
    # Guard against division by zero on an empty transcript.
    filler_ratio = filler_count / total_words if total_words > 0 else 0

    report += "\n### ๐Ÿ’ฌ **Pausing Style (e.g., 'um', 'like', 'you know'):**\n"
    report += f"- You used **{filler_count}** hesitation phrases out of **{total_words}** words.\n"
    # Tiered feedback: >6% fillers, 3-6%, or below 3% of all words.
    if filler_ratio > 0.06:
        report += "- Try pausing instead of using fillers โ€” it builds stronger presence.\n"
    elif filler_ratio > 0.03:
        report += "- A few slipped in. Practice holding space with silence instead.\n"
    else:
        report += "- Great fluency โ€” you stayed focused and controlled.\n"

    report += "\n### โœ… **What You're Doing Well:**\n"
    # Praise requires both a confident tone and very few fillers.
    if top_score >= 0.75 and filler_ratio < 0.03:
        report += "- Confident tone and smooth delivery โ€” keep it up!\n"
    else:
        report += "- Youโ€™re on track. Keep refining tone and pacing.\n"

    report += "\n### ๐Ÿงญ **Next Moves:**\n"
    report += generate_next_moves(dominant_emotion, conf_score, transcript) + "\n"
    return report

# ==== Main Interface Logic ====
def analyze_audio(audio_path):
    """Run the full coaching pipeline on one uploaded audio file.

    Transcribes the audio with Whisper, classifies its emotional tone with
    the HuBERT pipeline, and returns the trio the Gradio UI expects:
    (transcript text, emotion chart Figure, markdown coaching report).
    """
    transcript = whisper_model.transcribe(audio_path)['text']
    predictions = emotion_classifier(audio_path)
    chart = create_emotion_chart(
        [p['label'] for p in predictions],
        [p['score'] for p in predictions],
    )
    report = generate_personacoach_report(predictions, transcript)
    return transcript, chart, report

# ==== Gradio UI ====
# Wire the pipeline into a single-input, three-output upload-and-analyze app.
output_components = [
    gr.Textbox(label="๐Ÿ“ Transcription"),
    gr.Plot(label="๐ŸŽญ Emotion Chart"),
    gr.Markdown(label="๐Ÿ“„ PersonaCoach Feedback"),
]
interface = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath", label="๐ŸŽง Upload Voice"),
    outputs=output_components,
    title="SPEAK โ€“ Speech Performance Evaluation and Affective Knowledge",
    description="Upload your voice and receive a tone-based coaching report powered by HuBERT + Whisper.",
)

interface.launch()