Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,45 +1,35 @@
|
|
1 |
import gradio as gr
|
2 |
import random
|
3 |
import difflib
|
|
|
|
|
|
|
|
|
4 |
from faster_whisper import WhisperModel
|
5 |
from indic_transliteration import sanscript
|
6 |
from indic_transliteration.sanscript import transliterate
|
7 |
-
import
|
8 |
-
import jiwer
|
9 |
|
10 |
# ---------------- CONFIG ---------------- #
|
11 |
MODEL_NAME = "large-v2"
|
12 |
DEVICE = "cpu"
|
13 |
|
14 |
LANG_CODES = {
|
15 |
-
"English": "en",
|
16 |
-
"
|
17 |
-
"Malayalam": "ml",
|
18 |
-
"Hindi": "hi",
|
19 |
-
"Sanskrit": "sa"
|
20 |
}
|
21 |
|
22 |
LANG_PRIMERS = {
|
23 |
-
"English": (
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
"
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
"
|
32 |
-
|
33 |
-
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."
|
34 |
-
),
|
35 |
-
"Hindi": (
|
36 |
-
"प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
37 |
-
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"
|
38 |
-
),
|
39 |
-
"Sanskrit": (
|
40 |
-
"प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
41 |
-
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।"
|
42 |
-
)
|
43 |
}
|
44 |
|
45 |
SCRIPT_PATTERNS = {
|
@@ -51,72 +41,69 @@ SCRIPT_PATTERNS = {
|
|
51 |
}
|
52 |
|
53 |
SENTENCE_BANK = {
|
54 |
-
"English": [
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
"Hindi": [
|
70 |
-
"आज मौसम अच्छा है।",
|
71 |
-
"मुझे हिंदी बोलना पसंद है।",
|
72 |
-
"मैं किताब पढ़ रहा हूँ।"
|
73 |
-
],
|
74 |
-
"Sanskrit": [
|
75 |
-
"अहं ग्रन्थं पठामि।",
|
76 |
-
"अद्य सूर्यः तेजस्वी अस्ति।",
|
77 |
-
"मम नाम रामः।"
|
78 |
-
]
|
79 |
}
|
80 |
|
81 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
print("Loading Whisper model...")
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
# ---------------- HELPERS ---------------- #
|
|
|
|
|
|
|
86 |
def is_script(text, lang_name):
|
87 |
-
|
88 |
-
return bool(
|
89 |
|
90 |
def transliterate_to_hk(text, lang_choice):
|
91 |
mapping = {
|
92 |
-
"Tamil": sanscript.TAMIL,
|
93 |
-
"
|
94 |
-
"Hindi": sanscript.DEVANAGARI,
|
95 |
-
"Sanskrit": sanscript.DEVANAGARI,
|
96 |
"English": None
|
97 |
}
|
98 |
return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
|
99 |
|
100 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
|
101 |
-
segments, _ =
|
102 |
-
audio_path,
|
103 |
-
|
104 |
-
|
105 |
-
initial_prompt=initial_prompt,
|
106 |
-
beam_size=beam_size,
|
107 |
-
temperature=temperature,
|
108 |
-
condition_on_previous_text=condition_on_previous_text,
|
109 |
word_timestamps=False
|
110 |
)
|
111 |
return "".join(s.text for s in segments).strip()
|
112 |
|
113 |
-
def get_random_sentence(language_choice):
|
114 |
-
return random.choice(SENTENCE_BANK[language_choice])
|
115 |
-
|
116 |
def highlight_differences(ref, hyp):
|
117 |
-
|
118 |
-
ref_words = ref.strip().split()
|
119 |
-
hyp_words = hyp.strip().split()
|
120 |
sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
|
121 |
out_html = []
|
122 |
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
@@ -131,57 +118,73 @@ def highlight_differences(ref, hyp):
|
|
131 |
out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
|
132 |
return " ".join(out_html)
|
133 |
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
if audio is None or not intended_sentence.strip():
|
137 |
-
return "No audio or intended sentence
|
138 |
|
139 |
lang_code = LANG_CODES[language_choice]
|
140 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
141 |
|
142 |
-
# Pass 1
|
143 |
-
actual_text = transcribe_once(
|
144 |
-
|
145 |
-
lang_code=lang_code,
|
146 |
-
initial_prompt=primer_weak,
|
147 |
-
beam_size=pass1_beam,
|
148 |
-
temperature=pass1_temp,
|
149 |
-
condition_on_previous_text=pass1_condition
|
150 |
-
)
|
151 |
|
152 |
-
# Pass 2 (fixed
|
153 |
strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
|
154 |
-
corrected_text = transcribe_once(
|
155 |
-
|
156 |
-
lang_code=lang_code,
|
157 |
-
initial_prompt=strict_prompt,
|
158 |
-
beam_size=5,
|
159 |
-
temperature=0.0,
|
160 |
-
condition_on_previous_text=False
|
161 |
-
)
|
162 |
|
163 |
# Scores
|
164 |
wer_val = jiwer.wer(intended_sentence, actual_text)
|
165 |
cer_val = jiwer.cer(intended_sentence, actual_text)
|
166 |
|
167 |
-
#
|
168 |
-
hk_translit = transliterate_to_hk(actual_text, language_choice)
|
|
|
|
|
169 |
|
170 |
-
# Highlighted diff
|
171 |
diff_html = highlight_differences(intended_sentence, actual_text)
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
-
return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html
|
174 |
|
175 |
# ---------------- UI ---------------- #
|
176 |
with gr.Blocks() as demo:
|
177 |
-
gr.Markdown("
|
178 |
-
"Generate
|
179 |
|
180 |
with gr.Row():
|
181 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
182 |
gen_btn = gr.Button("🎲 Generate Sentence")
|
183 |
|
184 |
-
intended_display = gr.Textbox(label="Generated Sentence (Read
|
185 |
|
186 |
with gr.Row():
|
187 |
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
@@ -195,10 +198,15 @@ with gr.Blocks() as demo:
|
|
195 |
hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
|
196 |
|
197 |
with gr.Row():
|
198 |
-
wer_out = gr.Textbox(label="Word Error Rate
|
199 |
-
cer_out = gr.Textbox(label="Character Error Rate
|
200 |
|
201 |
-
diff_html_box = gr.HTML(label="Differences Highlighted")
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
204 |
|
@@ -206,7 +214,11 @@ with gr.Blocks() as demo:
|
|
206 |
submit_btn.click(
|
207 |
fn=compare_pronunciation,
|
208 |
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|
209 |
-
outputs=[
|
|
|
|
|
|
|
|
|
210 |
)
|
211 |
|
212 |
if __name__ == "__main__":
|
|
|
1 |
import gradio as gr
|
2 |
import random
|
3 |
import difflib
|
4 |
+
import re
|
5 |
+
import jiwer
|
6 |
+
import torch
|
7 |
+
import soundfile as sf
|
8 |
from faster_whisper import WhisperModel
|
9 |
from indic_transliteration import sanscript
|
10 |
from indic_transliteration.sanscript import transliterate
|
11 |
+
from transformers import AutoModelForTextToSpeech, AutoTokenizer, pipeline
|
|
|
12 |
|
13 |
# ---------------- CONFIG ---------------- #
|
14 |
MODEL_NAME = "large-v2"
|
15 |
DEVICE = "cpu"
|
16 |
|
17 |
LANG_CODES = {
|
18 |
+
"English": "en", "Tamil": "ta", "Malayalam": "ml",
|
19 |
+
"Hindi": "hi", "Sanskrit": "sa"
|
|
|
|
|
|
|
20 |
}
|
21 |
|
22 |
LANG_PRIMERS = {
|
23 |
+
"English": ("The transcript should be in English only.",
|
24 |
+
"Write only in English without translation. Example: This is an English sentence."),
|
25 |
+
"Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
|
26 |
+
"தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
|
27 |
+
"Malayalam": ("ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
|
28 |
+
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
|
29 |
+
"Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
30 |
+
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
|
31 |
+
"Sanskrit": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
32 |
+
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
}
|
34 |
|
35 |
SCRIPT_PATTERNS = {
|
|
|
41 |
}
|
42 |
|
43 |
SENTENCE_BANK = {
|
44 |
+
"English": ["The sun sets over the horizon.",
|
45 |
+
"Learning languages is fun.",
|
46 |
+
"I like to drink coffee in the morning."],
|
47 |
+
"Tamil": ["இன்று நல்ல வானிலை உள்ளது.",
|
48 |
+
"நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
|
49 |
+
"எனக்கு புத்தகம் படிக்க விருப்பம்."],
|
50 |
+
"Malayalam": ["എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
|
51 |
+
"ഇന്ന് മഴപെയ്യുന്നു.",
|
52 |
+
"ഞാൻ പുസ്തകം വായിക്കുന്നു."],
|
53 |
+
"Hindi": ["आज मौसम अच्छा है।",
|
54 |
+
"मुझे हिंदी बोलना पसंद है।",
|
55 |
+
"मैं किताब पढ़ रहा हूँ।"],
|
56 |
+
"Sanskrit": ["अहं ग्रन्थं पठामि।",
|
57 |
+
"अद्य सूर्यः तेजस्वी अस्ति।",
|
58 |
+
"मम नाम रामः।"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
}
|
60 |
|
61 |
+
# Voice/style mapping for IndicParler-TTS
|
62 |
+
VOICE_STYLE = {
|
63 |
+
"English": "An English female voice with a neutral Indian accent.",
|
64 |
+
"Tamil": "A female speaker with a clear Tamil accent.",
|
65 |
+
"Malayalam": "A female speaker with a clear Malayali accent.",
|
66 |
+
"Hindi": "A female speaker with a neutral Hindi accent.",
|
67 |
+
"Sanskrit": "A female speaker reading in classical Sanskrit style."
|
68 |
+
}
|
69 |
+
|
70 |
+
# ---------------- LOAD MODELS ---------------- #
|
71 |
print("Loading Whisper model...")
|
72 |
+
whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
|
73 |
+
|
74 |
+
print("Loading IndicParler-TTS...")
|
75 |
+
TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
|
76 |
+
tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_ID)
|
77 |
+
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
|
78 |
+
tts_pipe = pipeline("text-to-speech", model=tts_model, tokenizer=tts_tokenizer)
|
79 |
|
80 |
# ---------------- HELPERS ---------------- #
|
81 |
+
def get_random_sentence(language_choice):
    """Pick a random practice sentence for the chosen language.

    Parameters
    ----------
    language_choice : str
        A display-name key of SENTENCE_BANK (e.g. "Hindi", "Tamil").

    Returns
    -------
    str
        One sentence drawn uniformly at random from that language's bank.
    """
    sentences = SENTENCE_BANK[language_choice]
    return random.choice(sentences)
|
83 |
+
|
84 |
def is_script(text, lang_name):
    """Check whether *text* contains the script expected for *lang_name*.

    Looks up the compiled regex for the language in SCRIPT_PATTERNS and
    reports whether it matches anywhere in the text. Languages without a
    registered pattern (e.g. English) are accepted unconditionally.

    Parameters
    ----------
    text : str
        Candidate transcription text.
    lang_name : str
        Display-name key of SCRIPT_PATTERNS.

    Returns
    -------
    bool
        True when the script matches (or no pattern exists), else False.
    """
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if not pattern:
        return True
    return pattern.search(text) is not None
|
87 |
|
88 |
def transliterate_to_hk(text, lang_choice):
    """Transliterate *text* from its native script to Harvard-Kyoto.

    Parameters
    ----------
    text : str
        Text written in the native script of *lang_choice*.
    lang_choice : str
        Display name of the language ("English", "Tamil", "Malayalam",
        "Hindi", "Sanskrit").

    Returns
    -------
    str
        The Harvard-Kyoto transliteration, or *text* unchanged for
        English and for any language with no registered source script.
    """
    mapping = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None,
    }
    # Use .get() so an unrecognized language falls back to returning the
    # text untouched instead of raising KeyError (same path as English).
    src_script = mapping.get(lang_choice)
    return transliterate(text, src_script, sanscript.HK) if src_script else text
|
95 |
|
96 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run one faster-whisper transcription pass and return the joined text.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to transcribe.
    lang_code : str
        Whisper language code (e.g. "ml", "hi").
    initial_prompt : str
        Prompt used to prime the decoder.
    beam_size : int
        Beam width for decoding.
    temperature : float
        Sampling temperature.
    condition_on_previous_text : bool
        Whether the decoder conditions on previously generated text.

    Returns
    -------
    str
        Concatenated segment texts, stripped of surrounding whitespace.
    """
    segments, _info = whisper_model.transcribe(
        audio_path,
        language=lang_code,
        task="transcribe",
        initial_prompt=initial_prompt,
        beam_size=beam_size,
        temperature=temperature,
        condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False,
    )
    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip()
|
104 |
|
|
|
|
|
|
|
105 |
def highlight_differences(ref, hyp):
|
106 |
+
ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
|
|
|
|
|
107 |
sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
|
108 |
out_html = []
|
109 |
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
|
|
118 |
out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
|
119 |
return " ".join(out_html)
|
120 |
|
121 |
+
def char_level_highlight(ref, hyp):
    """Build an HTML character diff of *hyp* against the reference *ref*.

    Matching reference characters are rendered green; reference characters
    that were replaced or missing are red with an underline; characters
    present only in the hypothesis are orange.

    Parameters
    ----------
    ref : str
        The intended (reference) sentence.
    hyp : str
        The transcribed (hypothesis) sentence.

    Returns
    -------
    str
        Concatenated ``<span>`` elements, one per highlighted character.
    """
    matcher = difflib.SequenceMatcher(None, list(ref), list(hyp))
    pieces = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            for ch in ref[i1:i2]:
                pieces.append(f"<span style='color:green'>{ch}</span>")
        elif tag == 'insert':
            # Characters only in hyp - show orange
            for ch in hyp[j1:j2]:
                pieces.append(f"<span style='color:orange'>{ch}</span>")
        else:  # 'replace' or 'delete': reference characters not spoken correctly
            for ch in ref[i1:i2]:
                pieces.append(f"<span style='color:red;text-decoration:underline'>{ch}</span>")
    return "".join(pieces)
|
134 |
+
|
135 |
+
def synthesize_tts(text, lang_choice):
    """Generate speech for *text* with the IndicParler-TTS pipeline.

    Parameters
    ----------
    text : str
        Text to speak; blank/whitespace-only input produces no audio.
    lang_choice : str
        Display name used to pick a voice description from VOICE_STYLE.

    Returns
    -------
    tuple[int, object] | None
        ``(sampling_rate, audio)`` as returned by the pipeline, suitable
        for a Gradio numpy Audio component, or None for empty input.
    """
    if not text.strip():
        return None
    description = VOICE_STYLE.get(lang_choice, "")
    result = tts_pipe(text, forward_params={"description": description})
    return (result["sampling_rate"], result["audio"])
|
141 |
+
|
142 |
+
# ---------------- MAIN ---------------- #
|
143 |
+
def compare_pronunciation(audio, language_choice, intended_sentence,
                          pass1_beam, pass1_temp, pass1_condition):
    """Score a learner's recording of *intended_sentence* and build feedback.

    Runs two Whisper passes (a lightly-primed "what was said" pass and a
    strongly target-biased pass), computes WER/CER, produces word- and
    character-level diff HTML, a Harvard-Kyoto transliteration of pass 1,
    and TTS renderings of both the target and the pass-1 output.

    Parameters
    ----------
    audio : str | None
        Filepath of the recorded audio, or None when nothing was recorded.
    language_choice : str
        Display-name key into LANG_CODES / LANG_PRIMERS.
    intended_sentence : str
        The sentence the learner was asked to read.
    pass1_beam : int
        Beam size for the first transcription pass.
    pass1_temp : float
        Temperature for the first transcription pass.
    pass1_condition : bool
        Whether pass 1 conditions on previous text.

    Returns
    -------
    tuple
        (actual_text, corrected_text, hk_translit, wer_str, cer_str,
         diff_html, tts_intended, tts_pass1, char_html, intended_sentence)
        — one value per UI output component.
    """
    # Guard: nothing to score without audio and a target sentence.
    if audio is None or not intended_sentence.strip():
        return "No audio or intended sentence.", "", "", "", "", "", None, None, "", ""

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1 - actual speech, decoded with only a weak language primer.
    actual_text = transcribe_once(
        audio, lang_code, primer_weak, pass1_beam, pass1_temp, pass1_condition
    )

    # Pass 2 - decoding biased toward the target sentence (fixed settings).
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
    corrected_text = transcribe_once(
        audio, lang_code, strict_prompt,
        beam_size=5, temperature=0.0, condition_on_previous_text=False
    )

    # Error-rate scores of pass 1 against the target.
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

    # Harvard-Kyoto transliteration of pass 1, only if the script matches.
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"

    # Word- and character-level highlighted diffs.
    diff_html = highlight_differences(intended_sentence, actual_text)
    char_html = char_level_highlight(intended_sentence, actual_text)

    # TTS for the intended sentence and for what was actually heard.
    tts_intended = synthesize_tts(intended_sentence, language_choice)
    tts_pass1 = synthesize_tts(actual_text, language_choice)

    return (actual_text, corrected_text, hk_translit,
            f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html,
            tts_intended, tts_pass1, char_html, intended_sentence)
|
177 |
|
178 |
# ---------------- UI ---------------- #
|
179 |
with gr.Blocks() as demo:
|
180 |
+
gr.Markdown("## 🎙 Pronunciation Comparator + IndicParler‑TTS + Error Highlighting\n"
|
181 |
+
"Generate sentence → Listen to TTS → Read aloud → See errors → Listen to your transcription")
|
182 |
|
183 |
with gr.Row():
|
184 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
185 |
gen_btn = gr.Button("🎲 Generate Sentence")
|
186 |
|
187 |
+
intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)
|
188 |
|
189 |
with gr.Row():
|
190 |
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
|
|
|
198 |
hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")
|
199 |
|
200 |
with gr.Row():
|
201 |
+
wer_out = gr.Textbox(label="Word Error Rate")
|
202 |
+
cer_out = gr.Textbox(label="Character Error Rate")
|
203 |
|
204 |
+
diff_html_box = gr.HTML(label="Word Differences Highlighted")
|
205 |
+
char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
|
206 |
+
|
207 |
+
with gr.Row():
|
208 |
+
intended_tts_audio = gr.Audio(label="TTS - Intended Sentence", type="numpy")
|
209 |
+
pass1_tts_audio = gr.Audio(label="TTS - Pass1 Output", type="numpy")
|
210 |
|
211 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
212 |
|
|
|
214 |
submit_btn.click(
|
215 |
fn=compare_pronunciation,
|
216 |
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|
217 |
+
outputs=[
|
218 |
+
pass1_out, pass2_out, hk_out, wer_out, cer_out,
|
219 |
+
diff_html_box, intended_tts_audio, pass1_tts_audio,
|
220 |
+
char_html_box, intended_display
|
221 |
+
]
|
222 |
)
|
223 |
|
224 |
if __name__ == "__main__":
|