Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -3,18 +3,24 @@ import random
|
|
3 |
import difflib
|
4 |
import re
|
5 |
import jiwer
|
|
|
|
|
|
|
6 |
from faster_whisper import WhisperModel
|
7 |
from indic_transliteration import sanscript
|
8 |
from indic_transliteration.sanscript import transliterate
|
9 |
-
|
10 |
|
11 |
# ---------------- CONFIG ---------------- #
|
12 |
MODEL_NAME = "large-v2"
|
13 |
-
DEVICE = "
|
14 |
|
15 |
LANG_CODES = {
|
16 |
-
"English": "en",
|
17 |
-
"
|
|
|
|
|
|
|
18 |
}
|
19 |
|
20 |
LANG_PRIMERS = {
|
@@ -22,7 +28,7 @@ LANG_PRIMERS = {
|
|
22 |
"Write only in English without translation. Example: This is an English sentence."),
|
23 |
"Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
|
24 |
"தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
|
25 |
-
"Malayalam": ("
|
26 |
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
|
27 |
"Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
28 |
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
|
@@ -31,29 +37,39 @@ LANG_PRIMERS = {
|
|
31 |
}
|
32 |
|
33 |
SCRIPT_PATTERNS = {
|
34 |
-
"Tamil": re.compile(r"[
|
35 |
-
"Malayalam": re.compile(r"[
|
36 |
-
"Hindi": re.compile(r"[
|
37 |
-
"Sanskrit": re.compile(r"[
|
38 |
"English": re.compile(r"[A-Za-z]")
|
39 |
}
|
40 |
|
41 |
SENTENCE_BANK = {
|
42 |
-
"English": [
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
|
59 |
VOICE_STYLE = {
|
@@ -68,31 +84,38 @@ VOICE_STYLE = {
|
|
68 |
print("Loading Whisper model...")
|
69 |
whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
|
70 |
|
71 |
-
print("Loading
|
72 |
-
|
73 |
-
|
|
|
74 |
|
75 |
# ---------------- HELPERS ---------------- #
|
76 |
def get_random_sentence(language_choice):
|
77 |
return random.choice(SENTENCE_BANK[language_choice])
|
78 |
|
79 |
def is_script(text, lang_name):
|
80 |
-
|
81 |
-
return bool(
|
82 |
|
83 |
def transliterate_to_hk(text, lang_choice):
|
84 |
mapping = {
|
85 |
-
"Tamil": sanscript.TAMIL,
|
86 |
-
"
|
|
|
|
|
87 |
"English": None
|
88 |
}
|
89 |
return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
|
90 |
|
91 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
|
92 |
segments, _ = whisper_model.transcribe(
|
93 |
-
audio_path,
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
96 |
word_timestamps=False
|
97 |
)
|
98 |
return "".join(s.text for s in segments).strip()
|
@@ -128,50 +151,59 @@ def char_level_highlight(ref, hyp):
|
|
128 |
def synthesize_tts(text, lang_choice):
|
129 |
if not text.strip():
|
130 |
return None
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
# ---------------- MAIN ---------------- #
|
136 |
def compare_pronunciation(audio, language_choice, intended_sentence,
|
137 |
-
|
138 |
if audio is None or not intended_sentence.strip():
|
139 |
-
return "No audio or intended sentence.", "", "", "", "", "",
|
|
|
140 |
|
141 |
lang_code = LANG_CODES[language_choice]
|
142 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
143 |
|
144 |
-
# Pass 1
|
145 |
actual_text = transcribe_once(audio, lang_code, primer_weak,
|
146 |
pass1_beam, pass1_temp, pass1_condition)
|
147 |
|
148 |
-
# Pass 2 (fixed
|
149 |
strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
|
150 |
corrected_text = transcribe_once(audio, lang_code, strict_prompt,
|
151 |
beam_size=5, temperature=0.0, condition_on_previous_text=False)
|
152 |
|
153 |
-
#
|
154 |
wer_val = jiwer.wer(intended_sentence, actual_text)
|
155 |
cer_val = jiwer.cer(intended_sentence, actual_text)
|
156 |
|
157 |
-
# Transliteration
|
158 |
-
hk_translit = transliterate_to_hk(actual_text, language_choice)
|
159 |
-
if is_script(actual_text, language_choice) \
|
160 |
-
else f"[Script mismatch: expected {language_choice}]"
|
161 |
|
162 |
-
#
|
163 |
diff_html = highlight_differences(intended_sentence, actual_text)
|
164 |
char_html = char_level_highlight(intended_sentence, actual_text)
|
165 |
|
166 |
-
# TTS
|
167 |
tts_intended = synthesize_tts(intended_sentence, language_choice)
|
168 |
tts_pass1 = synthesize_tts(actual_text, language_choice)
|
169 |
|
170 |
-
return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
|
|
|
171 |
|
172 |
# ---------------- UI ---------------- #
|
173 |
with gr.Blocks() as demo:
|
174 |
-
gr.Markdown("## 🎙 Pronunciation Comparator +
|
175 |
|
176 |
with gr.Row():
|
177 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
@@ -204,6 +236,7 @@ with gr.Blocks() as demo:
|
|
204 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
205 |
|
206 |
submit_btn = gr.Button("Analyze Pronunciation")
|
|
|
207 |
submit_btn.click(
|
208 |
fn=compare_pronunciation,
|
209 |
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|
|
|
3 |
import difflib
|
4 |
import re
|
5 |
import jiwer
|
6 |
+
import torch
|
7 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
8 |
+
from transformers import AutoTokenizer
|
9 |
from faster_whisper import WhisperModel
|
10 |
from indic_transliteration import sanscript
|
11 |
from indic_transliteration.sanscript import transliterate
|
12 |
+
import soundfile as sf
|
13 |
|
14 |
# ---------------- CONFIG ---------------- #
|
15 |
# Whisper checkpoint loaded by faster-whisper below.
MODEL_NAME = "large-v2"
# Run on GPU when CUDA is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
17 |
|
18 |
# Whisper language codes keyed by the UI language name.
LANG_CODES = {
    "English": "en",
    "Tamil": "ta",
    "Malayalam": "ml",
    "Hindi": "hi",
    "Sanskrit": "sa",
}
|
25 |
|
26 |
LANG_PRIMERS = {
|
|
|
28 |
"Write only in English without translation. Example: This is an English sentence."),
|
29 |
"Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
|
30 |
"தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
|
31 |
+
"Malayalam": ("ട്രാൻസ്ഖ്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
|
32 |
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
|
33 |
"Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
34 |
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
|
|
|
37 |
}
|
38 |
|
39 |
# One-character probes used to check that a transcript is in the expected
# script.  Ranges are the standard Unicode blocks for each script, written
# as escapes so they survive copy/paste and encoding round-trips.
SCRIPT_PATTERNS = {
    # Tamil block U+0B80-U+0BFF.
    "Tamil": re.compile(r"[\u0B80-\u0BFF]"),
    # Malayalam block U+0D00-U+0D7F.
    "Malayalam": re.compile(r"[\u0D00-\u0D7F]"),
    # Devanagari block U+0900-U+097F covers both Hindi and Sanskrit.
    "Hindi": re.compile(r"[\u0900-\u097F]"),
    "Sanskrit": re.compile(r"[\u0900-\u097F]"),
    "English": re.compile(r"[A-Za-z]")
}
|
46 |
|
47 |
# Practice sentences offered to the user, three per supported language.
# Keys must match LANG_CODES.  (The first Sanskrit sentence previously
# contained U+FFFD replacement characters from an encoding corruption;
# restored to the intended "pathAmi" form.)
SENTENCE_BANK = {
    "English": [
        "The sun sets over the horizon.",
        "Learning languages is fun.",
        "I like to drink coffee in the morning."
    ],
    "Tamil": [
        "இன்று நல்ல வானிலை உள்ளது.",
        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
        "எனக்கு புத்தகம் படிக்க விருப்பம்."
    ],
    "Malayalam": [
        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
        "ഇന്ന് മഴപെയ്യുന്നു.",
        "ഞാൻ പുസ്തകം വായിക്കുന്നു."
    ],
    "Hindi": [
        "आज मौसम अच्छा है।",
        "मुझे हिंदी बोलना पसंद है।",
        "मैं किताब पढ़ रहा हूँ।"
    ],
    "Sanskrit": [
        "अहं ग्रन्थं पठामि।",
        "अद्य सूर्यः तेजस्वी अस्ति।",
        "मम नाम रामः।"
    ]
}
|
74 |
|
75 |
VOICE_STYLE = {
|
|
|
84 |
# Load both inference models once at import time so every request reuses them.
print("Loading Whisper model...")
whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)

print("Loading Parler-TTS model...")
# Mini checkpoint keeps startup fast; a larger Parler-TTS checkpoint works too.
parler_model_id = "parler-tts/parler-tts-mini-v1"
parler_tts_model = ParlerTTSForConditionalGeneration.from_pretrained(parler_model_id).to(DEVICE)
parler_tts_tokenizer = AutoTokenizer.from_pretrained(parler_model_id)
|
91 |
|
92 |
# ---------------- HELPERS ---------------- #
|
93 |
def get_random_sentence(language_choice):
    """Pick one practice sentence at random for the selected language."""
    options = SENTENCE_BANK[language_choice]
    return random.choice(options)
|
95 |
|
96 |
def is_script(text, lang_name):
    """Return True if *text* contains at least one character of the script
    expected for *lang_name*; languages with no registered pattern pass."""
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if pattern is None:
        return True
    return pattern.search(text) is not None
|
99 |
|
100 |
def transliterate_to_hk(text, lang_choice):
    """Romanize *text* to Harvard-Kyoto; English text passes through unchanged."""
    scheme_for = {
        "Tamil": sanscript.TAMIL,
        "Malayalam": sanscript.MALAYALAM,
        "Hindi": sanscript.DEVANAGARI,
        "Sanskrit": sanscript.DEVANAGARI,
        "English": None,
    }
    source_scheme = scheme_for[lang_choice]
    if not source_scheme:
        return text
    return transliterate(text, source_scheme, sanscript.HK)
|
109 |
|
110 |
def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
    """Run one faster-whisper decoding pass and return the joined transcript.

    All decoding knobs are forwarded to ``WhisperModel.transcribe``;
    word-level timestamps are never requested.
    """
    decode_kwargs = {
        "language": lang_code,
        "task": "transcribe",
        "initial_prompt": initial_prompt,
        "beam_size": beam_size,
        "temperature": temperature,
        "condition_on_previous_text": condition_on_previous_text,
        "word_timestamps": False,
    }
    segments, _ = whisper_model.transcribe(audio_path, **decode_kwargs)
    pieces = [segment.text for segment in segments]
    return "".join(pieces).strip()
|
|
|
151 |
def synthesize_tts(text, lang_choice):
    """Synthesize speech for *text* with Parler-TTS.

    The language's VOICE_STYLE entry is used as the speaker description.
    Returns a ``(sample_rate, waveform)`` tuple suitable for a Gradio Audio
    component, or None for blank input.
    """
    if not text.strip():
        return None
    description = VOICE_STYLE.get(lang_choice, "")
    description_input = parler_tts_tokenizer(description, return_tensors='pt').to(DEVICE)
    prompt_input = parler_tts_tokenizer(text, return_tensors='pt').to(DEVICE)
    # Inference only: no_grad avoids building an autograd graph and saves memory.
    with torch.no_grad():
        generation = parler_tts_model.generate(
            input_ids=description_input.input_ids,
            attention_mask=description_input.attention_mask,
            prompt_input_ids=prompt_input.input_ids,
            prompt_attention_mask=prompt_input.attention_mask
        )
    audio_arr = generation.cpu().numpy().squeeze()
    # Read the rate from the model config instead of hard-coding 24000, so
    # swapping in a different Parler-TTS checkpoint stays correct; 24000 is
    # the documented default for parler-tts-mini-v1.
    sample_rate = getattr(parler_tts_model.config, "sampling_rate", 24000)
    return sample_rate, audio_arr
|
166 |
|
167 |
# ---------------- MAIN ---------------- #
|
168 |
def compare_pronunciation(audio, language_choice, intended_sentence,
                          pass1_beam, pass1_temp, pass1_condition):
    """Run the two-pass pronunciation analysis and build all UI outputs.

    Returns a 10-tuple: (pass-1 transcript, pass-2 transcript, HK
    transliteration, WER, CER, word-diff HTML, intended-sentence TTS,
    pass-1 TTS, char-diff HTML, intended sentence).
    """
    # Guard clause: nothing to analyze without audio and a target sentence.
    if audio is None or not intended_sentence.strip():
        return ("No audio or intended sentence.", "", "", "", "", "",
                None, None, "", "")

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: decode with the caller-supplied parameters and the weak primer.
    actual_text = transcribe_once(audio, lang_code, primer_weak,
                                  pass1_beam, pass1_temp, pass1_condition)

    # Pass 2: fixed, conservative decoding biased toward the target sentence.
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
    corrected_text = transcribe_once(audio, lang_code, strict_prompt,
                                     beam_size=5, temperature=0.0,
                                     condition_on_previous_text=False)

    # Error rates are measured against the uncorrected pass-1 transcript.
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

    # Romanize pass 1 only when it is actually in the expected script.
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"

    # Visual diffs between what was intended and what was heard.
    diff_html = highlight_differences(intended_sentence, actual_text)
    char_html = char_level_highlight(intended_sentence, actual_text)

    # Reference audio for the intended sentence and the pass-1 transcript.
    tts_intended = synthesize_tts(intended_sentence, language_choice)
    tts_pass1 = synthesize_tts(actual_text, language_choice)

    return (actual_text, corrected_text, hk_translit,
            f"{wer_val:.2f}", f"{cer_val:.2f}",
            diff_html, tts_intended, tts_pass1, char_html, intended_sentence)
|
203 |
|
204 |
# ---------------- UI ---------------- #
|
205 |
with gr.Blocks() as demo:
|
206 |
+
gr.Markdown("## 🎙 Pronunciation Comparator + Parler-TTS + Highlights")
|
207 |
|
208 |
with gr.Row():
|
209 |
lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
|
|
|
236 |
gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
|
237 |
|
238 |
submit_btn = gr.Button("Analyze Pronunciation")
|
239 |
+
|
240 |
submit_btn.click(
|
241 |
fn=compare_pronunciation,
|
242 |
inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
|