sudhanm committed
Commit 024461f · verified · 1 Parent(s): 455645c

Update app.py

Files changed (1):
  1. app.py  +118 -106
app.py CHANGED
@@ -1,45 +1,35 @@
import gradio as gr
import random
import difflib
from faster_whisper import WhisperModel
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
- import re
- import jiwer

# ---------------- CONFIG ---------------- #
MODEL_NAME = "large-v2"
DEVICE = "cpu"

LANG_CODES = {
-     "English": "en",
-     "Tamil": "ta",
-     "Malayalam": "ml",
-     "Hindi": "hi",
-     "Sanskrit": "sa"
}

LANG_PRIMERS = {
-     "English": (
-         "The transcript should be in English only.",
-         "Write only in English without translation. Example: This is an English sentence."
-     ),
-     "Tamil": (
-         "நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
-         "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."
-     ),
-     "Malayalam": (
-         "ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
-         "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."
-     ),
-     "Hindi": (
-         "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
-         "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"
-     ),
-     "Sanskrit": (
-         "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
-         "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।"
-     )
}

SCRIPT_PATTERNS = {
@@ -51,72 +41,69 @@ SCRIPT_PATTERNS = {
}

SENTENCE_BANK = {
-     "English": [
-         "The sun sets over the horizon.",
-         "Learning languages is fun.",
-         "I like to drink coffee in the morning."
-     ],
-     "Tamil": [
-         "இன்று நல்ல வானிலை உள்ளது.",
-         "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
-         "எனக்கு புத்தகம் படிக்க விருப்பம்."
-     ],
-     "Malayalam": [
-         "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
-         "ഇന്ന് മഴപെയ്യുന്നു.",
-         "ഞാൻ പുസ്തകം വായിക്കുന്നു."
-     ],
-     "Hindi": [
-         "आज मौसम अच्छा है।",
-         "मुझे हिंदी बोलना पसंद है।",
-         "मैं किताब पढ़ रहा हूँ।"
-     ],
-     "Sanskrit": [
-         "अहं ग्रन्थं पठामि।",
-         "अद्य सूर्यः तेजस्वी अस्ति।",
-         "मम नाम रामः।"
-     ]
}

- # ---------------- MODEL ---------------- #
print("Loading Whisper model...")
- model = WhisperModel(MODEL_NAME, device=DEVICE)

# ---------------- HELPERS ---------------- #
def is_script(text, lang_name):
-     pattern = SCRIPT_PATTERNS.get(lang_name)
-     return bool(pattern.search(text)) if pattern else True

def transliterate_to_hk(text, lang_choice):
    mapping = {
-         "Tamil": sanscript.TAMIL,
-         "Malayalam": sanscript.MALAYALAM,
-         "Hindi": sanscript.DEVANAGARI,
-         "Sanskrit": sanscript.DEVANAGARI,
        "English": None
    }
    return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text

def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
-     segments, _ = model.transcribe(
-         audio_path,
-         language=lang_code,
-         task="transcribe",
-         initial_prompt=initial_prompt,
-         beam_size=beam_size,
-         temperature=temperature,
-         condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False
    )
    return "".join(s.text for s in segments).strip()

- def get_random_sentence(language_choice):
-     return random.choice(SENTENCE_BANK[language_choice])
-
def highlight_differences(ref, hyp):
-     """Return HTML string highlighting differences between ref and hyp at word level."""
-     ref_words = ref.strip().split()
-     hyp_words = hyp.strip().split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    out_html = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
@@ -131,57 +118,73 @@ def highlight_differences(ref, hyp):
            out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
    return " ".join(out_html)

- # ---------------- MAIN PIPELINE ---------------- #
- def compare_pronunciation(audio, language_choice, intended_sentence, pass1_beam, pass1_temp, pass1_condition):
    if audio is None or not intended_sentence.strip():
-         return "No audio or intended sentence provided.", "", "", "", "", ""

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

-     # Pass 1
-     actual_text = transcribe_once(
-         audio_path=audio,
-         lang_code=lang_code,
-         initial_prompt=primer_weak,
-         beam_size=pass1_beam,
-         temperature=pass1_temp,
-         condition_on_previous_text=pass1_condition
-     )

-     # Pass 2 (fixed settings)
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
-     corrected_text = transcribe_once(
-         audio_path=audio,
-         lang_code=lang_code,
-         initial_prompt=strict_prompt,
-         beam_size=5,
-         temperature=0.0,
-         condition_on_previous_text=False
-     )

    # Scores
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

-     # HK translit
-     hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"

-     # Highlighted diff
    diff_html = highlight_differences(intended_sentence, actual_text)

-     return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html

# ---------------- UI ---------------- #
with gr.Blocks() as demo:
-     gr.Markdown("# 🎙 Pronunciation Comparator with Random Sentence & Word Highlighting\n"
-                 "Generate a sentence, read it aloud, and see exactly which words differ from the target.")

    with gr.Row():
        lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
        gen_btn = gr.Button("🎲 Generate Sentence")

-     intended_display = gr.Textbox(label="Generated Sentence (Read this aloud)", interactive=False)

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
@@ -195,10 +198,15 @@ with gr.Blocks() as demo:
    hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")

    with gr.Row():
-         wer_out = gr.Textbox(label="Word Error Rate vs Intended")
-         cer_out = gr.Textbox(label="Character Error Rate vs Intended")

-     diff_html_box = gr.HTML(label="Differences Highlighted")

    gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])

@@ -206,7 +214,11 @@ with gr.Blocks() as demo:
    submit_btn.click(
        fn=compare_pronunciation,
        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
-         outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out, diff_html_box]
    )

if __name__ == "__main__":

import gradio as gr
import random
import difflib
+ import re
+ import jiwer
+ import torch
+ import soundfile as sf
from faster_whisper import WhisperModel
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
+ from transformers import AutoModelForTextToSpeech, AutoTokenizer, pipeline

# ---------------- CONFIG ---------------- #
MODEL_NAME = "large-v2"
DEVICE = "cpu"

LANG_CODES = {
+     "English": "en", "Tamil": "ta", "Malayalam": "ml",
+     "Hindi": "hi", "Sanskrit": "sa"
}

LANG_PRIMERS = {
+     "English": ("The transcript should be in English only.",
+                 "Write only in English without translation. Example: This is an English sentence."),
+     "Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
+               "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
+     "Malayalam": ("ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
+                   "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
+     "Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
+               "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
+     "Sanskrit": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
+                  "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।")
}

SCRIPT_PATTERNS = {

}

SENTENCE_BANK = {
+     "English": ["The sun sets over the horizon.",
+                 "Learning languages is fun.",
+                 "I like to drink coffee in the morning."],
+     "Tamil": ["இன்று நல்ல வானிலை உள்ளது.",
+               "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
+               "எனக்கு புத்தகம் படிக்க விருப்பம்."],
+     "Malayalam": ["എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
+                   "ഇന്ന് മഴപെയ്യുന്നു.",
+                   "ഞാൻ പുസ്തകം വായിക്കുന്നു."],
+     "Hindi": ["आज मौसम अच्छा है।",
+               "मुझे हिंदी बोलना पसंद है।",
+               "मैं किताब पढ़ रहा हूँ।"],
+     "Sanskrit": ["अहं ग्रन्थं पठामि।",
+                  "अद्य सूर्यः तेजस्वी अस्ति।",
+                  "मम नाम रामः।"]
}

+ # Voice/style mapping for IndicParler-TTS
+ VOICE_STYLE = {
+     "English": "An English female voice with a neutral Indian accent.",
+     "Tamil": "A female speaker with a clear Tamil accent.",
+     "Malayalam": "A female speaker with a clear Malayali accent.",
+     "Hindi": "A female speaker with a neutral Hindi accent.",
+     "Sanskrit": "A female speaker reading in classical Sanskrit style."
+ }
+
+ # ---------------- LOAD MODELS ---------------- #
print("Loading Whisper model...")
+ whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)
+
+ print("Loading IndicParler-TTS...")
+ TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
+ tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_ID)
+ tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
+ tts_pipe = pipeline("text-to-speech", model=tts_model, tokenizer=tts_tokenizer)

# ---------------- HELPERS ---------------- #
+ def get_random_sentence(language_choice):
+     return random.choice(SENTENCE_BANK[language_choice])
+
def is_script(text, lang_name):
+     pat = SCRIPT_PATTERNS.get(lang_name)
+     return bool(pat.search(text)) if pat else True

def transliterate_to_hk(text, lang_choice):
    mapping = {
+         "Tamil": sanscript.TAMIL, "Malayalam": sanscript.MALAYALAM,
+         "Hindi": sanscript.DEVANAGARI, "Sanskrit": sanscript.DEVANAGARI,
        "English": None
    }
    return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text

def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
+     segments, _ = whisper_model.transcribe(
+         audio_path, language=lang_code, task="transcribe",
+         initial_prompt=initial_prompt, beam_size=beam_size,
+         temperature=temperature, condition_on_previous_text=condition_on_previous_text,
        word_timestamps=False
    )
    return "".join(s.text for s in segments).strip()

def highlight_differences(ref, hyp):
+     ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    out_html = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():

            out_html.extend([f"<span style='color:orange'>{w}</span>" for w in hyp_words[j1:j2]])
    return " ".join(out_html)

+ def char_level_highlight(ref, hyp):
+     # Highlight correct in green, incorrect in red underline
+     sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
+     out = []
+     for tag, i1, i2, j1, j2 in sm.get_opcodes():
+         if tag == 'equal':
+             out.extend([f"<span style='color:green'>{c}</span>" for c in ref[i1:i2]])
+         elif tag in ('replace', 'delete'):
+             out.extend([f"<span style='color:red;text-decoration:underline'>{c}</span>" for c in ref[i1:i2]])
+         elif tag == 'insert':
+             # Characters only in hyp - show orange
+             out.extend([f"<span style='color:orange'>{c}</span>" for c in hyp[j1:j2]])
+     return "".join(out)
+
+ def synthesize_tts(text, lang_choice):
+     if not text.strip():
+         return None
+     prompt_style = VOICE_STYLE.get(lang_choice, "")
+     audio_out = tts_pipe(text, forward_params={"description": prompt_style})
+     return (audio_out["sampling_rate"], audio_out["audio"])
+
+ # ---------------- MAIN ---------------- #
+ def compare_pronunciation(audio, language_choice, intended_sentence,
+                           pass1_beam, pass1_temp, pass1_condition):
    if audio is None or not intended_sentence.strip():
+         return "No audio or intended sentence.", "", "", "", "", "", None, None, "", ""

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

+     # Pass 1 - actual speech
+     actual_text = transcribe_once(audio, lang_code, primer_weak,
+                                   pass1_beam, pass1_temp, pass1_condition)

+     # Pass 2 - target biased (fixed)
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
+     corrected_text = transcribe_once(audio, lang_code, strict_prompt,
+                                      beam_size=5, temperature=0.0, condition_on_previous_text=False)

    # Scores
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

+     # Transliteration - pass1
+     hk_translit = transliterate_to_hk(actual_text, language_choice) \
+         if is_script(actual_text, language_choice) \
+         else f"[Script mismatch: expected {language_choice}]"

    diff_html = highlight_differences(intended_sentence, actual_text)
+     char_html = char_level_highlight(intended_sentence, actual_text)
+
+     # TTS for intended & pass1
+     tts_intended = synthesize_tts(intended_sentence, language_choice)
+     tts_pass1 = synthesize_tts(actual_text, language_choice)

+     return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html, tts_intended, tts_pass1, char_html, intended_sentence

# ---------------- UI ---------------- #
with gr.Blocks() as demo:
+     gr.Markdown("## 🎙 Pronunciation Comparator + IndicParler-TTS + Error Highlighting\n"
+                 "Generate sentence → Listen to TTS → Read aloud → See errors → Listen to your transcription")

    with gr.Row():
        lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
        gen_btn = gr.Button("🎲 Generate Sentence")

+     intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")

    hk_out = gr.Textbox(label="Harvard-Kyoto Transliteration (Pass 1)")

    with gr.Row():
+         wer_out = gr.Textbox(label="Word Error Rate")
+         cer_out = gr.Textbox(label="Character Error Rate")

+     diff_html_box = gr.HTML(label="Word Differences Highlighted")
+     char_html_box = gr.HTML(label="Character-Level Highlighting (mispronounced = red underline)")
+
+     with gr.Row():
+         intended_tts_audio = gr.Audio(label="TTS - Intended Sentence", type="numpy")
+         pass1_tts_audio = gr.Audio(label="TTS - Pass1 Output", type="numpy")

    gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])

    submit_btn.click(
        fn=compare_pronunciation,
        inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
+         outputs=[
+             pass1_out, pass2_out, hk_out, wer_out, cer_out,
+             diff_html_box, intended_tts_audio, pass1_tts_audio,
+             char_html_box, intended_display
+         ]
    )

if __name__ == "__main__":