sudhanm commited on
Commit
455645c
·
verified ·
1 Parent(s): 35c6cb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -34
app.py CHANGED
@@ -1,10 +1,11 @@
1
  import gradio as gr
2
  import random
 
3
  from faster_whisper import WhisperModel
4
  from indic_transliteration import sanscript
5
  from indic_transliteration.sanscript import transliterate
6
  import re
7
- import jiwer # pip install jiwer
8
 
9
  # ---------------- CONFIG ---------------- #
10
  MODEL_NAME = "large-v2"
@@ -49,7 +50,6 @@ SCRIPT_PATTERNS = {
49
  "English": re.compile(r"[A-Za-z]")
50
  }
51
 
52
- # Example sentence bank for random generation
53
  SENTENCE_BANK = {
54
  "English": [
55
  "The sun sets over the horizon.",
@@ -85,9 +85,7 @@ model = WhisperModel(MODEL_NAME, device=DEVICE)
85
  # ---------------- HELPERS ---------------- #
86
  def is_script(text, lang_name):
87
  pattern = SCRIPT_PATTERNS.get(lang_name)
88
- if not pattern:
89
- return True
90
- return bool(pattern.search(text))
91
 
92
  def transliterate_to_hk(text, lang_choice):
93
  mapping = {
@@ -97,13 +95,10 @@ def transliterate_to_hk(text, lang_choice):
97
  "Sanskrit": sanscript.DEVANAGARI,
98
  "English": None
99
  }
100
- if mapping[lang_choice]:
101
- return transliterate(text, mapping[lang_choice], sanscript.HK)
102
- else:
103
- return text
104
 
105
  def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
106
- segments, info = model.transcribe(
107
  audio_path,
108
  language=lang_code,
109
  task="transcribe",
@@ -118,50 +113,69 @@ def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperatur
118
  def get_random_sentence(language_choice):
119
  return random.choice(SENTENCE_BANK[language_choice])
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  # ---------------- MAIN PIPELINE ---------------- #
122
- def compare_pronunciation(audio, language_choice, intended_sentence, pass2_beam, pass2_temp, pass2_condition):
123
  if audio is None or not intended_sentence.strip():
124
- return "No audio or intended sentence provided.", "", "", "", ""
125
 
126
  lang_code = LANG_CODES[language_choice]
127
  primer_weak, primer_strong = LANG_PRIMERS[language_choice]
128
 
129
- # Pass 1: Actual speech (no bias with intended sentence)
130
  actual_text = transcribe_once(
131
  audio_path=audio,
132
  lang_code=lang_code,
133
  initial_prompt=primer_weak,
134
- beam_size=8,
135
- temperature=0.4,
136
- condition_on_previous_text=True
137
  )
138
 
139
- # Pass 2: Target-biased output
140
  strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
141
  corrected_text = transcribe_once(
142
  audio_path=audio,
143
  lang_code=lang_code,
144
  initial_prompt=strict_prompt,
145
- beam_size=pass2_beam,
146
- temperature=pass2_temp,
147
- condition_on_previous_text=pass2_condition
148
  )
149
 
150
- # Error Rates
151
  wer_val = jiwer.wer(intended_sentence, actual_text)
152
  cer_val = jiwer.cer(intended_sentence, actual_text)
153
 
154
- # Transliteration
155
- if is_script(actual_text, language_choice):
156
- hk_translit = transliterate_to_hk(actual_text, language_choice)
157
- else:
158
- hk_translit = f"[Script mismatch: expected {language_choice}]"
159
 
160
- return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}"
161
 
162
  # ---------------- UI ---------------- #
163
  with gr.Blocks() as demo:
164
- gr.Markdown("# 🎙️ Pronunciation Comparator with Random Sentence\nClick 'Generate Sentence', read it aloud, and compare actual vs intended output.")
 
165
 
166
  with gr.Row():
167
  lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
@@ -171,9 +185,9 @@ with gr.Blocks() as demo:
171
 
172
  with gr.Row():
173
  audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
174
- pass2_beam = gr.Slider(1, 10, value=5, step=1, label="Pass 2 Beam Size")
175
- pass2_temp = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Pass 2 Temperature")
176
- pass2_condition = gr.Checkbox(value=False, label="Pass 2: Condition on previous text")
177
 
178
  with gr.Row():
179
  pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
@@ -184,14 +198,15 @@ with gr.Blocks() as demo:
184
  wer_out = gr.Textbox(label="Word Error Rate vs Intended")
185
  cer_out = gr.Textbox(label="Character Error Rate vs Intended")
186
 
 
 
187
  gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
188
 
189
  submit_btn = gr.Button("Analyze Pronunciation")
190
-
191
  submit_btn.click(
192
  fn=compare_pronunciation,
193
- inputs=[audio_input, lang_choice, intended_display, pass2_beam, pass2_temp, pass2_condition],
194
- outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out]
195
  )
196
 
197
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import random
3
+ import difflib
4
  from faster_whisper import WhisperModel
5
  from indic_transliteration import sanscript
6
  from indic_transliteration.sanscript import transliterate
7
  import re
8
+ import jiwer
9
 
10
  # ---------------- CONFIG ---------------- #
11
  MODEL_NAME = "large-v2"
 
50
  "English": re.compile(r"[A-Za-z]")
51
  }
52
 
 
53
  SENTENCE_BANK = {
54
  "English": [
55
  "The sun sets over the horizon.",
 
85
  # ---------------- HELPERS ---------------- #
86
def is_script(text, lang_name):
    """Return True when *text* contains at least one character of *lang_name*'s script.

    Languages with no registered pattern (e.g. unknown names) are accepted
    unconditionally.
    """
    pattern = SCRIPT_PATTERNS.get(lang_name)
    if pattern is None:
        return True
    return pattern.search(text) is not None
 
 
89
 
90
  def transliterate_to_hk(text, lang_choice):
91
  mapping = {
 
95
  "Sanskrit": sanscript.DEVANAGARI,
96
  "English": None
97
  }
98
+ return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text
 
 
 
99
 
100
  def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
101
+ segments, _ = model.transcribe(
102
  audio_path,
103
  language=lang_code,
104
  task="transcribe",
 
113
def get_random_sentence(language_choice):
    """Pick one practice sentence at random for the chosen language."""
    bank = SENTENCE_BANK[language_choice]
    return random.choice(bank)
115
 
116
def highlight_differences(ref, hyp):
    """Return an HTML string highlighting word-level differences between *ref* and *hyp*.

    Colour key:
      green  - word matched exactly
      red    - reference word that was replaced (plain) or dropped (struck through)
      orange - word the speaker produced that is not in the reference

    Words are HTML-escaped before interpolation so that characters such as
    '<', '>' and '&' in either string cannot corrupt (or inject into) the
    markup rendered by the gr.HTML component.
    """
    import html  # local import: keeps the file's top-level import block untouched

    ref_words = ref.strip().split()
    hyp_words = hyp.strip().split()
    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
    out_html = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            out_html.extend(f"<span style='color:green'>{html.escape(w)}</span>" for w in ref_words[i1:i2])
        elif tag == 'replace':
            # Show both what was expected (red) and what was said (orange).
            out_html.extend(f"<span style='color:red'>{html.escape(w)}</span>" for w in ref_words[i1:i2])
            out_html.extend(f"<span style='color:orange'>{html.escape(w)}</span>" for w in hyp_words[j1:j2])
        elif tag == 'delete':
            out_html.extend(f"<span style='color:red;text-decoration:line-through'>{html.escape(w)}</span>" for w in ref_words[i1:i2])
        elif tag == 'insert':
            out_html.extend(f"<span style='color:orange'>{html.escape(w)}</span>" for w in hyp_words[j1:j2])
    return " ".join(out_html)
133
+
134
# ---------------- MAIN PIPELINE ---------------- #
def compare_pronunciation(audio, language_choice, intended_sentence, pass1_beam, pass1_temp, pass1_condition):
    """Run the two-pass transcription pipeline and score the speaker against the target.

    Returns a 6-tuple:
    (pass-1 transcript, pass-2 target-biased transcript, HK transliteration,
     WER string, CER string, HTML word-level diff).
    """
    if audio is None or not intended_sentence.strip():
        return "No audio or intended sentence provided.", "", "", "", "", ""

    lang_code = LANG_CODES[language_choice]
    primer_weak, primer_strong = LANG_PRIMERS[language_choice]

    # Pass 1: unbiased decode, driven by the caller-supplied decoding knobs.
    actual_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=primer_weak,
        beam_size=pass1_beam,
        temperature=pass1_temp,
        condition_on_previous_text=pass1_condition,
    )

    # Pass 2: decode again primed with the intended sentence, using fixed
    # deterministic settings (greedy-ish beam, zero temperature).
    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
    corrected_text = transcribe_once(
        audio_path=audio,
        lang_code=lang_code,
        initial_prompt=strict_prompt,
        beam_size=5,
        temperature=0.0,
        condition_on_previous_text=False,
    )

    # Error rates of the unbiased transcript versus the target sentence.
    wer_val = jiwer.wer(intended_sentence, actual_text)
    cer_val = jiwer.cer(intended_sentence, actual_text)

    # Harvard-Kyoto transliteration, only when the transcript is actually in
    # the expected script for the chosen language.
    if is_script(actual_text, language_choice):
        hk_translit = transliterate_to_hk(actual_text, language_choice)
    else:
        hk_translit = f"[Script mismatch: expected {language_choice}]"

    # Word-level HTML diff of intended vs. actually-spoken text.
    diff_html = highlight_differences(intended_sentence, actual_text)

    return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html
174
 
175
  # ---------------- UI ---------------- #
176
  with gr.Blocks() as demo:
177
+ gr.Markdown("# 🎙 Pronunciation Comparator with Random Sentence & Word Highlighting\n"
178
+ "Generate a sentence, read it aloud, and see exactly which words differ from the target.")
179
 
180
  with gr.Row():
181
  lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
 
185
 
186
  with gr.Row():
187
  audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
188
+ pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
189
+ pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
190
+ pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")
191
 
192
  with gr.Row():
193
  pass1_out = gr.Textbox(label="Pass 1: What You Actually Said")
 
198
  wer_out = gr.Textbox(label="Word Error Rate vs Intended")
199
  cer_out = gr.Textbox(label="Character Error Rate vs Intended")
200
 
201
+ diff_html_box = gr.HTML(label="Differences Highlighted")
202
+
203
  gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])
204
 
205
  submit_btn = gr.Button("Analyze Pronunciation")
 
206
  submit_btn.click(
207
  fn=compare_pronunciation,
208
+ inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
209
+ outputs=[pass1_out, pass2_out, hk_out, wer_out, cer_out, diff_html_box]
210
  )
211
 
212
  if __name__ == "__main__":