sudhanm committed
Commit 12e638e · verified · 1 Parent(s): e82b46f

Update app.py

Files changed (1)
  1. app.py +83 -50
app.py CHANGED
@@ -3,18 +3,24 @@ import random
 import difflib
 import re
 import jiwer
+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
 from faster_whisper import WhisperModel
 from indic_transliteration import sanscript
 from indic_transliteration.sanscript import transliterate
-from transformers import pipeline  # only pipeline is needed for TTS
+import soundfile as sf

 # ---------------- CONFIG ---------------- #
 MODEL_NAME = "large-v2"
-DEVICE = "cpu"  # change to "cuda" if GPU available in Space
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

 LANG_CODES = {
-    "English": "en", "Tamil": "ta", "Malayalam": "ml",
-    "Hindi": "hi", "Sanskrit": "sa"
+    "English": "en",
+    "Tamil": "ta",
+    "Malayalam": "ml",
+    "Hindi": "hi",
+    "Sanskrit": "sa"
 }

 LANG_PRIMERS = {
@@ -22,7 +28,7 @@ LANG_PRIMERS = {
         "Write only in English without translation. Example: This is an English sentence."),
     "Tamil": ("நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
         "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
-    "Malayalam": ("ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
+    "Malayalam": ("ട്രാൻസ്ഖ്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
         "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."),
     "Hindi": ("प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
         "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"),
@@ -31,29 +37,39 @@ LANG_PRIMERS = {
 }

 SCRIPT_PATTERNS = {
-    "Tamil": re.compile(r"[\u0B80-\u0BFF]"),
-    "Malayalam": re.compile(r"[\u0D00-\u0D7F]"),
-    "Hindi": re.compile(r"[\u0900-\u097F]"),
-    "Sanskrit": re.compile(r"[\u0900-\u097F]"),
+    "Tamil": re.compile(r"[஀-௿]"),
+    "Malayalam": re.compile(r"[ഀ-ൿ]"),
+    "Hindi": re.compile(r"[ऀ-ॿ]"),
+    "Sanskrit": re.compile(r"[ऀ-ॿ]"),
     "English": re.compile(r"[A-Za-z]")
 }

 SENTENCE_BANK = {
-    "English": ["The sun sets over the horizon.",
-        "Learning languages is fun.",
-        "I like to drink coffee in the morning."],
-    "Tamil": ["இன்று நல்ல வானிலை உள்ளது.",
-        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
-        "எனக்கு புத்தகம் படிக்க விருப்பம்."],
-    "Malayalam": ["എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
-        "ഇന്ന് മഴപെയ്യുന്നു.",
-        "ഞാൻ പുസ്തകം വായിക്കുന്നു."],
-    "Hindi": ["आज मौसम अच्छा है।",
-        "मुझे हिंदी बोलना पसंद है।",
-        "मैं किताब पढ़ रहा हूँ।"],
-    "Sanskrit": ["अहं ग्रन्थं पठामि।",
-        "अद्य सूर्यः तेजस्वी अस्ति।",
-        "मम नाम रामः।"]
+    "English": [
+        "The sun sets over the horizon.",
+        "Learning languages is fun.",
+        "I like to drink coffee in the morning."
+    ],
+    "Tamil": [
+        "இன்று நல்ல வானிலை உள்ளது.",
+        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
+        "எனக்கு புத்தகம் படிக்க விருப்பம்."
+    ],
+    "Malayalam": [
+        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
+        "ഇന്ന് മഴപെയ്യുന്നു.",
+        "ഞാൻ പുസ്തകം വായിക്കുന്നു."
+    ],
+    "Hindi": [
+        "आज मौसम अच्छा है।",
+        "मुझे हिंदी बोलना पसंद है।",
+        "मैं किताब पढ़ रहा हूँ।"
+    ],
+    "Sanskrit": [
+        "अहं ग्रन्थं पठामि।",
+        "अद्य सूर्यः तेजस्वी अस्ति।",
+        "मम नाम रामः।"
+    ]
 }

 VOICE_STYLE = {
@@ -68,31 +84,38 @@ VOICE_STYLE = {
 print("Loading Whisper model...")
 whisper_model = WhisperModel(MODEL_NAME, device=DEVICE)

-print("Loading IndicParler-TTS via pipeline...")
-TTS_MODEL_ID = "ai4bharat/indic-parler-tts"
-tts_pipe = pipeline("text-to-speech", model=TTS_MODEL_ID)
+print("Loading Parler-TTS model...")
+parler_model_id = "parler-tts/parler-tts-mini-v1"  # You may switch to larger models if desired
+parler_tts_model = ParlerTTSForConditionalGeneration.from_pretrained(parler_model_id).to(DEVICE)
+parler_tts_tokenizer = AutoTokenizer.from_pretrained(parler_model_id)

 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
     return random.choice(SENTENCE_BANK[language_choice])

 def is_script(text, lang_name):
-    pat = SCRIPT_PATTERNS.get(lang_name)
-    return bool(pat.search(text)) if pat else True
+    pattern = SCRIPT_PATTERNS.get(lang_name)
+    return bool(pattern.search(text)) if pattern else True

 def transliterate_to_hk(text, lang_choice):
     mapping = {
-        "Tamil": sanscript.TAMIL, "Malayalam": sanscript.MALAYALAM,
-        "Hindi": sanscript.DEVANAGARI, "Sanskrit": sanscript.DEVANAGARI,
+        "Tamil": sanscript.TAMIL,
+        "Malayalam": sanscript.MALAYALAM,
+        "Hindi": sanscript.DEVANAGARI,
+        "Sanskrit": sanscript.DEVANAGARI,
         "English": None
     }
     return transliterate(text, mapping[lang_choice], sanscript.HK) if mapping[lang_choice] else text

 def transcribe_once(audio_path, lang_code, initial_prompt, beam_size, temperature, condition_on_previous_text):
     segments, _ = whisper_model.transcribe(
-        audio_path, language=lang_code, task="transcribe",
-        initial_prompt=initial_prompt, beam_size=beam_size,
-        temperature=temperature, condition_on_previous_text=condition_on_previous_text,
+        audio_path,
+        language=lang_code,
+        task="transcribe",
+        initial_prompt=initial_prompt,
+        beam_size=beam_size,
+        temperature=temperature,
+        condition_on_previous_text=condition_on_previous_text,
         word_timestamps=False
     )
     return "".join(s.text for s in segments).strip()
@@ -128,50 +151,59 @@ def char_level_highlight(ref, hyp):
 def synthesize_tts(text, lang_choice):
     if not text.strip():
         return None
-    prompt_style = VOICE_STYLE.get(lang_choice, "")
-    audio_out = tts_pipe(text, forward_params={"description": prompt_style})
-    return (audio_out["sampling_rate"], audio_out["audio"])
+    description = VOICE_STYLE.get(lang_choice, "")
+    description_input = parler_tts_tokenizer(description, return_tensors='pt').to(DEVICE)
+    prompt_input = parler_tts_tokenizer(text, return_tensors='pt').to(DEVICE)
+    generation = parler_tts_model.generate(
+        input_ids=description_input.input_ids,
+        attention_mask=description_input.attention_mask,
+        prompt_input_ids=prompt_input.input_ids,
+        prompt_attention_mask=prompt_input.attention_mask
+    )
+    audio_arr = generation.cpu().numpy().squeeze()
+    # Parler-TTS default sample rate is 24000
+    return 24000, audio_arr

 # ---------------- MAIN ---------------- #
 def compare_pronunciation(audio, language_choice, intended_sentence,
-                          pass1_beam, pass1_temp, pass1_condition):
+                          pass1_beam, pass1_temp, pass1_condition):
     if audio is None or not intended_sentence.strip():
-        return "No audio or intended sentence.", "", "", "", "", "", None, None, "", ""
+        return ("No audio or intended sentence.", "", "", "", "", "",
+                None, None, "", "")

     lang_code = LANG_CODES[language_choice]
     primer_weak, primer_strong = LANG_PRIMERS[language_choice]

-    # Pass 1
+    # Pass 1: raw transcription with user-configured decoding parameters
     actual_text = transcribe_once(audio, lang_code, primer_weak,
                                   pass1_beam, pass1_temp, pass1_condition)

-    # Pass 2 (fixed best-known defaults)
+    # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
     strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
     corrected_text = transcribe_once(audio, lang_code, strict_prompt,
                                      beam_size=5, temperature=0.0, condition_on_previous_text=False)

-    # Scores
+    # Compute WER and CER
     wer_val = jiwer.wer(intended_sentence, actual_text)
     cer_val = jiwer.cer(intended_sentence, actual_text)

-    # Transliteration
-    hk_translit = transliterate_to_hk(actual_text, language_choice) \
-        if is_script(actual_text, language_choice) \
-        else f"[Script mismatch: expected {language_choice}]"
+    # Transliteration of Pass 1 output
+    hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"

-    # Highlights
+    # Highlight word-level and character-level differences
     diff_html = highlight_differences(intended_sentence, actual_text)
     char_html = char_level_highlight(intended_sentence, actual_text)

-    # TTS
+    # Synthesized TTS audios for intended and Pass 1 text
     tts_intended = synthesize_tts(intended_sentence, language_choice)
     tts_pass1 = synthesize_tts(actual_text, language_choice)

-    return actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}", diff_html, tts_intended, tts_pass1, char_html, intended_sentence
+    return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
+            diff_html, tts_intended, tts_pass1, char_html, intended_sentence)

 # ---------------- UI ---------------- #
 with gr.Blocks() as demo:
-    gr.Markdown("## 🎙 Pronunciation Comparator + IndicParler‑TTS + Error Highlighting")
+    gr.Markdown("## 🎙 Pronunciation Comparator + Parler-TTS + Highlights")

     with gr.Row():
         lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
@@ -204,6 +236,7 @@ with gr.Blocks() as demo:
     gen_btn.click(fn=get_random_sentence, inputs=[lang_choice], outputs=[intended_display])

     submit_btn = gr.Button("Analyze Pronunciation")
+
     submit_btn.click(
         fn=compare_pronunciation,
         inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
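
On the Gradio side (the outputs list continues past the end of this diff), the `(sample_rate, numpy_array)` tuples returned by `synthesize_tts` match the value format `gr.Audio` expects with its default `type="numpy"`. A hypothetical wiring sketch; the component names are illustrative, not the ones used later in app.py.

```python
import gradio as gr

with gr.Blocks() as sketch:
    # Hypothetical audio outputs; synthesize_tts feeds them (sample_rate, waveform) tuples.
    intended_tts_audio = gr.Audio(label="Intended sentence (TTS)")  # default type="numpy"
    pass1_tts_audio = gr.Audio(label="Pass 1 transcript (TTS)")
```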