sudhanm committed
Commit 33e6674 · verified · 1 parent: 039f2f1

Update app.py

Files changed (1): app.py (+24 −551)
app.py CHANGED
@@ -1,519 +1,28 @@
-import gradio as gr
-import random
-import difflib
-import re
-import jiwer
-import torch
-import torchaudio
-import numpy as np
-from transformers import (
-    AutoProcessor,
-    AutoModelForSpeechSeq2Seq,
-    WhisperProcessor,
-    WhisperForConditionalGeneration
-)
-import librosa
-import soundfile as sf
-from indic_transliteration import sanscript
-from indic_transliteration.sanscript import transliterate
-import warnings
-import spaces
-warnings.filterwarnings("ignore")
-
-# ---------------- CONFIG ---------------- #
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🔧 Using device: {DEVICE}")
-
-LANG_CODES = {
-    "English": "en",
-    "Tamil": "ta",
-    "Malayalam": "ml"
-}
-
-# Updated model configurations with LARGE models for maximum accuracy
-ASR_MODELS = {
-    "English": "openai/whisper-base.en",
-    "Tamil": "ai4bharat/whisper-large-ta",        # LARGE AI4Bharat Tamil model (~1.5GB)
-    "Malayalam": "ai4bharat/whisper-large-ml"     # LARGE AI4Bharat Malayalam model (~1.5GB)
-}
-
-LANG_PRIMERS = {
-    "English": ("Transcribe in English.",
-                "Write only in English. Example: This is an English sentence."),
-    "Tamil": ("தமிழில் எழுதுக.",
-              "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும். உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."),
-    "Malayalam": ("മലയാളത്തിൽ എഴുതുക.",
-                  "മലയാള ലിപിയിൽ മാത്രം എഴുതുക. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്.")
-}
-
-SCRIPT_PATTERNS = {
-    "Tamil": re.compile(r"[஀-௿]"),
-    "Malayalam": re.compile(r"[ഀ-ൿ]"),
-    "English": re.compile(r"[A-Za-z]")
-}
-
-SENTENCE_BANK = {
-    "English": [
-        "The sun sets over the beautiful horizon.",
-        "Learning new languages opens many doors.",
-        "I enjoy reading books in the evening.",
-        "Technology has changed our daily lives.",
-        "Music brings people together across cultures.",
-        "Education is the key to a bright future.",
-        "The flowers bloom beautifully in spring.",
-        "Hard work always pays off in the end."
-    ],
-    "Tamil": [
-        "இன்று நல்ல வானிலை உள்ளது.",
-        "நான் தமிழ் கற்றுக்கொண்டு இருக்கிறேன்.",
-        "எனக்கு புத்தகம் படிக்க விருப்பம்.",
-        "தமிழ் மொழி மிகவும் அழகானது.",
-        "குடும்பத்துடன் நேரம் செலவிடுவது முக்கியம்.",
-        "கல்வி நமது எதிர்காலத்தின் திறவுகோல்.",
-        "பறவைகள் காலையில் இனிமையாக பாடுகின்றன.",
-        "உழைப்பு எப்போதும் வெற்றியைத் தரும்."
-    ],
-    "Malayalam": [
-        "എനിക്ക് മലയാളം വളരെ ഇഷ്ടമാണ്.",
-        "ഇന്ന് മഴപെയ്യുന്നു.",
-        "ഞാൻ പുസ്തകം വായിക്കുന്നു.",
-        "കേരളത്തിന്റെ പ്രകൃതി സുന്ദരമാണ്.",
-        "വിദ്യാഭ്യാസം ജീവിതത്തിൽ പ്രധാനമാണ്.",
-        "സംഗീതം മനസ്സിന് സന്തോഷം നൽകുന്നു.",
-        "കുടുംബസമയം വളരെ വിലപ്പെട്ടതാണ്.",
-        "കഠിനാധ്വാനം എപ്പോഴും ഫലം നൽകും."
-    ]
-}
-
-# ---------------- MODEL CACHE ---------------- #
-asr_models = {}
-
-@spaces.GPU
-def load_asr_model(language):
-    """Load ASR model for specific language - PRIMARY MODELS ONLY"""
-    if language not in asr_models:
-        model_name = ASR_MODELS[language]
-        print(f"🔄 Loading LARGE model for {language}: {model_name}")
-
-        try:
-            processor = AutoProcessor.from_pretrained(model_name)
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-                low_cpu_mem_usage=True,
-                use_safetensors=True
-            ).to(DEVICE)
-
-            asr_models[language] = {"processor": processor, "model": model, "model_name": model_name}
-            print(f"✅ LARGE model loaded successfully for {language}")
-
-        except Exception as e:
-            print(f"❌ Failed to load {model_name}: {e}")
-            raise Exception(f"Could not load {language} model. Please check model availability.")
-
-    return asr_models[language]
-
-# ---------------- HELPERS ---------------- #
-def get_random_sentence(language_choice):
-    """Get random sentence for practice"""
-    return random.choice(SENTENCE_BANK[language_choice])
-
-def is_script(text, lang_name):
-    """Check if text is in expected script"""
-    pattern = SCRIPT_PATTERNS.get(lang_name)
-    if not pattern:
-        return True
-    return bool(pattern.search(text))
-
-def transliterate_to_hk(text, lang_choice):
-    """Transliterate Indic text to Harvard-Kyoto"""
-    mapping = {
-        "Tamil": sanscript.TAMIL,
-        "Malayalam": sanscript.MALAYALAM,
-        "English": None
-    }
-
-    script = mapping.get(lang_choice)
-    if script and is_script(text, lang_choice):
-        try:
-            return transliterate(text, script, sanscript.HK)
-        except Exception as e:
-            print(f"Transliteration error: {e}")
-            return text
-    return text
-
-def preprocess_audio(audio_path, target_sr=16000):
-    """Preprocess audio for ASR"""
-    try:
-        # Load audio
-        audio, sr = librosa.load(audio_path, sr=target_sr)
-
-        # Normalize audio
-        if np.max(np.abs(audio)) > 0:
-            audio = audio / np.max(np.abs(audio))
-
-        # Remove silence from beginning and end
-        audio, _ = librosa.effects.trim(audio, top_db=20)
-
-        # Ensure minimum length
-        if len(audio) < target_sr * 0.1:  # Less than 0.1 seconds
-            return None, None
-
-        return audio, target_sr
-    except Exception as e:
-        print(f"Audio preprocessing error: {e}")
-        return None, None
-
-@spaces.GPU
-def transcribe_audio(audio_path, language, initial_prompt="", force_language=True):
-    """Transcribe audio using loaded models"""
-    try:
-        # Load model components
-        asr_components = load_asr_model(language)
-        processor = asr_components["processor"]
-        model = asr_components["model"]
-        model_name = asr_components["model_name"]
-
-        # Preprocess audio
-        audio, sr = preprocess_audio(audio_path)
-        if audio is None:
-            return "Error: Audio too short or could not be processed"
-
-        # Prepare inputs
-        inputs = processor(
-            audio,
-            sampling_rate=sr,
-            return_tensors="pt",
-            padding=True
-        )
-
-        # Move to device
-        input_features = inputs.input_features.to(DEVICE)
-
-        # Generate transcription
-        with torch.no_grad():
-            # Basic generation parameters
-            generate_kwargs = {
-                "input_features": input_features,
-                "max_length": 200,
-                "num_beams": 3,  # Reduced for better compatibility
-                "do_sample": False
-            }
-
-            # Try different approaches for language forcing
-            if force_language and language != "English":
-                lang_code = LANG_CODES.get(language, "en")
-
-                # Method 1: Try forced_decoder_ids (OpenAI Whisper style)
-                try:
-                    if hasattr(processor, 'get_decoder_prompt_ids'):
-                        forced_decoder_ids = processor.get_decoder_prompt_ids(
-                            language=lang_code,
-                            task="transcribe"
-                        )
-                        # Test if model accepts this parameter
-                        test_kwargs = generate_kwargs.copy()
-                        test_kwargs["max_length"] = 10
-                        test_kwargs["forced_decoder_ids"] = forced_decoder_ids
-                        _ = model.generate(**test_kwargs)  # Test run
-                        generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
-                        print(f"✅ Using forced_decoder_ids for {language}")
-                except Exception as e:
-                    print(f"⚠️ forced_decoder_ids not supported: {e}")
-
-                    # Method 2: Try language parameter
-                    try:
-                        test_kwargs = generate_kwargs.copy()
-                        test_kwargs["max_length"] = 10
-                        test_kwargs["language"] = lang_code
-                        _ = model.generate(**test_kwargs)  # Test run
-                        generate_kwargs["language"] = lang_code
-                        print(f"✅ Using language parameter for {language}")
-                    except Exception as e:
-                        print(f"⚠️ language parameter not supported: {e}")
-
-            # Generate with whatever parameters work
-            predicted_ids = model.generate(**generate_kwargs)
-
-        # Decode
-        transcription = processor.batch_decode(
-            predicted_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
-        )[0]
-
-        # Post-process transcription
-        transcription = transcription.strip()
-
-        # If we get empty transcription, try again with simpler parameters
-        if not transcription and generate_kwargs.get("num_beams", 1) > 1:
-            print("🔄 Retrying with greedy decoding...")
-            simple_kwargs = {
-                "input_features": input_features,
-                "max_length": 200,
-                "do_sample": False
-            }
-            predicted_ids = model.generate(**simple_kwargs)
-            transcription = processor.batch_decode(
-                predicted_ids,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=True
-            )[0].strip()
-
-        return transcription or "(No transcription generated)"
-
-    except Exception as e:
-        print(f"Transcription error for {language}: {e}")
-        return f"Error: {str(e)[:150]}..."
-
-def highlight_differences(ref, hyp):
-    """Highlight word-level differences with better styling"""
-    if not ref.strip() or not hyp.strip():
-        return "No text to compare"
-
-    ref_words = ref.strip().split()
-    hyp_words = hyp.strip().split()
-
-    sm = difflib.SequenceMatcher(None, ref_words, hyp_words)
-    out_html = []
-
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == 'equal':
-            out_html.extend([f"<span style='color:green; font-weight:bold; background-color:#e8f5e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
-        elif tag == 'replace':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
-            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>→{w}</span>" for w in hyp_words[j1:j2]])
-        elif tag == 'delete':
-            out_html.extend([f"<span style='color:red; text-decoration:line-through; background-color:#ffe8e8; padding:2px 4px; margin:1px; border-radius:3px;'>{w}</span>" for w in ref_words[i1:i2]])
-        elif tag == 'insert':
-            out_html.extend([f"<span style='color:orange; font-weight:bold; background-color:#fff3cd; padding:2px 4px; margin:1px; border-radius:3px;'>+{w}</span>" for w in hyp_words[j1:j2]])
-
-    return " ".join(out_html)
-
-def char_level_highlight(ref, hyp):
-    """Highlight character-level differences"""
-    if not ref.strip() or not hyp.strip():
-        return "No text to compare"
-
-    sm = difflib.SequenceMatcher(None, list(ref), list(hyp))
-    out = []
-
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == 'equal':
-            out.extend([f"<span style='color:green; background-color:#e8f5e8;'>{c}</span>" for c in ref[i1:i2]])
-        elif tag in ('replace', 'delete'):
-            out.extend([f"<span style='color:red; text-decoration:underline; background-color:#ffe8e8; font-weight:bold;'>{c}</span>" for c in ref[i1:i2]])
-        elif tag == 'insert':
-            out.extend([f"<span style='color:orange; background-color:#fff3cd; font-weight:bold;'>{c}</span>" for c in hyp[j1:j2]])
-
-    return "".join(out)
-
-def get_pronunciation_score(wer_val, cer_val):
-    """Calculate pronunciation score and feedback"""
-    # Weight WER more heavily than CER
-    combined_score = (wer_val * 0.7) + (cer_val * 0.3)
-
-    if combined_score <= 0.1:
-        return "🏆 Excellent! (90%+)", "Your pronunciation is outstanding!"
-    elif combined_score <= 0.2:
-        return "🎉 Very Good! (80-90%)", "Great pronunciation with minor areas for improvement."
-    elif combined_score <= 0.4:
-        return "👍 Good! (60-80%)", "Good effort! Keep practicing for better accuracy."
-    elif combined_score <= 0.6:
-        return "📚 Needs Practice (40-60%)", "Focus on clearer pronunciation of highlighted words."
-    else:
-        return "💪 Keep Trying! (<40%)", "Don't give up! Practice makes perfect."
-
-# ---------------- MAIN FUNCTION ---------------- #
-@spaces.GPU
-def compare_pronunciation(audio, language_choice, intended_sentence):
-    """Main function to compare pronunciation"""
-    print(f"🔍 Starting analysis with language: {language_choice}")
-    print(f"📝 Audio file: {audio}")
-    print(f"🎯 Intended sentence: {intended_sentence}")
-
-    if audio is None:
-        print("❌ No audio provided")
-        return ("❌ Please record audio first.", "", "", "", "", "", "", "", "", "", "", "", "")
-
-    if not intended_sentence.strip():
-        print("❌ No intended sentence")
-        return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "", "", "", "", "", "")
-
-    try:
-        print(f"🔍 Analyzing pronunciation for {language_choice}...")
-
-        # Pass 1: Raw transcription
-        print("🔄 Starting Pass 1 transcription...")
-        primer_weak, _ = LANG_PRIMERS[language_choice]
-        actual_text = transcribe_audio(audio, language_choice, primer_weak, force_language=True)
-        print(f"✅ Pass 1 result: {actual_text}")
-
-        # Pass 2: Target-biased transcription with stronger prompt
-        print("🔄 Starting Pass 2 transcription...")
-        _, primer_strong = LANG_PRIMERS[language_choice]
-        strict_prompt = f"{primer_strong}\nExpected: {intended_sentence}"
-        corrected_text = transcribe_audio(audio, language_choice, strict_prompt, force_language=True)
-        print(f"✅ Pass 2 result: {corrected_text}")
-
-        # Handle transcription errors
-        if actual_text.startswith("Error:"):
-            print(f"❌ Transcription error: {actual_text}")
-            return (f"❌ {actual_text}", "", "", "", "", "", "", "", "", "", "", "", "")
-
-        # Calculate error metrics
-        try:
-            print("🔄 Calculating error metrics...")
-            wer_val = jiwer.wer(intended_sentence, actual_text)
-            cer_val = jiwer.cer(intended_sentence, actual_text)
-            print(f"✅ WER: {wer_val:.3f}, CER: {cer_val:.3f}")
-        except Exception as e:
-            print(f"❌ Error calculating metrics: {e}")
-            wer_val, cer_val = 1.0, 1.0
-
-        # Get pronunciation score and feedback
-        score_text, feedback = get_pronunciation_score(wer_val, cer_val)
-        print(f"✅ Score: {score_text}")
-
-        # Transliterations for both actual and intended
-        print("🔄 Generating transliterations...")
-        actual_hk = transliterate_to_hk(actual_text, language_choice)
-        target_hk = transliterate_to_hk(intended_sentence, language_choice)
-
-        # Handle script mismatches
-        if not is_script(actual_text, language_choice) and language_choice != "English":
-            actual_hk = f"⚠️ Expected {language_choice} script, got mixed/other script"
-
-        # Visual feedback
-        print("🔄 Generating visual feedback...")
-        diff_html = highlight_differences(intended_sentence, actual_text)
-        char_html = char_level_highlight(intended_sentence, actual_text)
-
-        # Status message with detailed feedback
-        status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}"
-        print("✅ Analysis completed successfully")
-
-        return (
-            status,
-            actual_text or "(No transcription)",
-            corrected_text or "(No corrected transcription)",
-            f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% word accuracy)",
-            f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% character accuracy)",
-            # New visual feedback outputs
-            actual_text or "(No transcription)",  # actual_text_display
-            actual_hk,                            # actual_transliteration
-            intended_sentence,                    # target_text_display
-            target_hk,                            # target_transliteration
-            diff_html,                            # diff_html_box
-            char_html,                            # char_html_box
-            intended_sentence,                    # intended_display (unchanged)
-            f"🎯 Target: {intended_sentence}"      # target_display
-        )
-
-    except Exception as e:
-        error_msg = f"❌ Analysis Error: {str(e)[:200]}"
-        print(f"❌ FATAL ERROR: {e}")
-        import traceback
-        traceback.print_exc()
-        return (error_msg, str(e), "", "", "", "", "", "", "", "", "", "", "")
-
-# ---------------- UI ---------------- #
-def create_interface():
-    with gr.Blocks(title="🎙️ Multilingual Pronunciation Trainer") as demo:
-
-        gr.Markdown("""
-        # 🎙️ Multilingual Pronunciation Trainer
-
-        **Practice pronunciation in Tamil, Malayalam & English** using advanced speech recognition!
-
-        ### 📋 How to Use:
-        1. **Select** your target language 🌍
-        2. **Generate** a practice sentence 🎲
-        3. **Record** yourself reading it aloud 🎤
-        4. **Get** detailed feedback with accuracy metrics 📊
-
-        ### 🎯 Features:
-        - **Dual-pass analysis** for accurate assessment
-        - **Visual highlighting** of pronunciation errors
-        - **Romanization** for Indic scripts
-        - **Detailed metrics** (Word & Character accuracy)
-        """)
-
-        with gr.Row():
-            with gr.Column(scale=3):
-                lang_choice = gr.Dropdown(
-                    choices=list(LANG_CODES.keys()),
-                    value="Tamil",
-                    label="🌍 Select Language"
-                )
-            with gr.Column(scale=1):
-                gen_btn = gr.Button("🎲 Generate Sentence", variant="primary")
-
-        intended_display = gr.Textbox(
-            label="📝 Practice Sentence (Read this aloud)",
-            placeholder="Click 'Generate Sentence' to get started...",
-            interactive=False,
-            lines=3
-        )
-
-        audio_input = gr.Audio(
-            sources=["microphone", "upload"],
-            type="filepath",
-            label="🎤 Record Your Pronunciation"
-        )
-
-        analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary")
-
-        status_output = gr.Textbox(
-            label="📊 Analysis Results",
-            interactive=False,
-            lines=3
-        )
-
-        with gr.Row():
-            with gr.Column():
-                pass1_out = gr.Textbox(
-                    label="🎯 What You Actually Said (Raw Output)",
-                    interactive=False,
-                    lines=2
-                )
-                wer_out = gr.Textbox(
-                    label="📈 Word Accuracy",
-                    interactive=False
-                )
-
-            with gr.Column():
-                pass2_out = gr.Textbox(
-                    label="🔧 Target-Biased Analysis",
-                    interactive=False,
-                    lines=2
-                )
-                cer_out = gr.Textbox(
-                    label="📊 Character Accuracy",
-                    interactive=False
+        # Event handlers for buttons
+        gen_btn.click(
+            fn=get_random_sentence,
+            inputs=[lang_choice],
+            outputs=[intended_display]
         )
 
-        with gr.Accordion("📝 Detailed Visual Feedback", open=True):
-            gr.Markdown("""
-            ### 🎨 Color Guide:
-            - 🟢 **Green**: Correctly pronounced words/characters
-            - 🔴 **Red**: Missing or mispronounced (strikethrough)
-            - 🟠 **Orange**: Extra words or substitutions
-            """)
-
-            diff_html_box = gr.HTML(
-                label="🔍 Word-Level Analysis",
-                show_label=True
-            )
-            char_html_box = gr.HTML(
-                label="🔤 Character-Level Analysis",
-                show_label=True
-            )
-
-        target_display = gr.Textbox(
-            label="🎯 Reference Text",
-            interactive=False,
-            visible=False
+        analyze_btn.click(
+            fn=compare_pronunciation,
+            inputs=[audio_input, lang_choice, intended_display],
+            outputs=[
+                status_output,   # status
+                pass1_out,       # actual_text
+                pass2_out,       # corrected_text
+                wer_out,         # wer formatted
+                cer_out,         # cer formatted
+                gr.skip(),       # actual_text (duplicate)
+                gr.skip(),       # actual_hk (not displayed)
+                gr.skip(),       # intended_sentence (duplicate)
+                gr.skip(),       # target_hk (not displayed)
+                diff_html_box,   # diff_html
+                char_html_box,   # char_html
+                gr.skip(),       # intended_sentence (duplicate)
+                target_display   # target_display
+            ]
         )
 
         # Auto-generate sentence on language change
@@ -521,40 +30,4 @@ def create_interface():
             fn=get_random_sentence,
             inputs=[lang_choice],
             outputs=[intended_display]
-        )
-
-        # Footer
-        gr.Markdown("""
-        ---
-        ### 🔧 Technical Details:
-        - **ASR Models**:
-          - **Tamil**: AI4Bharat Whisper-LARGE-TA (~1.5GB, maximum accuracy)
-          - **Malayalam**: AI4Bharat Whisper-LARGE-ML (~1.5GB, maximum accuracy)
-          - **English**: OpenAI Whisper-Base-EN (optimized for English)
-        - **Performance**: Using the largest available models for the best pronunciation assessment
-        - **Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
-        - **Transliteration**: Harvard-Kyoto system for Indic scripts
-        - **Analysis**: Dual-pass approach for comprehensive feedback
-
-        **Note**: Large models provide maximum accuracy but require longer initial loading time.
-        **Languages**: English, Tamil, and Malayalam with specialized large models.
-        """)
-
-    return demo
-
-# ---------------- LAUNCH ---------------- #
-if __name__ == "__main__":
-    print("🚀 Starting Multilingual Pronunciation Trainer with LARGE models...")
-    print(f"🔧 Device: {DEVICE}")
-    print(f"🔧 PyTorch version: {torch.__version__}")
-    print("📦 Models will be loaded on-demand with GPU acceleration...")
-    print("⚡ Using AI4Bharat LARGE models for maximum accuracy!")
-    print("🎮 GPU functions decorated with @spaces.GPU for HuggingFace Spaces")
-
-    demo = create_interface()
-    demo.launch(
-        share=True,
-        show_error=True,
-        server_name="0.0.0.0",
-        server_port=7860
-    )
+        )
 