Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Running on Zero

App Files Files Community

sudhanm commited on 10 days ago

Commit

89f17cd

verified ·

1 Parent(s): 02b32bc

Update app.py

Browse files

Files changed (1) hide show

app.py +172 -122

app.py CHANGED Viewed

@@ -4,14 +4,8 @@ import difflib
 import re
 import jiwer
 import torch
-import torchaudio
 import numpy as np
-from transformers import (
-    AutoProcessor,
-    AutoModelForSpeechSeq2Seq,
-    WhisperProcessor,
-    WhisperForConditionalGeneration
-)
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
@@ -20,6 +14,16 @@ import warnings
 import spaces
 warnings.filterwarnings("ignore")
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🔧 Using device: {DEVICE}")
@@ -30,11 +34,14 @@ LANG_CODES = {
     "Malayalam": "ml"
 }
-# Updated model configurations with LARGE models for maximum accuracy
-ASR_MODELS = {
     "English": "openai/whisper-base.en",
-    "Tamil": "ai4bharat/whisper-large-ta",      # LARGE AI4Bharat Tamil model (~1.5GB)
-    "Malayalam": "ai4bharat/whisper-large-ml"   # LARGE AI4Bharat Malayalam model (~1.5GB)
 }
 LANG_PRIMERS = {
@@ -86,14 +93,49 @@ SENTENCE_BANK = {
 }
 # ---------------- MODEL CACHE ---------------- #
-asr_models = {}
 @spaces.GPU
-def load_asr_model(language):
-    """Load ASR model for specific language - PRIMARY MODELS ONLY"""
-    if language not in asr_models:
-        model_name = ASR_MODELS[language]
-        print(f"🔄 Loading LARGE model for {language}: {model_name}")
         try:
             processor = AutoProcessor.from_pretrained(model_name)
@@ -104,14 +146,14 @@ def load_asr_model(language):
                 use_safetensors=True
             ).to(DEVICE)
-            asr_models[language] = {"processor": processor, "model": model, "model_name": model_name}
-            print(f"✅ LARGE model loaded successfully for {language}")
         except Exception as e:
-            print(f"❌ Failed to load {model_name}: {e}")
-            raise Exception(f"Could not load {language} model. Please check model availability.")
-    return asr_models[language]
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
@@ -165,14 +207,36 @@ def preprocess_audio(audio_path, target_sr=16000):
         return None, None
 @spaces.GPU
-def transcribe_audio(audio_path, language, initial_prompt="", force_language=True):
-    """Transcribe audio using loaded models"""
     try:
-        # Load model components
-        asr_components = load_asr_model(language)
-        processor = asr_components["processor"]
-        model = asr_components["model"]
-        model_name = asr_components["model_name"]
         # Preprocess audio
         audio, sr = preprocess_audio(audio_path)
@@ -192,47 +256,26 @@ def transcribe_audio(audio_path, language, initial_prompt="", force_language=Tru
         # Generate transcription
         with torch.no_grad():
-            # Basic generation parameters
             generate_kwargs = {
                 "input_features": input_features,
                 "max_length": 200,
-                "num_beams": 3,  # Reduced for better compatibility
                 "do_sample": False
             }
-            # Try different approaches for language forcing
-            if force_language and language != "English":
                 lang_code = LANG_CODES.get(language, "en")
-                # Method 1: Try forced_decoder_ids (OpenAI Whisper style)
                 try:
                     if hasattr(processor, 'get_decoder_prompt_ids'):
                         forced_decoder_ids = processor.get_decoder_prompt_ids(
                             language=lang_code,
                             task="transcribe"
                         )
-                        # Test if model accepts this parameter
-                        test_kwargs = generate_kwargs.copy()
-                        test_kwargs["max_length"] = 10
-                        test_kwargs["forced_decoder_ids"] = forced_decoder_ids
-                        _ = model.generate(**test_kwargs)  # Test run
                         generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
-                        print(f"✅ Using forced_decoder_ids for {language}")
                 except Exception as e:
-                    print(f"⚠️ forced_decoder_ids not supported: {e}")
-                # Method 2: Try language parameter
-                try:
-                    test_kwargs = generate_kwargs.copy()
-                    test_kwargs["max_length"] = 10
-                    test_kwargs["language"] = lang_code
-                    _ = model.generate(**test_kwargs)  # Test run
-                    generate_kwargs["language"] = lang_code
-                    print(f"✅ Using language parameter for {language}")
-                except Exception as e:
-                    print(f"⚠️ language parameter not supported: {e}")
-            # Generate with whatever parameters work
             predicted_ids = model.generate(**generate_kwargs)
         # Decode
@@ -242,30 +285,31 @@ def transcribe_audio(audio_path, language, initial_prompt="", force_language=Tru
             clean_up_tokenization_spaces=True
         )[0]
-        # Post-process transcription
-        transcription = transcription.strip()
-        # If we get empty transcription, try again with simpler parameters
-        if not transcription and generate_kwargs.get("num_beams", 1) > 1:
-            print("🔄 Retrying with greedy decoding...")
-            simple_kwargs = {
-                "input_features": input_features,
-                "max_length": 200,
-                "do_sample": False
-            }
-            predicted_ids = model.generate(**simple_kwargs)
-            transcription = processor.batch_decode(
-                predicted_ids,
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=True
-            )[0].strip()
-        return transcription or "(No transcription generated)"
     except Exception as e:
-        print(f"Transcription error for {language}: {e}")
         return f"Error: {str(e)[:150]}..."
 def highlight_differences(ref, hyp):
     """Highlight word-level differences with better styling"""
     if not ref.strip() or not hyp.strip():
@@ -327,8 +371,8 @@ def get_pronunciation_score(wer_val, cer_val):
 # ---------------- MAIN FUNCTION ---------------- #
 @spaces.GPU
 def compare_pronunciation(audio, language_choice, intended_sentence):
-    """Main function to compare pronunciation"""
-    print(f"🔍 Starting analysis with language: {language_choice}")
     print(f"📝 Audio file: {audio}")
     print(f"🎯 Intended sentence: {intended_sentence}")
@@ -341,27 +385,24 @@ def compare_pronunciation(audio, language_choice, intended_sentence):
         return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "")
     try:
-        print(f"🔍 Analyzing pronunciation for {language_choice}...")
-        # Pass 1: Raw transcription
-        print("🔄 Starting Pass 1 transcription...")
-        primer_weak, _ = LANG_PRIMERS[language_choice]
-        actual_text = transcribe_audio(audio, language_choice, primer_weak, force_language=True)
-        print(f"✅ Pass 1 result: {actual_text}")
-        # Pass 2: Target-biased transcription with stronger prompt
-        print("🔄 Starting Pass 2 transcription...")
-        _, primer_strong = LANG_PRIMERS[language_choice]
-        strict_prompt = f"{primer_strong}\nExpected: {intended_sentence}"
-        corrected_text = transcribe_audio(audio, language_choice, strict_prompt, force_language=True)
-        print(f"✅ Pass 2 result: {corrected_text}")
         # Handle transcription errors
         if actual_text.startswith("Error:"):
             print(f"❌ Transcription error: {actual_text}")
             return (f"❌ {actual_text}", "", "", "", "", "", "", "")
-        # Calculate error metrics
         try:
             print("🔄 Calculating error metrics...")
             wer_val = jiwer.wer(intended_sentence, actual_text)
@@ -375,7 +416,7 @@ def compare_pronunciation(audio, language_choice, intended_sentence):
         score_text, feedback = get_pronunciation_score(wer_val, cer_val)
         print(f"✅ Score: {score_text}")
-        # Transliterations for both actual and intended
         print("🔄 Generating transliterations...")
         actual_hk = transliterate_to_hk(actual_text, language_choice)
         target_hk = transliterate_to_hk(intended_sentence, language_choice)
@@ -389,19 +430,19 @@ def compare_pronunciation(audio, language_choice, intended_sentence):
         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)
-        # Status message with detailed feedback
-        status = f"✅ Analysis Complete - {score_text}\n💬 {feedback}"
-        print(f"✅ Analysis completed successfully")
         return (
             status,
             actual_text or "(No transcription)",
-            corrected_text or "(No corrected transcription)",
             f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% word accuracy)",
             f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% character accuracy)",
-            diff_html,  # diff_html_box
-            char_html,  # char_html_box
-            f"🎯 Target: {intended_sentence}"  # target_display
         )
     except Exception as e:
@@ -413,24 +454,29 @@ def compare_pronunciation(audio, language_choice, intended_sentence):
 # ---------------- UI ---------------- #
 def create_interface():
-    with gr.Blocks(title="🎙️ Multilingual Pronunciation Trainer") as demo:
         gr.Markdown("""
-        # 🎙️ Multilingual Pronunciation Trainer
-        **Practice pronunciation in Tamil, Malayalam & English** using advanced speech recognition!
         ### 📋 How to Use:
         1. **Select** your target language 🌍
         2. **Generate** a practice sentence 🎲
         3. **Record** yourself reading it aloud 🎤
-        4. **Get** detailed feedback with accuracy metrics 📊
         ### 🎯 Features:
-        - **Dual-pass analysis** for accurate assessment
         - **Visual highlighting** of pronunciation errors
         - **Romanization** for Indic scripts
-        - **Detailed metrics** (Word & Character accuracy)
         """)
         with gr.Row():
@@ -456,18 +502,18 @@ def create_interface():
             label="🎤 Record Your Pronunciation"
         )
-        analyze_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary")
         status_output = gr.Textbox(
-            label="📊 Analysis Results",
             interactive=False,
-            lines=3
         )
         with gr.Row():
             with gr.Column():
                 pass1_out = gr.Textbox(
-                    label="🎯 What You Actually Said (Raw Output)",
                     interactive=False,
                     lines=2
                 )
@@ -478,7 +524,7 @@ def create_interface():
             with gr.Column():
                 pass2_out = gr.Textbox(
-                    label="🔧 Target-Biased Analysis",
                     interactive=False,
                     lines=2
                 )
@@ -522,8 +568,8 @@ def create_interface():
             inputs=[audio_input, lang_choice, intended_display],
             outputs=[
                 status_output,      # status
-                pass1_out,          # actual_text
-                pass2_out,          # corrected_text
                 wer_out,            # wer formatted
                 cer_out,            # cer formatted
                 diff_html_box,      # diff_html
@@ -542,29 +588,33 @@ def create_interface():
         # Footer
         gr.Markdown("""
         ---
-        ### 🔧 Technical Details:
-        - **ASR Models**:
-          - **Tamil**: AI4Bharat Whisper-LARGE-TA (~1.5GB, maximum accuracy)
-          - **Malayalam**: AI4Bharat Whisper-LARGE-ML (~1.5GB, maximum accuracy)
-          - **English**: OpenAI Whisper-Base-EN (optimized for English)
-        - **Performance**: Using largest available models for best pronunciation assessment
         - **Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
         - **Transliteration**: Harvard-Kyoto system for Indic scripts
-        - **Analysis**: Dual-pass approach for comprehensive feedback
-        **Note**: Large models provide maximum accuracy but require longer initial loading time.
-        **Languages**: English, Tamil, and Malayalam with specialized large models.
         """)
     return demo
 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
-    print("🚀 Starting Multilingual Pronunciation Trainer with LARGE models...")
     print(f"🔧 Device: {DEVICE}")
     print(f"🔧 PyTorch version: {torch.__version__}")
-    print("📦 Models will be loaded on-demand with GPU acceleration...")
-    print("⚡ Using AI4Bharat LARGE models for maximum accuracy!")
     print("🎮 GPU functions decorated with @spaces.GPU for HuggingFace Spaces")
     demo = create_interface()

 import re
 import jiwer
 import torch
 import numpy as np
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import librosa
 import soundfile as sf
 from indic_transliteration import sanscript
 import spaces
 warnings.filterwarnings("ignore")
+# Try to import whisper_jax, fallback to transformers if not available
+try:
+    from whisper_jax import FlaxWhisperPipeline
+    import jax.numpy as jnp
+    WHISPER_JAX_AVAILABLE = True
+    print("🚀 Using JAX-optimized IndicWhisper (70x faster!)")
+except ImportError:
+    WHISPER_JAX_AVAILABLE = False
+    print("⚠️ whisper_jax not available, using transformers fallback")
 # ---------------- CONFIG ---------------- #
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🔧 Using device: {DEVICE}")
     "Malayalam": "ml"
 }
+# SOTA IndicWhisper model - one model for all languages!
+INDICWHISPER_MODEL = "parthiv11/indic_whisper_nodcil"
+# Fallback models if IndicWhisper fails
+FALLBACK_MODELS = {
     "English": "openai/whisper-base.en",
+    "Tamil": "vasista22/whisper-tamil-large-v2",
+    "Malayalam": "thennal/whisper-medium-ml"
 }
 LANG_PRIMERS = {
 }
 # ---------------- MODEL CACHE ---------------- #
+indicwhisper_pipeline = None
+fallback_models = {}
 @spaces.GPU
+def load_indicwhisper():
+    """Load the SOTA IndicWhisper model"""
+    global indicwhisper_pipeline
+    if indicwhisper_pipeline is None:
+        try:
+            print(f"🔄 Loading SOTA IndicWhisper: {INDICWHISPER_MODEL}")
+            if WHISPER_JAX_AVAILABLE:
+                # Use JAX-optimized version (70x faster!)
+                indicwhisper_pipeline = FlaxWhisperPipeline(
+                    INDICWHISPER_MODEL,
+                    dtype=jnp.bfloat16,
+                    batch_size=1
+                )
+                print("✅ IndicWhisper loaded with JAX optimization (70x faster!)")
+            else:
+                # Fallback to transformers if whisper_jax not available
+                from transformers import pipeline
+                indicwhisper_pipeline = pipeline(
+                    "automatic-speech-recognition",
+                    model=INDICWHISPER_MODEL,
+                    device=DEVICE if DEVICE == "cuda" else -1
+                )
+                print("✅ IndicWhisper loaded with transformers (fallback mode)")
+        except Exception as e:
+            print(f"❌ Failed to load IndicWhisper: {e}")
+            indicwhisper_pipeline = None
+            raise Exception(f"Could not load IndicWhisper model: {str(e)}")
+    return indicwhisper_pipeline
+@spaces.GPU
+def load_fallback_model(language):
+    """Load fallback model if IndicWhisper fails"""
+    if language not in fallback_models:
+        model_name = FALLBACK_MODELS[language]
+        print(f"🔄 Loading fallback model for {language}: {model_name}")
         try:
             processor = AutoProcessor.from_pretrained(model_name)
                 use_safetensors=True
             ).to(DEVICE)
+            fallback_models[language] = {"processor": processor, "model": model, "model_name": model_name}
+            print(f"✅ Fallback model loaded for {language}")
         except Exception as e:
+            print(f"❌ Failed to load fallback {model_name}: {e}")
+            raise Exception(f"Could not load fallback {language} model")
+    return fallback_models[language]
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
         return None, None
 @spaces.GPU
+def transcribe_with_indicwhisper(audio_path, language):
+    """Transcribe using SOTA IndicWhisper"""
+    try:
+        pipeline = load_indicwhisper()
+        if WHISPER_JAX_AVAILABLE and hasattr(pipeline, '__call__'):
+            # JAX-optimized version
+            result = pipeline(audio_path)
+            if isinstance(result, dict) and 'text' in result:
+                return result['text'].strip()
+            elif isinstance(result, str):
+                return result.strip()
+            else:
+                return str(result).strip()
+        else:
+            # Transformers fallback
+            result = pipeline(audio_path)
+            return result.get('text', '').strip()
+    except Exception as e:
+        print(f"IndicWhisper transcription error: {e}")
+        raise e
+@spaces.GPU
+def transcribe_with_fallback(audio_path, language):
+    """Transcribe using fallback models"""
     try:
+        components = load_fallback_model(language)
+        processor = components["processor"]
+        model = components["model"]
         # Preprocess audio
         audio, sr = preprocess_audio(audio_path)
         # Generate transcription
         with torch.no_grad():
             generate_kwargs = {
                 "input_features": input_features,
                 "max_length": 200,
+                "num_beams": 3,
                 "do_sample": False
             }
+            # Language forcing for non-English
+            if language != "English":
                 lang_code = LANG_CODES.get(language, "en")
                 try:
                     if hasattr(processor, 'get_decoder_prompt_ids'):
                         forced_decoder_ids = processor.get_decoder_prompt_ids(
                             language=lang_code,
                             task="transcribe"
                         )
                         generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
                 except Exception as e:
+                    print(f"⚠️ Language forcing failed: {e}")
             predicted_ids = model.generate(**generate_kwargs)
         # Decode
             clean_up_tokenization_spaces=True
         )[0]
+        return transcription.strip() or "(No transcription generated)"
     except Exception as e:
+        print(f"Fallback transcription error: {e}")
         return f"Error: {str(e)[:150]}..."
+@spaces.GPU
+def transcribe_audio(audio_path, language, initial_prompt="", use_fallback=False):
+    """Main transcription function with IndicWhisper + fallback"""
+    try:
+        if use_fallback:
+            print(f"🔄 Using fallback model for {language}")
+            return transcribe_with_fallback(audio_path, language)
+        else:
+            print(f"🔄 Using SOTA IndicWhisper for {language}")
+            return transcribe_with_indicwhisper(audio_path, language)
+    except Exception as e:
+        print(f"Transcription failed, trying fallback: {e}")
+        if not use_fallback:
+            # Retry with fallback
+            return transcribe_audio(audio_path, language, initial_prompt, use_fallback=True)
+        else:
+            return f"Error: All transcription methods failed - {str(e)[:100]}"
 def highlight_differences(ref, hyp):
     """Highlight word-level differences with better styling"""
     if not ref.strip() or not hyp.strip():
 # ---------------- MAIN FUNCTION ---------------- #
 @spaces.GPU
 def compare_pronunciation(audio, language_choice, intended_sentence):
+    """Main function to compare pronunciation using SOTA IndicWhisper"""
+    print(f"🔍 Starting SOTA analysis with language: {language_choice}")
     print(f"📝 Audio file: {audio}")
     print(f"🎯 Intended sentence: {intended_sentence}")
         return ("❌ Please generate a practice sentence first.", "", "", "", "", "", "", "")
     try:
+        print(f"🔍 Analyzing pronunciation using SOTA IndicWhisper...")
+        # Pass 1: SOTA IndicWhisper transcription
+        print("🔄 Starting Pass 1: SOTA IndicWhisper transcription...")
+        actual_text = transcribe_audio(audio, language_choice, use_fallback=False)
+        print(f"✅ SOTA Pass 1 result: {actual_text}")
+        # Pass 2: Fallback model for comparison
+        print("🔄 Starting Pass 2: Fallback model transcription...")
+        fallback_text = transcribe_audio(audio, language_choice, use_fallback=True)
+        print(f"✅ Fallback Pass 2 result: {fallback_text}")
         # Handle transcription errors
         if actual_text.startswith("Error:"):
             print(f"❌ Transcription error: {actual_text}")
             return (f"❌ {actual_text}", "", "", "", "", "", "", "")
+        # Calculate error metrics using the better transcription
         try:
             print("🔄 Calculating error metrics...")
             wer_val = jiwer.wer(intended_sentence, actual_text)
         score_text, feedback = get_pronunciation_score(wer_val, cer_val)
         print(f"✅ Score: {score_text}")
+        # Transliterations
         print("🔄 Generating transliterations...")
         actual_hk = transliterate_to_hk(actual_text, language_choice)
         target_hk = transliterate_to_hk(intended_sentence, language_choice)
         diff_html = highlight_differences(intended_sentence, actual_text)
         char_html = char_level_highlight(intended_sentence, actual_text)
+        # Status message with SOTA info
+        status = f"✅ SOTA Analysis Complete - {score_text}\n💬 {feedback}\n🚀 Powered by IndicWhisper (AI4Bharat SOTA)"
+        print(f"✅ SOTA analysis completed successfully")
         return (
             status,
             actual_text or "(No transcription)",
+            fallback_text or "(No fallback transcription)",
             f"{wer_val:.3f} ({(1-wer_val)*100:.1f}% word accuracy)",
             f"{cer_val:.3f} ({(1-cer_val)*100:.1f}% character accuracy)",
+            diff_html,
+            char_html,
+            f"🎯 Target: {intended_sentence}"
         )
     except Exception as e:
 # ---------------- UI ---------------- #
 def create_interface():
+    with gr.Blocks(title="🎙️ SOTA Multilingual Pronunciation Trainer") as demo:
         gr.Markdown("""
+        # 🎙️ SOTA Multilingual Pronunciation Trainer
+        **Practice pronunciation in Tamil, Malayalam & English** using **IndicWhisper - the State-of-the-Art ASR model**!
+        ### 🏆 **Powered by IndicWhisper:**
+        - **SOTA Performance:** Lowest WER on 39/59 benchmarks for Indian languages
+        - **JAX-Optimized:** 70x faster than standard implementations
+        - **AI4Bharat Research:** Built by IIT Madras for maximum accuracy
         ### 📋 How to Use:
         1. **Select** your target language 🌍
         2. **Generate** a practice sentence 🎲
         3. **Record** yourself reading it aloud 🎤
+        4. **Get** detailed feedback with SOTA-level accuracy 📊
         ### 🎯 Features:
+        - **SOTA + Fallback analysis** for comprehensive assessment
         - **Visual highlighting** of pronunciation errors
         - **Romanization** for Indic scripts
+        - **Advanced metrics** (Word & Character accuracy)
         """)
         with gr.Row():
             label="🎤 Record Your Pronunciation"
         )
+        analyze_btn = gr.Button("🔍 Analyze with SOTA IndicWhisper", variant="primary")
         status_output = gr.Textbox(
+            label="📊 SOTA Analysis Results",
             interactive=False,
+            lines=4
         )
         with gr.Row():
             with gr.Column():
                 pass1_out = gr.Textbox(
+                    label="🏆 SOTA IndicWhisper Output",
                     interactive=False,
                     lines=2
                 )
             with gr.Column():
                 pass2_out = gr.Textbox(
+                    label="🔧 Fallback Model Comparison",
                     interactive=False,
                     lines=2
                 )
             inputs=[audio_input, lang_choice, intended_display],
             outputs=[
                 status_output,      # status
+                pass1_out,          # SOTA IndicWhisper
+                pass2_out,          # fallback comparison
                 wer_out,            # wer formatted
                 cer_out,            # cer formatted
                 diff_html_box,      # diff_html
         # Footer
         gr.Markdown("""
         ---
+        ### 🏆 **SOTA Technology Stack:**
+        - **Primary ASR**: IndicWhisper (AI4Bharat/IIT Madras) - SOTA for Indian languages
+        - **JAX Optimization**: 70x speed improvement with `parthiv11/indic_whisper_nodcil`
+        - **Fallback Models**: Specialized fine-tuned models for comparison
+        - **Benchmark Performance**: Lowest WER on 39/59 Vistaar benchmarks
+        - **Training Data**: 10,700+ hours across 12 Indian languages
+        ### 🔧 **Technical Details:**
         - **Metrics**: WER (Word Error Rate) and CER (Character Error Rate)
         - **Transliteration**: Harvard-Kyoto system for Indic scripts
+        - **Analysis**: SOTA + Fallback comparison for comprehensive feedback
+        - **Languages**: English, Tamil, and Malayalam with SOTA accuracy
+        **Note**: Using the most advanced ASR models available for Indian language pronunciation assessment.
+        **Research**: Based on "Vistaar: Diverse Benchmarks and Training Sets for Indian Language ASR" (AI4Bharat, 2023)
         """)
     return demo
 # ---------------- LAUNCH ---------------- #
 if __name__ == "__main__":
+    print("🚀 Starting SOTA Multilingual Pronunciation Trainer...")
     print(f"🔧 Device: {DEVICE}")
     print(f"🔧 PyTorch version: {torch.__version__}")
+    print("🏆 Using IndicWhisper - State-of-the-Art for Indian Languages")
+    print("⚡ JAX optimization: 70x speed improvement available")
+    print("📊 SOTA Performance: Lowest WER on 39/59 benchmarks")
     print("🎮 GPU functions decorated with @spaces.GPU for HuggingFace Spaces")
     demo = create_interface()