sudhanm committed
Commit 382a648 · verified · 1 Parent(s): 382858d

Update app.py

Files changed (1): app.py (+138, -70)
app.py CHANGED
@@ -65,17 +65,51 @@ SENTENCE_BANK = {
 }
 
 # Global variables for models (will be loaded lazily)
-whisper_models = {}
-whisper_processors = {}
+current_model = None
+current_processor = None
+current_language = None
+
+def clear_gpu_memory():
+    """Clear GPU memory to prevent OOM errors"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
 
 def load_model(language_choice):
-    """Load model for specific language if not already loaded"""
-    if language_choice not in whisper_models:
-        model_id = MODEL_CONFIGS[language_choice]
-        print(f"Loading {language_choice} model: {model_id}")
-        whisper_models[language_choice] = WhisperForConditionalGeneration.from_pretrained(model_id).to(DEVICE)
-        whisper_processors[language_choice] = WhisperProcessor.from_pretrained(model_id)
+    """Load model for specific language, unload previous if different"""
+    global current_model, current_processor, current_language
+
+    if current_language == language_choice and current_model is not None:
+        return current_model, current_processor
+
+    # Clear previous model if different language
+    if current_model is not None:
+        print(f"Unloading previous model for {current_language}")
+        del current_model
+        del current_processor
+        clear_gpu_memory()
+
+    # Load new model
+    model_id = MODEL_CONFIGS[language_choice]
+    print(f"Loading {language_choice} model: {model_id}")
+
+    try:
+        current_processor = WhisperProcessor.from_pretrained(model_id)
+        current_model = WhisperForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,  # Use half precision to save memory
+            device_map="auto"
+        )
+        current_language = language_choice
         print(f"{language_choice} model loaded successfully!")
+        return current_model, current_processor
+
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        # Fallback to CPU if GPU fails
+        current_processor = WhisperProcessor.from_pretrained(model_id)
+        current_model = WhisperForConditionalGeneration.from_pretrained(model_id)
+        current_language = language_choice
+        return current_model, current_processor
 
 # ---------------- HELPERS ---------------- #
 def get_random_sentence(language_choice):
@@ -95,38 +129,50 @@ def transliterate_to_hk(text, lang_choice):
 
 @spaces.GPU
 def transcribe_once(audio_path, language_choice, initial_prompt, beam_size, temperature, condition_on_previous_text):
-    # Load model if not already loaded
-    load_model(language_choice)
-
-    # Get the appropriate model and processor for the language
-    model = whisper_models[language_choice]
-    processor = whisper_processors[language_choice]
-    lang_code = LANG_CODES[language_choice]
-
-    # Load and process audio
-    import librosa
-    audio, sr = librosa.load(audio_path, sr=16000)
-
-    # Process audio with the specific model's processor
-    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)
-
-    # Generate forced decoder ids for the language
-    forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
-
-    # Generate transcription
-    with torch.no_grad():
-        predicted_ids = model.generate(
-            input_features,
-            forced_decoder_ids=forced_decoder_ids,
-            max_length=448,
-            num_beams=beam_size,
-            temperature=temperature if temperature > 0 else None,
-            do_sample=temperature > 0,
-        )
-
-    # Decode the transcription
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    return transcription.strip()
+    try:
+        # Load model if not already loaded
+        model, processor = load_model(language_choice)
+        lang_code = LANG_CODES[language_choice]
+
+        # Load and process audio
+        import librosa
+        audio, sr = librosa.load(audio_path, sr=16000)
+
+        # Process audio with the specific model's processor
+        input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+
+        # Move to GPU if available
+        if torch.cuda.is_available():
+            input_features = input_features.to("cuda")
+
+        # Generate forced decoder ids for the language
+        forced_decoder_ids = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
+
+        # Generate transcription with memory-efficient settings
+        with torch.no_grad():
+            predicted_ids = model.generate(
+                input_features,
+                forced_decoder_ids=forced_decoder_ids,
+                max_length=200,  # Reduced max length to save memory
+                num_beams=min(beam_size, 4),  # Limit beam size for memory
+                temperature=temperature if temperature > 0 else None,
+                do_sample=temperature > 0,
+                no_repeat_ngram_size=2,
+                early_stopping=True
+            )
+
+        # Decode the transcription
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+        # Clear GPU cache after inference
+        clear_gpu_memory()
+
+        return transcription.strip()
+
+    except Exception as e:
+        print(f"Transcription error: {e}")
+        clear_gpu_memory()
+        return f"Error during transcription: {str(e)}"
 
 def highlight_differences(ref, hyp):
     ref_words, hyp_words = ref.strip().split(), hyp.strip().split()
@@ -226,54 +272,76 @@ def char_level_highlight(ref, hyp):
 def compare_pronunciation(audio, language_choice, intended_sentence,
                           pass1_beam, pass1_temp, pass1_condition):
     if audio is None or not intended_sentence.strip():
-        return ("No audio or intended sentence.", "", "", "", "", "", "", "")
+        return ("No audio or intended sentence.", "", "", "", "", "", "", "", "❌ Please provide audio and sentence")
 
-    primer_weak, primer_strong = LANG_PRIMERS[language_choice]
+    try:
+        primer_weak, primer_strong = LANG_PRIMERS[language_choice]
 
-    # Pass 1: raw transcription with user-configured decoding parameters
-    actual_text = transcribe_once(audio, language_choice, primer_weak,
-                                  pass1_beam, pass1_temp, pass1_condition)
-
-    # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
-    strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
-    corrected_text = transcribe_once(audio, language_choice, strict_prompt,
-                                     beam_size=5, temperature=0.0, condition_on_previous_text=False)
-
-    # Compute WER and CER
-    wer_val = jiwer.wer(intended_sentence, actual_text)
-    cer_val = jiwer.cer(intended_sentence, actual_text)
-
-    # Transliteration of Pass 1 output
-    hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"
-
-    # Highlight word-level and character-level differences
-    diff_html = highlight_differences(intended_sentence, actual_text)
-    char_html = char_level_highlight(intended_sentence, actual_text)
-
-    return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
-            diff_html, char_html, intended_sentence)
+        # Pass 1: raw transcription with user-configured decoding parameters
+        status_msg = f"🔄 Transcribing with {language_choice} model..."
+        actual_text = transcribe_once(audio, language_choice, primer_weak,
+                                      pass1_beam, pass1_temp, pass1_condition)
+
+        if actual_text.startswith("Error"):
+            return (actual_text, "", "", "", "", "", "", "", "❌ Transcription failed")
+
+        # Pass 2: strict transcription biased by intended sentence (fixed decoding params)
+        strict_prompt = f"{primer_strong}\nTarget: {intended_sentence}"
+        corrected_text = transcribe_once(audio, language_choice, strict_prompt,
+                                         beam_size=3, temperature=0.0, condition_on_previous_text=False)
+
+        # Compute WER and CER
+        try:
+            wer_val = jiwer.wer(intended_sentence, actual_text)
+            cer_val = jiwer.cer(intended_sentence, actual_text)
+        except:
+            wer_val = 1.0
+            cer_val = 1.0
+
+        # Transliteration of Pass 1 output
+        hk_translit = transliterate_to_hk(actual_text, language_choice) if is_script(actual_text, language_choice) else f"[Script mismatch: expected {language_choice}]"
+
+        # Highlight word-level and character-level differences
+        diff_html = highlight_differences(intended_sentence, actual_text)
+        char_html = char_level_highlight(intended_sentence, actual_text)
+
+        # Success status
+        status_msg = f"✅ Analysis complete! WER: {wer_val:.2f}"
+
+        return (actual_text, corrected_text, hk_translit, f"{wer_val:.2f}", f"{cer_val:.2f}",
+                diff_html, char_html, intended_sentence, status_msg)
+
+    except Exception as e:
+        error_msg = f"❌ Error: {str(e)}"
+        clear_gpu_memory()
+        return ("Error occurred", "", "", "", "", "", "", "", error_msg)
 
 # ---------------- UI ---------------- #
 with gr.Blocks(title="Pronunciation Comparator") as demo:
     gr.Markdown("## 🎙 Pronunciation Comparator - English, Tamil & Malayalam")
     gr.Markdown("Practice pronunciation with specialized Whisper models for each language!")
+    gr.Markdown("⚠️ **Note**: Models load on-demand to optimize memory usage. First use may take longer.")
 
     with gr.Row():
         lang_choice = gr.Dropdown(choices=list(LANG_CODES.keys()), value="Malayalam", label="Language")
         gen_btn = gr.Button("🎲 Generate Sentence")
 
     intended_display = gr.Textbox(label="Generated Sentence (Read aloud)", interactive=False)
+
+    # Status indicator
+    status_display = gr.Textbox(label="Status", interactive=False, value="🟢 Ready")
 
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record your pronunciation")
 
        with gr.Column():
-            gr.Markdown("### Transcription Parameters")
-            pass1_beam = gr.Slider(1, 10, value=8, step=1, label="Pass 1 Beam Size")
-            pass1_temp = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Pass 1 Temperature")
-            pass1_condition = gr.Checkbox(value=True, label="Pass 1: Condition on previous text")
+            gr.Markdown("### ⚙️ Transcription Parameters")
+            with gr.Row():
+                pass1_beam = gr.Slider(1, 4, value=2, step=1, label="Beam Size (lower = faster)")
+                pass1_temp = gr.Slider(0.0, 0.8, value=0.2, step=0.1, label="Temperature")
+            pass1_condition = gr.Checkbox(value=False, label="Condition on previous text")
 
-    submit_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary")
+    submit_btn = gr.Button("🔍 Analyze Pronunciation", variant="primary", size="lg")
 
     gr.Markdown("### 📊 Analysis Results")
     with gr.Row():
@@ -299,7 +367,7 @@ with gr.Blocks(title="Pronunciation Comparator") as demo:
         inputs=[audio_input, lang_choice, intended_display, pass1_beam, pass1_temp, pass1_condition],
         outputs=[
             pass1_out, pass2_out, hk_out, wer_out, cer_out,
-            diff_html_box, char_html_box, intended_display
+            diff_html_box, char_html_box, intended_display, status_display
         ]
     )
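The core change above replaces the grow-only `whisper_models` / `whisper_processors` dicts with a single-slot cache: at most one model stays resident, and switching language evicts the previous one before loading the next. A minimal, self-contained sketch of that pattern (illustrative names, not app.py's):

```python
# Single-slot cache: keep at most one loaded object, evicting the previous
# entry when a different key is requested (mirrors the new load_model()).
import gc

_current_key = None
_current_obj = None

def load_cached(key, factory):
    """Return the object for `key`, unloading whatever was cached before."""
    global _current_key, _current_obj
    if _current_key == key and _current_obj is not None:
        return _current_obj          # cache hit: no reload
    _current_obj = None              # drop the old reference first
    gc.collect()                     # stands in for clear_gpu_memory()
    _current_obj = factory(key)      # load the requested object
    _current_key = key
    return _current_obj

if __name__ == "__main__":
    loads = []
    make = lambda k: (loads.append(k), f"model-for-{k}")[1]
    a = load_cached("Malayalam", make)
    b = load_cached("Malayalam", make)   # second call reuses the cached object
    c = load_cached("Tamil", make)       # different key evicts, then reloads
    assert a is b and loads == ["Malayalam", "Tamil"]
```

The trade-off is deliberate: a language switch pays a full reload, but peak memory stays at one model instead of three.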
 
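For reference, a hedged sketch of the Whisper decoding pattern `transcribe_once` uses, with `openai/whisper-tiny` standing in for the Space's per-language checkpoints and silent dummy audio in place of a recording. It matches the commit's `forced_decoder_ids` usage; recent transformers releases prefer passing `language=` and `task=` directly to `generate()`.

```python
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model_id = "openai/whisper-tiny"  # illustrative; the Space uses per-language models
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
input_features = processor(audio, sampling_rate=16000,
                           return_tensors="pt").input_features

# Pin the decoder to a language and task, as the commit does per selection.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
with torch.no_grad():
    predicted_ids = model.generate(input_features,
                                   forced_decoder_ids=forced_decoder_ids,
                                   max_length=200)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
```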
 
373