Spaces:

sudhanm
/

whisper-largev2-raw-ta-ml

Running on Zero

App Files Files Community

sudhanm committed 4 days ago

Commit

9c2f50b

verified ·

1 Parent(s): be6893d

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -13

app.py CHANGED Viewed

@@ -145,16 +145,28 @@ def transliterate_with_qwen(text, source_lang):
     model, tokenizer = load_qwen_model()
     if model is None or tokenizer is None:
-        return text  # Return original if model fails
     try:
-        # Create prompts
         if source_lang == "Tamil":
-            system_prompt = "Convert Tamil text to natural Thanglish (how Tamil people type on phones). Only output the romanized text."
-            user_prompt = f"Tamil: {text}\nThanglish:"
         else:  # Malayalam
-            system_prompt = "Convert Malayalam text to natural Manglish (how Malayalam people type on phones). Only output the romanized text."
-            user_prompt = f"Malayalam: {text}\nManglish:"
         # Format for Qwen
         messages = [
@@ -166,27 +178,100 @@ def transliterate_with_qwen(text, source_lang):
         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
         inputs = inputs.to(DEVICE)
-        # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=50,
-                temperature=0.1,
                 do_sample=True,
-                pad_token_id=tokenizer.eos_token_id
             )
         # Extract response
         full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         response = full_response[len(prompt):].strip()
-        # Clean response
-        response = response.split('\n')[0].strip()  # Take first line only
-        return response if response else text
     except Exception as e:
         print(f"Qwen transliteration error: {e}")
         return text
 # ---------------- SPEECH RECOGNITION ---------------- #

     model, tokenizer = load_qwen_model()
     if model is None or tokenizer is None:
+        return get_simple_transliteration(text, source_lang)  # Simple fallback
     try:
+        # Create better prompts with examples
         if source_lang == "Tamil":
+            system_prompt = "You are a Tamil transliteration expert. Convert Tamil script to English letters (Thanglish) like how Tamil people type on phones."
+            user_prompt = f"""Convert this Tamil text to Thanglish using English letters:
+Tamil: நான் தமிழ் படிக்கிறேன்
+Thanglish: naan tamil padikkiren
+Tamil: {text}
+Thanglish:"""
         else:  # Malayalam
+            system_prompt = "You are a Malayalam transliteration expert. Convert Malayalam script to English letters (Manglish) like how Malayalam people type on phones."
+            user_prompt = f"""Convert this Malayalam text to Manglish using English letters:
+Malayalam: ഞാൻ മലയാളം പഠിക്കുന്നു
+Manglish: njan malayalam padikkunnu
+Malayalam: {text}
+Manglish:"""
         # Format for Qwen
         messages = [
         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
         inputs = inputs.to(DEVICE)
+        # Generate with better parameters
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
+                max_new_tokens=100,
+                temperature=0.3,
                 do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.2
             )
         # Extract response
         full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         response = full_response[len(prompt):].strip()
+        # Clean response - remove any remaining script characters
+        import re
+        response = response.split('\n')[0].strip()  # Take first line
+        response = re.sub(r'[^\x00-\x7F]+', '', response)  # Remove non-ASCII (script chars)
+        response = response.strip()
+        # Validate response (should not contain original script)
+        if source_lang == "Malayalam" and any(char in response for char in "അആഇഈഉഊഋഎഏഐഒഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലവശഷസഹളഴറ"):
+            return get_simple_transliteration(text, source_lang)
+        elif source_lang == "Tamil" and any(char in response for char in "அஆஇஈஉஊஎஏஐஒஓஔகஙசஞடணதநபமயரலவழளற"):
+            return get_simple_transliteration(text, source_lang)
+        return response if response else get_simple_transliteration(text, source_lang)
     except Exception as e:
         print(f"Qwen transliteration error: {e}")
+        return get_simple_transliteration(text, source_lang)
# Hand-picked word-level romanizations used by get_simple_transliteration().
# Built once at import time instead of on every call.
_SIMPLE_WORD_MAPS = {
    "Malayalam": {
        "കേരളം": "kerala",
        "എന്റെ": "ente",
        "സ്വന്തം": "swantham",
        "നാടാണ്": "naadaan",
        "എനിക്ക്": "enikku",
        "മലയാളം": "malayalam",
        "വളരെ": "valare",
        "ഇഷ്ടമാണ്": "ishtamaan",
        "ഞാൻ": "njan",
        "പുസ്തകം": "pusthakam",
        "വായിക്കുന്നു": "vaayikkunnu",
    },
    "Tamil": {
        "அன்னை": "annai",
        "தமிழ்": "tamil",
        "எங்கள்": "engal",
        "தாய்மொழி": "thaaimozhi",
        "நான்": "naan",
        "இன்று": "indru",
        "நல்ல": "nalla",
        "வானிலை": "vaanilai",
    },
}


def get_simple_transliteration(text, lang_choice):
    """Romanize *text* word by word; the fallback used when Qwen fails.

    Each whitespace-separated token has trailing ``.,!?`` split off, is
    looked up in a small table of common words, and is re-joined with its
    punctuation. Words missing from the table are passed to
    ``basic_phonetic_convert``.

    Args:
        text: The native-script string to romanize.
        lang_choice: ``"Malayalam"`` or ``"Tamil"``. Any other value
            returns *text* unchanged.

    Returns:
        The romanized words joined by single spaces.
    """
    word_map = _SIMPLE_WORD_MAPS.get(lang_choice)
    if word_map is None:
        return text

    result_words = []
    for word in text.split():
        # Split trailing punctuation off so the lookup sees the bare word.
        clean_word = word.rstrip('.,!?')
        punct = word[len(clean_word):]
        if not clean_word:
            # Punctuation-only token: keep it as-is. Sending the empty
            # remainder to the phonetic fallback used to yield "unknown!".
            result_words.append(word)
        elif clean_word in word_map:
            result_words.append(word_map[clean_word] + punct)
        else:
            # Unknown word: fall back to the basic phonetic conversion.
            result_words.append(basic_phonetic_convert(clean_word, lang_choice) + punct)
    return ' '.join(result_words)
def basic_phonetic_convert(word, lang_choice):
    """Return a rough ASCII rendering of *word* for words with no table entry.

    This is a minimal last-resort fallback, not a real transliterator. For
    Malayalam, the anusvara and the chillu letters are mapped to their
    approximate Latin sounds. Every remaining non-ASCII character is then
    dropped, for both Malayalam and Tamil.

    Args:
        word: A single word, normally already stripped of punctuation.
        lang_choice: ``"Malayalam"`` or ``"Tamil"``. Any other value
            returns *word* unchanged.

    Returns:
        The ASCII remainder of *word*; ``"unknown"`` when a non-empty word
        has nothing left after stripping; ``""`` for an empty *word*.
    """
    # Imported locally (as the original did) so this function does not
    # depend on a module-level import being present.
    import re

    if lang_choice not in ("Malayalam", "Tamil"):
        return word
    if not word:
        # Nothing to convert; "unknown" would invent text for an empty input.
        return ""

    if lang_choice == "Malayalam":
        # Anusvara plus the chillu (vowel-less consonant) letters. These have
        # unambiguous single-letter romanizations, so keep their sound
        # instead of deleting them with the rest of the script.
        replacements = (
            ('ം', 'm'),
            ('ൺ', 'n'),
            ('ൻ', 'n'),
            ('ർ', 'r'),
            ('ൽ', 'l'),
            ('ൾ', 'l'),
        )
        for native, latin in replacements:
            word = word.replace(native, latin)

    # Remove whatever script characters remain.
    result = re.sub(r'[^\x00-\x7F]+', '', word)
    return result if result else "unknown"
 # ---------------- SPEECH RECOGNITION ---------------- #