""" Enhanced language detection for ALL Indian languages Optimized for IndicWhisper ASR pipeline with comprehensive script detection """ import re from typing import Optional, Dict, List, Tuple, Any import logging # Safe langdetect import with fallback try: from langdetect import detect, DetectorFactory DetectorFactory.seed = 0 LANGDETECT_AVAILABLE = True except ImportError: LANGDETECT_AVAILABLE = False logger = logging.getLogger(__name__) class IndicLanguageDetector: """Comprehensive language detection for all 22+ Indian languages""" def __init__(self): # Extended Unicode ranges for all Indian scripts self.script_patterns = { 'devanagari': r'[\u0900-\u097f]', # Hindi, Marathi, Sanskrit, Nepali, Bodo, Dogri, Konkani, Maithili 'bengali': r'[\u0980-\u09ff]', # Bengali, Assamese 'tamil': r'[\u0b80-\u0bff]', # Tamil 'telugu': r'[\u0c00-\u0c7f]', # Telugu 'malayalam': r'[\u0d00-\u0d7f]', # Malayalam 'kannada': r'[\u0c80-\u0cff]', # Kannada 'gujarati': r'[\u0a80-\u0aff]', # Gujarati 'punjabi': r'[\u0a00-\u0a7f]', # Punjabi (Gurmukhi) 'odia': r'[\u0b00-\u0b7f]', # Odia 'arabic': r'[\u0600-\u06ff]', # Urdu, Kashmiri, Sindhi 'olchiki': r'[\u1c50-\u1c7f]', # Santali 'meitei': r'[\uabc0-\uabff]', # Manipuri } # Enhanced script to language mapping with priority order self.script_to_languages = { 'devanagari': ['hi', 'mr', 'ne', 'mai', 'sa', 'brx', 'doi', 'kok'], 'bengali': ['bn', 'as'], 'tamil': ['ta'], 'telugu': ['te'], 'malayalam': ['ml'], 'kannada': ['kn'], 'gujarati': ['gu'], 'punjabi': ['pa'], 'odia': ['or'], 'arabic': ['ur', 'ks', 'sd'], 'olchiki': ['sat'], 'meitei': ['mni'], } # All supported Indian languages (22 official + others) self.supported_languages = { 'hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'or', 'pa', 'as', 'mai', 'sa', 'ne', 'ks', 'sd', 'brx', 'doi', 'kok', 'mni', 'sat' } # Enhanced character-based patterns for better detection self.char_patterns = { # Bengali specific characters 'bn': r'[হবদকগপতনমলরস]', # Tamil specific characters 'ta': r'[தகநலமபவசரன]', # Telugu specific characters 'te': r'[తకనలమపవసరణ]', # Malayalam specific characters 'ml': r'[തകനലമപവസരണ]', # Kannada specific characters 'kn': r'[ತಕನಲಮಪವಸರಣ]', # Gujarati specific characters 'gu': r'[તકનલમપવસરણ]', # Punjabi specific characters 'pa': r'[ਤਕਨਲਮਪਵਸਰਣ]', # Odia specific characters 'or': r'[ତକନଲମପଵସରଣ]', # Assamese specific characters 'as': r'[তৰৱখগঘচছজঝ]', # Urdu specific characters 'ur': r'[اردوپہکتنلمسرع]', } # Language confidence weights self.language_weights = { 'hi': 1.0, # Hindi (highest priority for Devanagari) 'bn': 0.9, # Bengali 'te': 0.9, # Telugu 'mr': 0.8, # Marathi 'ta': 0.9, # Tamil 'ur': 0.8, # Urdu 'gu': 0.8, # Gujarati 'kn': 0.8, # Kannada 'ml': 0.8, # Malayalam 'or': 0.7, # Odia 'pa': 0.7, # Punjabi 'as': 0.7, # Assamese } def detect_script(self, text: str) -> Optional[str]: """Detect script family from text with confidence scoring""" if not text or not text.strip(): return None script_scores = {} for script, pattern in self.script_patterns.items(): matches = re.findall(pattern, text) if matches: # Score based on percentage of matching characters score = len(matches) / len(text.replace(' ', '')) script_scores[script] = score if script_scores: # Return script with highest score return max(script_scores.items(), key=lambda x: x[1])[0] return None def detect_language_from_script(self, text: str) -> Optional[str]: """Get most likely language based on enhanced script detection""" script = self.detect_script(text) if not script or script not in self.script_to_languages: return None possible_languages = self.script_to_languages[script] # For single language scripts, return immediately if len(possible_languages) == 1: return possible_languages[0] # For multi-language scripts (like Devanagari), use character patterns if script == 'devanagari': return self._detect_devanagari_language(text, possible_languages) elif script == 'bengali': return self._detect_bengali_script_language(text, possible_languages) elif script == 'arabic': return self._detect_arabic_script_language(text, possible_languages) # Default to first (most common) language for that script return possible_languages[0] def _detect_devanagari_language(self, text: str, candidates: List[str]) -> str: """Enhanced Devanagari language detection""" # Hindi is most common, but check for specific patterns # Marathi specific patterns if re.search(r'[ळझञ]', text): # Marathi specific characters return 'mr' # Nepali specific patterns if re.search(r'[ऋएौ].*[नत]', text): # Common Nepali patterns return 'ne' # Sanskrit specific patterns (complex conjuncts) if re.search(r'[क्ष|त्र|ज्ञ|श्र]', text) and len(re.findall(r'[क्ष|त्र|ज्ञ|श्र]', text)) > 2: return 'sa' # Default to Hindi for Devanagari return 'hi' def _detect_bengali_script_language(self, text: str, candidates: List[str]) -> str: """Distinguish between Bengali and Assamese""" # Assamese specific characters if re.search(r'[ৰৱখগঘ]', text): return 'as' # Default to Bengali return 'bn' def _detect_arabic_script_language(self, text: str, candidates: List[str]) -> str: """Distinguish between Urdu, Kashmiri, and Sindhi""" # Urdu is most common for Arabic script in Indian context # Could add specific character patterns for Kashmiri/Sindhi if needed return 'ur' def detect_with_langdetect(self, text: str) -> Optional[str]: """Enhanced langdetect with Indian language support""" if not LANGDETECT_AVAILABLE: logger.warning("langdetect not available, skipping") return None try: if len(text.strip()) < 10: return None detected = detect(text) # Only return if it's a supported Indian language if detected in self.supported_languages: logger.debug(f"langdetect successful: {detected}") return detected else: logger.debug(f"langdetect returned non-Indian language: {detected}") return None except Exception as e: logger.warning(f"langdetect failed: {e}") return None def detect_with_character_patterns(self, text: str) -> Optional[str]: """Language detection using character-specific patterns""" if not text or len(text.strip()) < 5: return None language_scores = {} for lang_code, pattern in self.char_patterns.items(): matches = re.findall(pattern, text) if matches: # Calculate score based on character frequency and language weight base_score = len(matches) / len(text.replace(' ', '')) weight = self.language_weights.get(lang_code, 0.5) language_scores[lang_code] = base_score * weight if language_scores: # Return language with highest weighted score best_lang = max(language_scores.items(), key=lambda x: x[1])[0] logger.debug(f"Character pattern detection: {best_lang} (score: {language_scores[best_lang]:.3f})") return best_lang return None def detect_language(self, text: str) -> str: """ Comprehensive language detection for ALL Indian languages Multi-strategy approach with fallbacks: 1. Script-based detection (most reliable for Indic) 2. Character pattern matching 3. langdetect fallback 4. Default to Hindi """ if not text or not text.strip(): return 'hi' # Default to Hindi # Clean text for better detection cleaned_text = text.strip() # Strategy 1: Script-based detection (most reliable for Indic) script_lang = self.detect_language_from_script(cleaned_text) if script_lang: logger.debug(f"Script-based detection: {script_lang}") return script_lang # Strategy 2: Character pattern matching pattern_lang = self.detect_with_character_patterns(cleaned_text) if pattern_lang: logger.debug(f"Pattern-based detection: {pattern_lang}") return pattern_lang # Strategy 3: langdetect fallback (if available) langdetect_result = self.detect_with_langdetect(cleaned_text) if langdetect_result: logger.debug(f"langdetect result: {langdetect_result}") return langdetect_result # Strategy 4: Fallback based on common characters fallback_lang = self._fallback_detection(cleaned_text) if fallback_lang: logger.debug(f"Fallback detection: {fallback_lang}") return fallback_lang # Final fallback: Default to Hindi logger.debug("Using default language: Hindi") return 'hi' def _fallback_detection(self, text: str) -> Optional[str]: """Simple fallback detection based on common character patterns""" # Basic script detection without full pattern matching if any(char in text for char in 'হবদকগপ'): # Bengali chars return "bn" elif any(char in text for char in 'தகநலம'): # Tamil chars return "ta" elif any(char in text for char in 'తకనలమ'): # Telugu chars return "te" elif any(char in text for char in 'തകനലമ'): # Malayalam chars return "ml" elif any(char in text for char in 'ತಕನಲಮ'): # Kannada chars return "kn" elif any(char in text for char in 'તકનલમ'): # Gujarati chars return "gu" elif any(char in text for char in 'ਤਕਨਲਮ'): # Punjabi chars return "pa" elif any(char in text for char in 'ତକନଲମ'): # Odia chars return "or" elif any(char in text for char in 'اردوپہک'): # Urdu chars return "ur" return None def get_supported_languages(self) -> List[str]: """Get list of all supported languages""" return sorted(list(self.supported_languages)) def get_language_confidence(self, text: str, language: str) -> float: """Get confidence score for detected language""" if not text or language not in self.supported_languages: return 0.0 # Calculate confidence based on script match and character patterns script = self.detect_script(text) if not script: return 0.1 # Check if language matches detected script if language in self.script_to_languages.get(script, []): base_confidence = 0.8 else: base_confidence = 0.3 # Boost confidence with character pattern matching if language in self.char_patterns: pattern = self.char_patterns[language] matches = re.findall(pattern, text) if matches: char_boost = min(len(matches) / len(text.replace(' ', '')), 0.2) base_confidence += char_boost return min(base_confidence, 1.0) # Standalone function for backward compatibility def detect_language(text: str) -> str: """Standalone language detection function for backward compatibility""" detector = IndicLanguageDetector() return detector.detect_language(text) # Enhanced detection with confidence def detect_language_with_confidence(text: str) -> Tuple[str, float]: """Detect language and return confidence score""" detector = IndicLanguageDetector() language = detector.detect_language(text) confidence = detector.get_language_confidence(text, language) return language, confidence # Validation and testing functions def validate_language_detection(text: str, expected_language: str) -> Dict[str, Any]: """Validate language detection accuracy""" detector = IndicLanguageDetector() detected = detector.detect_language(text) confidence = detector.get_language_confidence(text, detected) return { 'text': text, 'expected': expected_language, 'detected': detected, 'confidence': confidence, 'correct': detected == expected_language, 'script': detector.detect_script(text) } # Language metadata for API responses LANGUAGE_NAMES = { 'hi': 'Hindi', 'bn': 'Bengali', 'te': 'Telugu', 'mr': 'Marathi', 'ta': 'Tamil', 'ur': 'Urdu', 'gu': 'Gujarati', 'kn': 'Kannada', 'ml': 'Malayalam', 'or': 'Odia', 'pa': 'Punjabi', 'as': 'Assamese', 'mai': 'Maithili', 'sa': 'Sanskrit', 'ne': 'Nepali', 'ks': 'Kashmiri', 'sd': 'Sindhi', 'brx': 'Bodo', 'doi': 'Dogri', 'kok': 'Konkani', 'mni': 'Manipuri', 'sat': 'Santali' } def get_language_name(language_code: str) -> str: """Get human-readable language name""" return LANGUAGE_NAMES.get(language_code.lower(), 'Unknown') if __name__ == "__main__": # Test the language detector test_texts = { 'hi': 'नमस्ते, आप कैसे हैं? यह हिंदी भाषा का परीक्षण है।', 'bn': 'নমস্কার, আপনি কেমন আছেন? এটি বাংলা ভাষার পরীক্ষা।', 'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்? இது தமிழ் மொழியின் சோதனை.', 'te': 'నమస్కారం, మీరు ఎలా ఉన్నారు? ఇది తెలుగు భాష పరీక్ష.', 'ml': 'നമസ്കാരം, നിങ്ങൾ എങ്ങനെയുണ്ട്? ഇത് മലയാളം ഭാഷയുടെ പരീക്ഷണം.', 'ur': 'السلام علیکم، آپ کیسے ہیں؟ یہ اردو زبان کا امتحان ہے۔', 'gu': 'નમસ્તે, તમે કેમ છો? આ ગુજરાતી ભાષાની કસોટી છે.', 'kn': 'ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ? ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ.', } print("Testing IndicLanguageDetector:") print("=" * 60) detector = IndicLanguageDetector() for expected_lang, text in test_texts.items(): result = validate_language_detection(text, expected_lang) print(f"\n{expected_lang.upper()} ({get_language_name(expected_lang)}):") print(f"Text: {text}") print(f"Expected: {result['expected']}") print(f"Detected: {result['detected']}") print(f"Confidence: {result['confidence']:.3f}") print(f"Correct: {'✅' if result['correct'] else '❌'}") print(f"Script: {result['script']}") print(f"\nSupported Languages: {len(detector.get_supported_languages())}") print(f"Languages: {', '.join(detector.get_supported_languages())}")