"""
Enhanced language detection for Indian languages.

Optimized for the IndicWhisper ASR pipeline, with comprehensive script
detection covering the 22 scheduled Indian languages.
"""

import re
import logging
from typing import Optional, Dict, List, Tuple, Any

# langdetect is optional; script- and character-based detection work without it.
try:
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0  # make langdetect results deterministic
    LANGDETECT_AVAILABLE = True
except ImportError:
    LANGDETECT_AVAILABLE = False

logger = logging.getLogger(__name__)


class IndicLanguageDetector:
    """Comprehensive language detection for the 22 scheduled Indian languages"""

    def __init__(self):
        # Unicode block patterns for the major Indic scripts.
        self.script_patterns = {
            'devanagari': r'[\u0900-\u097f]',
            'bengali': r'[\u0980-\u09ff]',
            'tamil': r'[\u0b80-\u0bff]',
            'telugu': r'[\u0c00-\u0c7f]',
            'malayalam': r'[\u0d00-\u0d7f]',
            'kannada': r'[\u0c80-\u0cff]',
            'gujarati': r'[\u0a80-\u0aff]',
            'punjabi': r'[\u0a00-\u0a7f]',   # Gurmukhi block
            'odia': r'[\u0b00-\u0b7f]',
            'arabic': r'[\u0600-\u06ff]',    # Urdu, Kashmiri, Sindhi
            'olchiki': r'[\u1c50-\u1c7f]',   # Ol Chiki (Santali)
            'meitei': r'[\uabc0-\uabff]',    # Meetei Mayek (Manipuri)
        }

        # Languages commonly written in each script; shared scripts are
        # disambiguated by the helper methods below.
        self.script_to_languages = {
            'devanagari': ['hi', 'mr', 'ne', 'mai', 'sa', 'brx', 'doi', 'kok'],
            'bengali': ['bn', 'as'],
            'tamil': ['ta'],
            'telugu': ['te'],
            'malayalam': ['ml'],
            'kannada': ['kn'],
            'gujarati': ['gu'],
            'punjabi': ['pa'],
            'odia': ['or'],
            'arabic': ['ur', 'ks', 'sd'],
            'olchiki': ['sat'],
            'meitei': ['mni'],
        }

        self.supported_languages = {
            'hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'or', 'pa', 'as',
            'mai', 'sa', 'ne', 'ks', 'sd', 'brx', 'doi', 'kok', 'mni', 'sat'
        }

        # Frequent letters per language, used when script detection alone is
        # not conclusive.
        self.char_patterns = {
            'bn': r'[হবদকগপতনমলরস]',
            'ta': r'[தகநலமபவசரன]',
            'te': r'[తకనలమపవసరణ]',
            'ml': r'[തകനലമപവസരണ]',
            'kn': r'[ತಕನಲಮಪವಸರಣ]',
            'gu': r'[તકનલમપવસરણ]',
            'pa': r'[ਤਕਨਲਮਪਵਸਰਣ]',
            'or': r'[ତକନଲମପଵସରଣ]',
            'as': r'[তৰৱখগঘচছজঝ]',
            'ur': r'[اردوپہکتنلمسرع]',
        }

        # Prior weights reflecting how reliable the character patterns are
        # for each language.
        self.language_weights = {
            'hi': 1.0,
            'bn': 0.9,
            'te': 0.9,
            'mr': 0.8,
            'ta': 0.9,
            'ur': 0.8,
            'gu': 0.8,
            'kn': 0.8,
            'ml': 0.8,
            'or': 0.7,
            'pa': 0.7,
            'as': 0.7,
        }

    def detect_script(self, text: str) -> Optional[str]:
        """Detect the dominant script family in the text via coverage scoring"""
        if not text or not text.strip():
            return None

        script_scores = {}
        for script, pattern in self.script_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                # Fraction of non-space characters belonging to this script.
                score = len(matches) / len(text.replace(' ', ''))
                script_scores[script] = score

        if script_scores:
            # Return the script with the highest coverage.
            return max(script_scores.items(), key=lambda x: x[1])[0]

        return None
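
    # Worked example of the coverage score above (illustrative, not from the
    # original module): for the mixed input "नमस्ते hello", the Devanagari
    # pattern matches 6 of the 11 non-space characters, giving a score of
    # about 0.55, while no other script pattern matches at all.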

    def detect_language_from_script(self, text: str) -> Optional[str]:
        """Get most likely language based on script detection"""
        script = self.detect_script(text)
        if not script or script not in self.script_to_languages:
            return None

        possible_languages = self.script_to_languages[script]

        # Scripts used by a single language are unambiguous.
        if len(possible_languages) == 1:
            return possible_languages[0]

        # Shared scripts need further disambiguation.
        if script == 'devanagari':
            return self._detect_devanagari_language(text, possible_languages)
        elif script == 'bengali':
            return self._detect_bengali_script_language(text, possible_languages)
        elif script == 'arabic':
            return self._detect_arabic_script_language(text, possible_languages)

        return possible_languages[0]

    def _detect_devanagari_language(self, text: str, candidates: List[str]) -> str:
        """Heuristic disambiguation of Devanagari-script languages"""
        # Marathi: ळ is the strongest cue; झ and ञ add weaker evidence.
        if re.search(r'[ळझञ]', text):
            return 'mr'

        # Nepali: rough heuristic based on vowel/consonant co-occurrence.
        if re.search(r'[ऋएौ].*[नत]', text):
            return 'ne'

        # Sanskrit: dense use of conjuncts such as क्ष, त्र, ज्ञ, श्र.
        if len(re.findall(r'(?:क्ष|त्र|ज्ञ|श्र)', text)) > 2:
            return 'sa'

        # Default to Hindi, the most widely used Devanagari language.
        return 'hi'

    def _detect_bengali_script_language(self, text: str, candidates: List[str]) -> str:
        """Distinguish between Bengali and Assamese"""
        # ৰ and ৱ are specific to Assamese orthography.
        if re.search(r'[ৰৱ]', text):
            return 'as'
        return 'bn'

    def _detect_arabic_script_language(self, text: str, candidates: List[str]) -> str:
        """Distinguish between Urdu, Kashmiri, and Sindhi"""
        # Kashmiri/Sindhi disambiguation is not implemented yet; default to Urdu.
        return 'ur'

    def detect_with_langdetect(self, text: str) -> Optional[str]:
        """Enhanced langdetect with Indian language support"""
        if not LANGDETECT_AVAILABLE:
            logger.warning("langdetect not available, skipping")
            return None

        try:
            # langdetect is unreliable on very short inputs.
            if len(text.strip()) < 10:
                return None

            detected = detect(text)
            if detected in self.supported_languages:
                logger.debug(f"langdetect successful: {detected}")
                return detected
            else:
                logger.debug(f"langdetect returned non-Indian language: {detected}")
                return None

        except Exception as e:
            logger.warning(f"langdetect failed: {e}")
            return None

    def detect_with_character_patterns(self, text: str) -> Optional[str]:
        """Language detection using character-specific patterns"""
        if not text or len(text.strip()) < 5:
            return None

        language_scores = {}
        for lang_code, pattern in self.char_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                # Pattern coverage, weighted by the per-language prior.
                base_score = len(matches) / len(text.replace(' ', ''))
                weight = self.language_weights.get(lang_code, 0.5)
                language_scores[lang_code] = base_score * weight

        if language_scores:
            best_lang = max(language_scores.items(), key=lambda x: x[1])[0]
            logger.debug(f"Character pattern detection: {best_lang} (score: {language_scores[best_lang]:.3f})")
            return best_lang

        return None

    def detect_language(self, text: str) -> str:
        """
        Comprehensive language detection for Indian languages.

        Multi-strategy approach with fallbacks:
        1. Script-based detection (most reliable for Indic text)
        2. Character-pattern matching
        3. langdetect fallback
        4. Simple character-membership fallback
        5. Default to Hindi
        """
        if not text or not text.strip():
            return 'hi'

        cleaned_text = text.strip()

        # 1. Script-based detection.
        script_lang = self.detect_language_from_script(cleaned_text)
        if script_lang:
            logger.debug(f"Script-based detection: {script_lang}")
            return script_lang

        # 2. Character-pattern detection.
        pattern_lang = self.detect_with_character_patterns(cleaned_text)
        if pattern_lang:
            logger.debug(f"Pattern-based detection: {pattern_lang}")
            return pattern_lang

        # 3. langdetect fallback.
        langdetect_result = self.detect_with_langdetect(cleaned_text)
        if langdetect_result:
            logger.debug(f"langdetect result: {langdetect_result}")
            return langdetect_result

        # 4. Simple character-membership fallback.
        fallback_lang = self._fallback_detection(cleaned_text)
        if fallback_lang:
            logger.debug(f"Fallback detection: {fallback_lang}")
            return fallback_lang

        # 5. Default to Hindi.
        logger.debug("Using default language: Hindi")
        return 'hi'
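
    # Example of the cascade (illustrative, not from the original module): for a
    # purely Latin-script input such as "hello world", the script and character
    # strategies find nothing, langdetect (if installed) typically reports a
    # non-Indian code that is discarded, and the method falls back to 'hi'.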

    def _fallback_detection(self, text: str) -> Optional[str]:
        """Simple fallback detection based on common character patterns"""
        if any(char in text for char in 'হবদকগপ'):
            return "bn"
        elif any(char in text for char in 'தகநலம'):
            return "ta"
        elif any(char in text for char in 'తకనలమ'):
            return "te"
        elif any(char in text for char in 'തകനലമ'):
            return "ml"
        elif any(char in text for char in 'ತಕನಲಮ'):
            return "kn"
        elif any(char in text for char in 'તકનલમ'):
            return "gu"
        elif any(char in text for char in 'ਤਕਨਲਮ'):
            return "pa"
        elif any(char in text for char in 'ତକନଲମ'):
            return "or"
        elif any(char in text for char in 'اردوپہک'):
            return "ur"

        return None

    def get_supported_languages(self) -> List[str]:
        """Get list of all supported languages"""
        return sorted(self.supported_languages)

    def get_language_confidence(self, text: str, language: str) -> float:
        """Get a heuristic confidence score for a detected language"""
        if not text or language not in self.supported_languages:
            return 0.0

        script = self.detect_script(text)
        if not script:
            return 0.1

        # Script agreement carries most of the confidence.
        if language in self.script_to_languages.get(script, []):
            base_confidence = 0.8
        else:
            base_confidence = 0.3

        # Character-pattern coverage adds at most 0.2.
        if language in self.char_patterns:
            pattern = self.char_patterns[language]
            matches = re.findall(pattern, text)
            if matches:
                char_boost = min(len(matches) / len(text.replace(' ', '')), 0.2)
                base_confidence += char_boost

        return min(base_confidence, 1.0)
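
    # Worked example of the heuristic above (illustrative): for a clean Telugu
    # sentence, detect_script() returns 'telugu', so base_confidence = 0.8; the
    # character-pattern boost is capped at 0.2, so the final score is at most 1.0.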


def detect_language(text: str) -> str:
    """Standalone language detection function for backward compatibility"""
    detector = IndicLanguageDetector()
    return detector.detect_language(text)


def detect_language_with_confidence(text: str) -> Tuple[str, float]:
    """Detect language and return confidence score"""
    detector = IndicLanguageDetector()
    language = detector.detect_language(text)
    confidence = detector.get_language_confidence(text, language)
    return language, confidence
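
# Example use of the module-level helpers (illustrative sketch; the sample
# string is an assumption, not taken from the original module):
#
#     lang, score = detect_language_with_confidence("నమస్కారం")
#     # lang  -> 'te'  (the Telugu script maps to a single language)
#     # score -> close to 1.0 for a clean single-script input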


def validate_language_detection(text: str, expected_language: str) -> Dict[str, Any]:
    """Validate language detection accuracy"""
    detector = IndicLanguageDetector()
    detected = detector.detect_language(text)
    confidence = detector.get_language_confidence(text, detected)

    return {
        'text': text,
        'expected': expected_language,
        'detected': detected,
        'confidence': confidence,
        'correct': detected == expected_language,
        'script': detector.detect_script(text)
    }


LANGUAGE_NAMES = {
    'hi': 'Hindi', 'bn': 'Bengali', 'te': 'Telugu', 'mr': 'Marathi',
    'ta': 'Tamil', 'ur': 'Urdu', 'gu': 'Gujarati', 'kn': 'Kannada',
    'ml': 'Malayalam', 'or': 'Odia', 'pa': 'Punjabi', 'as': 'Assamese',
    'mai': 'Maithili', 'sa': 'Sanskrit', 'ne': 'Nepali', 'ks': 'Kashmiri',
    'sd': 'Sindhi', 'brx': 'Bodo', 'doi': 'Dogri', 'kok': 'Konkani',
    'mni': 'Manipuri', 'sat': 'Santali'
}


def get_language_name(language_code: str) -> str:
    """Get human-readable language name"""
    return LANGUAGE_NAMES.get(language_code.lower(), 'Unknown')


if __name__ == "__main__":
    test_texts = {
        'hi': 'नमस्ते, आप कैसे हैं? यह हिंदी भाषा का परीक्षण है।',
        'bn': 'নমস্কার, আপনি কেমন আছেন? এটি বাংলা ভাষার পরীক্ষা।',
        'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்? இது தமிழ் மொழியின் சோதனை.',
        'te': 'నమస్కారం, మీరు ఎలా ఉన్నారు? ఇది తెలుగు భాష పరీక్ష.',
        'ml': 'നമസ്കാരം, നിങ്ങൾ എങ്ങനെയുണ്ട്? ഇത് മലയാളം ഭാഷയുടെ പരീക്ഷണം.',
        'ur': 'السلام علیکم، آپ کیسے ہیں؟ یہ اردو زبان کا امتحان ہے۔',
        'gu': 'નમસ્તે, તમે કેમ છો? આ ગુજરાતી ભાષાની કસોટી છે.',
        'kn': 'ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ? ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ.',
    }

    print("Testing IndicLanguageDetector:")
    print("=" * 60)

    detector = IndicLanguageDetector()

    for expected_lang, text in test_texts.items():
        result = validate_language_detection(text, expected_lang)

        print(f"\n{expected_lang.upper()} ({get_language_name(expected_lang)}):")
        print(f"Text: {text}")
        print(f"Expected: {result['expected']}")
        print(f"Detected: {result['detected']}")
        print(f"Confidence: {result['confidence']:.3f}")
        print(f"Correct: {'✅' if result['correct'] else '❌'}")
        print(f"Script: {result['script']}")

    print(f"\nSupported Languages: {len(detector.get_supported_languages())}")
    print(f"Languages: {', '.join(detector.get_supported_languages())}")
|