File size: 16,976 Bytes
5b79694 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 |
"""
Enhanced language detection for ALL Indian languages
Optimized for IndicWhisper ASR pipeline with comprehensive script detection
"""
import re
from typing import Optional, Dict, List, Tuple, Any
import logging
# Safe langdetect import with fallback
try:
    from langdetect import detect, DetectorFactory

    # Fixed seed makes langdetect's probabilistic detection deterministic.
    DetectorFactory.seed = 0
    LANGDETECT_AVAILABLE = True
except ImportError:
    # Optional dependency: detection degrades to script/character heuristics.
    LANGDETECT_AVAILABLE = False

logger = logging.getLogger(__name__)
class IndicLanguageDetector:
    """Comprehensive language detection for all 22+ Indian languages.

    Detection is layered, most reliable strategy first:

    1. Unicode-script matching (an Indic script pins the language family),
    2. language-specific character-frequency patterns,
    3. ``langdetect`` fallback (if the package is installed),
    4. default to Hindi.
    """

    # Class-level logger so instances also work when the module-level
    # ``logger`` is not in scope (e.g. the class imported in isolation).
    _log = logging.getLogger(__name__)

    def __init__(self) -> None:
        # Unicode block per script family; one script may serve many languages.
        self.script_patterns: Dict[str, str] = {
            'devanagari': r'[\u0900-\u097f]',  # Hindi, Marathi, Sanskrit, Nepali, Bodo, Dogri, Konkani, Maithili
            'bengali': r'[\u0980-\u09ff]',     # Bengali, Assamese
            'tamil': r'[\u0b80-\u0bff]',       # Tamil
            'telugu': r'[\u0c00-\u0c7f]',      # Telugu
            'malayalam': r'[\u0d00-\u0d7f]',   # Malayalam
            'kannada': r'[\u0c80-\u0cff]',     # Kannada
            'gujarati': r'[\u0a80-\u0aff]',    # Gujarati
            'punjabi': r'[\u0a00-\u0a7f]',     # Punjabi (Gurmukhi)
            'odia': r'[\u0b00-\u0b7f]',        # Odia
            'arabic': r'[\u0600-\u06ff]',      # Urdu, Kashmiri, Sindhi
            'olchiki': r'[\u1c50-\u1c7f]',     # Santali
            'meitei': r'[\uabc0-\uabff]',      # Manipuri
        }
        # Candidate ISO 639 codes per script, most common language first.
        self.script_to_languages: Dict[str, List[str]] = {
            'devanagari': ['hi', 'mr', 'ne', 'mai', 'sa', 'brx', 'doi', 'kok'],
            'bengali': ['bn', 'as'],
            'tamil': ['ta'],
            'telugu': ['te'],
            'malayalam': ['ml'],
            'kannada': ['kn'],
            'gujarati': ['gu'],
            'punjabi': ['pa'],
            'odia': ['or'],
            'arabic': ['ur', 'ks', 'sd'],
            'olchiki': ['sat'],
            'meitei': ['mni'],
        }
        # All supported Indian languages (22 official + others).
        self.supported_languages = {
            'hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'or', 'pa', 'as',
            'mai', 'sa', 'ne', 'ks', 'sd', 'brx', 'doi', 'kok', 'mni', 'sat'
        }
        # Frequent characters per language, used when script detection fails.
        # NOTE(fix): the Assamese ('as') pattern is limited to the two letters
        # absent from standard Bengali (ৰ U+09F0, ৱ U+09F1); the previous
        # pattern also matched ordinary Bengali letters (ত, খ, গ, ঘ, চ, ছ, জ,
        # ঝ) and misclassified Bengali text as Assamese.
        self.char_patterns: Dict[str, str] = {
            'bn': r'[হবদকগপতনমলরস]',   # Bengali
            'ta': r'[தகநலமபவசரன]',      # Tamil
            'te': r'[తకనలమపవసరణ]',     # Telugu
            'ml': r'[തകനലമപവസരണ]',     # Malayalam
            'kn': r'[ತಕನಲಮಪವಸರಣ]',     # Kannada
            'gu': r'[તકનલમપવસરણ]',     # Gujarati
            'pa': r'[ਤਕਨਲਮਪਵਸਰਣ]',     # Punjabi
            'or': r'[ତକନଲମପଵସରଣ]',     # Odia
            'as': r'[ৰৱ]',              # Assamese-only letters
            'ur': r'[اردوپہکتنلمسرع]',  # Urdu
        }
        # Relative confidence weights used by pattern-based detection.
        self.language_weights: Dict[str, float] = {
            'hi': 1.0,  # Hindi (highest priority for Devanagari)
            'bn': 0.9,  # Bengali
            'te': 0.9,  # Telugu
            'mr': 0.8,  # Marathi
            'ta': 0.9,  # Tamil
            'ur': 0.8,  # Urdu
            'gu': 0.8,  # Gujarati
            'kn': 0.8,  # Kannada
            'ml': 0.8,  # Malayalam
            'or': 0.7,  # Odia
            'pa': 0.7,  # Punjabi
            'as': 0.7,  # Assamese
        }

    def detect_script(self, text: str) -> Optional[str]:
        """Return the dominant script family of *text*, or None.

        Each script is scored by the fraction of (space-stripped) characters
        that fall in its Unicode block; the highest-scoring script wins.
        """
        if not text or not text.strip():
            return None
        script_scores: Dict[str, float] = {}
        for script, pattern in self.script_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                # Score = share of matching characters among non-space chars.
                score = len(matches) / len(text.replace(' ', ''))
                script_scores[script] = score
        if script_scores:
            return max(script_scores.items(), key=lambda x: x[1])[0]
        return None

    def detect_language_from_script(self, text: str) -> Optional[str]:
        """Return the most likely language code based on script detection.

        Scripts that serve a single language resolve immediately; shared
        scripts (Devanagari, Bengali, Arabic) are disambiguated by
        language-specific heuristics, defaulting to the script's most
        common language.
        """
        script = self.detect_script(text)
        if not script or script not in self.script_to_languages:
            return None
        possible_languages = self.script_to_languages[script]
        if len(possible_languages) == 1:
            return possible_languages[0]
        if script == 'devanagari':
            return self._detect_devanagari_language(text, possible_languages)
        elif script == 'bengali':
            return self._detect_bengali_script_language(text, possible_languages)
        elif script == 'arabic':
            return self._detect_arabic_script_language(text, possible_languages)
        # Default to first (most common) language for that script.
        return possible_languages[0]

    def _detect_devanagari_language(self, text: str, candidates: List[str]) -> str:
        """Disambiguate Devanagari-script text; defaults to Hindi."""
        # Marathi-specific letters.
        if re.search(r'[ळझञ]', text):
            return 'mr'
        # Heuristic for common Nepali vowel/consonant sequences.
        if re.search(r'[ऋएौ].*[नत]', text):
            return 'ne'
        # Sanskrit tends to use many conjunct clusters. FIX: alternation must
        # be a group, not a character class — the original r'[क्ष|त्र|ज्ञ|श्र]'
        # matched the individual letters (and '|'), so ordinary Hindi text
        # easily exceeded the threshold and was misclassified as Sanskrit.
        if len(re.findall(r'(?:क्ष|त्र|ज्ञ|श्र)', text)) > 2:
            return 'sa'
        # Default to Hindi for Devanagari.
        return 'hi'

    def _detect_bengali_script_language(self, text: str, candidates: List[str]) -> str:
        """Distinguish Bengali from Assamese within the Bengali script block.

        FIX: only ৰ (U+09F0) and ৱ (U+09F1) are unique to Assamese; the
        previous check also used খ/গ/ঘ, which are ordinary Bengali letters
        and caused Bengali text to be misclassified as Assamese.
        """
        if re.search(r'[ৰৱ]', text):
            return 'as'
        return 'bn'

    def _detect_arabic_script_language(self, text: str, candidates: List[str]) -> str:
        """Distinguish between Urdu, Kashmiri, and Sindhi.

        Urdu is by far the most common Arabic-script language in the Indian
        context; specific Kashmiri/Sindhi character checks could be added
        here if needed.
        """
        return 'ur'

    def detect_with_langdetect(self, text: str) -> Optional[str]:
        """Fallback detection via ``langdetect``; returns None if unusable.

        Only returns a result when it is one of the supported Indian
        languages; short texts (< 10 chars) are skipped as unreliable.
        """
        if not LANGDETECT_AVAILABLE:
            self._log.warning("langdetect not available, skipping")
            return None
        try:
            if len(text.strip()) < 10:
                return None
            detected = detect(text)
            if detected in self.supported_languages:
                self._log.debug(f"langdetect successful: {detected}")
                return detected
            else:
                self._log.debug(f"langdetect returned non-Indian language: {detected}")
                return None
        except Exception as e:
            # langdetect raises its own exception type on undetectable text.
            self._log.warning(f"langdetect failed: {e}")
            return None

    def detect_with_character_patterns(self, text: str) -> Optional[str]:
        """Language detection using per-language character patterns.

        Scores each language by matching-character frequency times its
        confidence weight; returns the best-scoring language or None.
        """
        if not text or len(text.strip()) < 5:
            return None
        language_scores: Dict[str, float] = {}
        for lang_code, pattern in self.char_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                base_score = len(matches) / len(text.replace(' ', ''))
                weight = self.language_weights.get(lang_code, 0.5)
                language_scores[lang_code] = base_score * weight
        if language_scores:
            best_lang = max(language_scores.items(), key=lambda x: x[1])[0]
            self._log.debug(f"Character pattern detection: {best_lang} (score: {language_scores[best_lang]:.3f})")
            return best_lang
        return None

    def detect_language(self, text: str) -> str:
        """Detect the language of *text*; always returns a language code.

        Multi-strategy approach with fallbacks:
        1. Script-based detection (most reliable for Indic)
        2. Character pattern matching
        3. langdetect fallback
        4. Simple character-membership fallback, then default to Hindi
        """
        if not text or not text.strip():
            return 'hi'  # Default to Hindi
        cleaned_text = text.strip()
        # Strategy 1: script-based detection.
        script_lang = self.detect_language_from_script(cleaned_text)
        if script_lang:
            self._log.debug(f"Script-based detection: {script_lang}")
            return script_lang
        # Strategy 2: character pattern matching.
        pattern_lang = self.detect_with_character_patterns(cleaned_text)
        if pattern_lang:
            self._log.debug(f"Pattern-based detection: {pattern_lang}")
            return pattern_lang
        # Strategy 3: langdetect fallback (if available).
        langdetect_result = self.detect_with_langdetect(cleaned_text)
        if langdetect_result:
            self._log.debug(f"langdetect result: {langdetect_result}")
            return langdetect_result
        # Strategy 4: fallback based on common characters.
        fallback_lang = self._fallback_detection(cleaned_text)
        if fallback_lang:
            self._log.debug(f"Fallback detection: {fallback_lang}")
            return fallback_lang
        # Final fallback: default to Hindi.
        self._log.debug("Using default language: Hindi")
        return 'hi'

    def _fallback_detection(self, text: str) -> Optional[str]:
        """Last-resort detection via membership of a few common characters."""
        if any(char in text for char in 'হবদকগপ'):  # Bengali chars
            return "bn"
        elif any(char in text for char in 'தகநலம'):  # Tamil chars
            return "ta"
        elif any(char in text for char in 'తకనలమ'):  # Telugu chars
            return "te"
        elif any(char in text for char in 'തകനലമ'):  # Malayalam chars
            return "ml"
        elif any(char in text for char in 'ತಕನಲಮ'):  # Kannada chars
            return "kn"
        elif any(char in text for char in 'તકનલમ'):  # Gujarati chars
            return "gu"
        elif any(char in text for char in 'ਤਕਨਲਮ'):  # Punjabi chars
            return "pa"
        elif any(char in text for char in 'ତକନଲମ'):  # Odia chars
            return "or"
        elif any(char in text for char in 'اردوپہک'):  # Urdu chars
            return "ur"
        return None

    def get_supported_languages(self) -> List[str]:
        """Return the sorted list of all supported language codes."""
        return sorted(list(self.supported_languages))

    def get_language_confidence(self, text: str, language: str) -> float:
        """Return a confidence score in [0.0, 1.0] for *language* on *text*.

        0.8 base when the detected script agrees with *language*, 0.3 when
        it does not, 0.1 when no script is detected; boosted by up to 0.2
        for matching language-specific characters, capped at 1.0.
        """
        if not text or language not in self.supported_languages:
            return 0.0
        script = self.detect_script(text)
        if not script:
            return 0.1
        # Base confidence from script/language agreement.
        if language in self.script_to_languages.get(script, []):
            base_confidence = 0.8
        else:
            base_confidence = 0.3
        # Boost confidence with character pattern matching.
        if language in self.char_patterns:
            pattern = self.char_patterns[language]
            matches = re.findall(pattern, text)
            if matches:
                char_boost = min(len(matches) / len(text.replace(' ', '')), 0.2)
                base_confidence += char_boost
        return min(base_confidence, 1.0)
# Standalone function for backward compatibility
# Lazily-created shared detector: building IndicLanguageDetector fills several
# pattern tables, so the original per-call construction was wasteful.
_shared_detector: Optional["IndicLanguageDetector"] = None


def detect_language(text: str) -> str:
    """Standalone language detection function for backward compatibility.

    Reuses a module-level ``IndicLanguageDetector`` instance instead of
    constructing one per call; the detector is stateless across calls.
    """
    global _shared_detector
    if _shared_detector is None:
        _shared_detector = IndicLanguageDetector()
    return _shared_detector.detect_language(text)
# Enhanced detection with confidence
def detect_language_with_confidence(text: str) -> Tuple[str, float]:
    """Detect the language of *text* and score the detection.

    Returns:
        A ``(language_code, confidence)`` pair; confidence is in [0.0, 1.0].
    """
    detector = IndicLanguageDetector()
    detected = detector.detect_language(text)
    return detected, detector.get_language_confidence(text, detected)
# Validation and testing functions
def validate_language_detection(text: str, expected_language: str) -> Dict[str, Any]:
    """Run detection on *text* and report how it compares to the expectation.

    Returns:
        A dict with the input text, expected and detected codes, the
        detector's confidence, a correctness flag, and the detected script.
    """
    detector = IndicLanguageDetector()
    detected_code = detector.detect_language(text)
    report: Dict[str, Any] = {
        'text': text,
        'expected': expected_language,
        'detected': detected_code,
        'confidence': detector.get_language_confidence(text, detected_code),
        'correct': detected_code == expected_language,
        'script': detector.detect_script(text),
    }
    return report
# Language metadata for API responses
# ISO 639 code -> human-readable English name, for API responses.
LANGUAGE_NAMES = {
    'hi': 'Hindi',
    'bn': 'Bengali',
    'te': 'Telugu',
    'mr': 'Marathi',
    'ta': 'Tamil',
    'ur': 'Urdu',
    'gu': 'Gujarati',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'or': 'Odia',
    'pa': 'Punjabi',
    'as': 'Assamese',
    'mai': 'Maithili',
    'sa': 'Sanskrit',
    'ne': 'Nepali',
    'ks': 'Kashmiri',
    'sd': 'Sindhi',
    'brx': 'Bodo',
    'doi': 'Dogri',
    'kok': 'Konkani',
    'mni': 'Manipuri',
    'sat': 'Santali',
}


def get_language_name(language_code: str) -> str:
    """Return the English name for *language_code* (case-insensitive).

    Unknown codes yield the string ``'Unknown'``.
    """
    normalized = language_code.lower()
    return LANGUAGE_NAMES.get(normalized, 'Unknown')
if __name__ == "__main__":
# Test the language detector
test_texts = {
'hi': 'नमस्ते, आप कैसे हैं? यह हिंदी भाषा का परीक्षण है।',
'bn': 'নমস্কার, আপনি কেমন আছেন? এটি বাংলা ভাষার পরীক্ষা।',
'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்? இது தமிழ் மொழியின் சோதனை.',
'te': 'నమస్కారం, మీరు ఎలా ఉన్నారు? ఇది తెలుగు భాష పరీక్ష.',
'ml': 'നമസ്കാരം, നിങ്ങൾ എങ്ങനെയുണ്ട്? ഇത് മലയാളം ഭാഷയുടെ പരീക്ഷണം.',
'ur': 'السلام علیکم، آپ کیسے ہیں؟ یہ اردو زبان کا امتحان ہے۔',
'gu': 'નમસ્તે, તમે કેમ છો? આ ગુજરાતી ભાષાની કસોટી છે.',
'kn': 'ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ? ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ.',
}
print("Testing IndicLanguageDetector:")
print("=" * 60)
detector = IndicLanguageDetector()
for expected_lang, text in test_texts.items():
result = validate_language_detection(text, expected_lang)
print(f"\n{expected_lang.upper()} ({get_language_name(expected_lang)}):")
print(f"Text: {text}")
print(f"Expected: {result['expected']}")
print(f"Detected: {result['detected']}")
print(f"Confidence: {result['confidence']:.3f}")
print(f"Correct: {'✅' if result['correct'] else '❌'}")
print(f"Script: {result['script']}")
print(f"\nSupported Languages: {len(detector.get_supported_languages())}")
print(f"Languages: {', '.join(detector.get_supported_languages())}")
|