# ASR-NEW / language_detector.py
# kasimali67
# Update ASR pipeline code and configs
# 5b79694
"""
Enhanced language detection for ALL Indian languages
Optimized for IndicWhisper ASR pipeline with comprehensive script detection
"""
import re
from typing import Optional, Dict, List, Tuple, Any
import logging
# langdetect is an optional dependency: record whether it could be imported
# so callers can skip that strategy instead of failing at import time.
try:
    from langdetect import detect, DetectorFactory
except ImportError:
    LANGDETECT_AVAILABLE = False
else:
    # Pin the seed before any detection call is made.
    DetectorFactory.seed = 0
    LANGDETECT_AVAILABLE = True

logger = logging.getLogger(__name__)
class IndicLanguageDetector:
    """Heuristic language detector for Indian languages, keyed on Unicode script.

    Detection first identifies the dominant script of the text from Unicode
    block ranges.  Scripts used by a single supported language map directly
    to that language; scripts shared by several languages (Devanagari,
    Bengali-Assamese, Arabic) go through a small disambiguation heuristic.
    All heuristics are best-effort: when nothing distinctive is found the
    most common language of the script is returned.
    """

    # A Devanagari "word": a run of Devanagari code points, excluding the
    # danda / double danda (U+0964, U+0965), which are punctuation.  ZWNJ and
    # ZWJ are kept so that joiner-controlled spellings stay in one token.
    _DEVANAGARI_WORD = r'[\u0900-\u0963\u0966-\u097f\u200c\u200d]+'

    # High-frequency function words compared against WHOLE tokens only, so
    # that e.g. a Hindi word merely containing one of them does not match.
    _MARATHI_WORDS = frozenset({'आहे', 'आहेत', 'आणि', 'नाही'})
    _NEPALI_WORDS = frozenset({'छ', 'छन्', 'छु', 'हुन्छ', 'थियो', 'तपाईं'})
    _NEPALI_PLURAL_SUFFIX = 'हरू'

    # Conjuncts counted for the Sanskrit heuristic.  This MUST be an
    # alternation: inside [...] the same text would match any single
    # consonant, the virama, or '|', which is true of almost any sentence.
    _SANSKRIT_CONJUNCTS = r'क्ष|त्र|ज्ञ|श्र'
    _SANSKRIT_MIN_CONJUNCTS = 3

    # U+09F0 (ra with middle diagonal) and U+09F1 (ra with lower diagonal)
    # are used by Assamese but not by Bengali.
    _ASSAMESE_ONLY = r'[\u09f0\u09f1]'

    # Ordered (language, characters) table used by _fallback_detection.
    _FALLBACK_CHARS = (
        ('bn', 'হবদকগপ'),   # Bengali chars
        ('ta', 'தகநலம'),    # Tamil chars
        ('te', 'తకనలమ'),    # Telugu chars
        ('ml', 'തകനലമ'),    # Malayalam chars
        ('kn', 'ತಕನಲಮ'),    # Kannada chars
        ('gu', 'તકનલમ'),    # Gujarati chars
        ('pa', 'ਤਕਨਲਮ'),    # Punjabi chars
        ('or', 'ତକନଲମ'),    # Odia chars
        ('ur', 'اردوپہک'),  # Urdu chars
    )

    def __init__(self):
        # Unicode block ranges, one regex character class per script
        self.script_patterns = {
            'devanagari': r'[\u0900-\u097f]',  # Hindi, Marathi, Sanskrit, Nepali, Bodo, Dogri, Konkani, Maithili
            'bengali': r'[\u0980-\u09ff]',     # Bengali, Assamese
            'tamil': r'[\u0b80-\u0bff]',       # Tamil
            'telugu': r'[\u0c00-\u0c7f]',      # Telugu
            'malayalam': r'[\u0d00-\u0d7f]',   # Malayalam
            'kannada': r'[\u0c80-\u0cff]',     # Kannada
            'gujarati': r'[\u0a80-\u0aff]',    # Gujarati
            'punjabi': r'[\u0a00-\u0a7f]',     # Punjabi (Gurmukhi)
            'odia': r'[\u0b00-\u0b7f]',        # Odia
            'arabic': r'[\u0600-\u06ff]',      # Urdu, Kashmiri, Sindhi
            'olchiki': r'[\u1c50-\u1c7f]',     # Santali
            'meitei': r'[\uabc0-\uabff]',      # Manipuri
        }
        # Script -> candidate languages; the first entry is the default
        self.script_to_languages = {
            'devanagari': ['hi', 'mr', 'ne', 'mai', 'sa', 'brx', 'doi', 'kok'],
            'bengali': ['bn', 'as'],
            'tamil': ['ta'],
            'telugu': ['te'],
            'malayalam': ['ml'],
            'kannada': ['kn'],
            'gujarati': ['gu'],
            'punjabi': ['pa'],
            'odia': ['or'],
            'arabic': ['ur', 'ks', 'sd'],
            'olchiki': ['sat'],
            'meitei': ['mni'],
        }
        # Every language code this detector may report
        self.supported_languages = {
            'hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'or', 'pa', 'as',
            'mai', 'sa', 'ne', 'ks', 'sd', 'brx', 'doi', 'kok', 'mni', 'sat'
        }
        # Frequent letters per language.  Several of these sets overlap for
        # languages sharing a script (e.g. 'bn' and 'as'), so they measure
        # script usage rather than uniquely identifying a language.
        self.char_patterns = {
            # Bengali characters
            'bn': r'[হবদকগপতনমলরস]',
            # Tamil characters
            'ta': r'[தகநலமபவசரன]',
            # Telugu characters
            'te': r'[తకనలమపవసరణ]',
            # Malayalam characters
            'ml': r'[തകനലമപവസരണ]',
            # Kannada characters
            'kn': r'[ತಕನಲಮಪವಸರಣ]',
            # Gujarati characters
            'gu': r'[તકનલમપવસરણ]',
            # Punjabi characters
            'pa': r'[ਤਕਨਲਮਪਵਸਰਣ]',
            # Odia characters
            'or': r'[ତକନଲମପଵସରଣ]',
            # Assamese characters
            'as': r'[তৰৱখগঘচছজঝ]',
            # Urdu characters
            'ur': r'[اردوپہکتنلمسرع]',
        }
        # Multipliers applied to character-pattern scores
        self.language_weights = {
            'hi': 1.0,  # Hindi (highest priority for Devanagari)
            'bn': 0.9,  # Bengali
            'te': 0.9,  # Telugu
            'mr': 0.8,  # Marathi
            'ta': 0.9,  # Tamil
            'ur': 0.8,  # Urdu
            'gu': 0.8,  # Gujarati
            'kn': 0.8,  # Kannada
            'ml': 0.8,  # Malayalam
            'or': 0.7,  # Odia
            'pa': 0.7,  # Punjabi
            'as': 0.7,  # Assamese
        }

    @staticmethod
    def _visible_length(text: str) -> int:
        """Return the number of non-whitespace characters in ``text``."""
        return len(''.join(text.split()))

    def detect_script(self, text: str) -> Optional[str]:
        """Return the name of the dominant script in ``text``.

        Each script is scored by the share of non-whitespace characters that
        fall in its Unicode range; the highest-scoring script wins (ties go
        to the script listed first in ``script_patterns``).

        Returns:
            A key of ``script_patterns``, or ``None`` for empty/blank text or
            text containing no character from any known script.
        """
        if not text or not text.strip():
            return None
        total = self._visible_length(text)
        script_scores = {}
        for script, pattern in self.script_patterns.items():
            hits = len(re.findall(pattern, text))
            if hits:
                script_scores[script] = hits / total
        if not script_scores:
            return None
        return max(script_scores.items(), key=lambda item: item[1])[0]

    def detect_language_from_script(self, text: str) -> Optional[str]:
        """Return the most likely language for ``text`` based on its script.

        Returns:
            A language code, or ``None`` when no known script is present.
        """
        script = self.detect_script(text)
        if not script or script not in self.script_to_languages:
            return None
        possible_languages = self.script_to_languages[script]
        # Single-language scripts need no disambiguation
        if len(possible_languages) == 1:
            return possible_languages[0]
        # Shared scripts are routed to their disambiguation heuristic
        disambiguators = {
            'devanagari': self._detect_devanagari_language,
            'bengali': self._detect_bengali_script_language,
            'arabic': self._detect_arabic_script_language,
        }
        disambiguate = disambiguators.get(script)
        if disambiguate is not None:
            return disambiguate(text, possible_languages)
        # Default to first (most common) language for that script
        return possible_languages[0]

    def _detect_devanagari_language(self, text: str, candidates: List[str]) -> str:
        """Pick a language for Devanagari text; defaults to Hindi.

        Only cues that do not occur in ordinary Hindi are used:

        * Marathi: the letter U+0933 (LLA) or a Marathi function word.
        * Nepali: a Nepali function word or the plural suffix.
        * Sanskrit: at least ``_SANSKRIT_MIN_CONJUNCTS`` occurrences of the
          listed conjuncts.  NOTE: this is a raw count, so a long Hindi
          passage rich in such conjuncts can still be reported as Sanskrit.

        Args:
            text: Text already known to be predominantly Devanagari.
            candidates: Accepted for interface compatibility; not consulted.

        Returns:
            One of ``'mr'``, ``'ne'``, ``'sa'`` or ``'hi'``.
        """
        words = set(re.findall(self._DEVANAGARI_WORD, text))

        if '\u0933' in text or words & self._MARATHI_WORDS:
            return 'mr'

        suffix = self._NEPALI_PLURAL_SUFFIX
        if words & self._NEPALI_WORDS or any(
            len(word) > len(suffix) and word.endswith(suffix) for word in words
        ):
            return 'ne'

        conjuncts = re.findall(self._SANSKRIT_CONJUNCTS, text)
        if len(conjuncts) >= self._SANSKRIT_MIN_CONJUNCTS:
            return 'sa'

        # Default to Hindi for Devanagari
        return 'hi'

    def _detect_bengali_script_language(self, text: str, candidates: List[str]) -> str:
        """Distinguish Assamese from Bengali; defaults to Bengali.

        Only the two Assamese-only letters are checked; letters shared by
        both languages carry no signal and would misreport Bengali text.

        Args:
            text: Text already known to be in the Bengali-Assamese script.
            candidates: Accepted for interface compatibility; not consulted.
        """
        if re.search(self._ASSAMESE_ONLY, text):
            return 'as'
        return 'bn'

    def _detect_arabic_script_language(self, text: str, candidates: List[str]) -> str:
        """Return the language for Arabic-script text.

        Always returns ``'ur'``: no Kashmiri- or Sindhi-specific cues are
        implemented, so those languages are never reported by this method.

        Args:
            text: Text already known to be in Arabic script (unused).
            candidates: Accepted for interface compatibility; not consulted.
        """
        return 'ur'

    def detect_with_langdetect(self, text: str) -> Optional[str]:
        """Detect the language with the optional ``langdetect`` package.

        Returns:
            The detected code if it is one of ``supported_languages``;
            ``None`` if langdetect is unavailable, the stripped text is
            shorter than 10 characters, the result is not a supported
            language, or detection raised.
        """
        if not LANGDETECT_AVAILABLE:
            logger.warning("langdetect not available, skipping")
            return None
        try:
            if len(text.strip()) < 10:
                return None
            detected = detect(text)
        except Exception as e:
            # Best-effort strategy: any failure just means "no answer"
            logger.warning("langdetect failed: %s", e)
            return None
        # Only return if it's a supported Indian language
        if detected in self.supported_languages:
            logger.debug("langdetect successful: %s", detected)
            return detected
        logger.debug("langdetect returned non-Indian language: %s", detected)
        return None

    def detect_with_character_patterns(self, text: str) -> Optional[str]:
        """Detect the language from per-language frequent-letter sets.

        Each language in ``char_patterns`` is scored as the share of
        non-whitespace characters matching its set, multiplied by its entry
        in ``language_weights`` (0.5 when absent).

        Returns:
            The best-scoring language code, or ``None`` when the stripped
            text is shorter than 5 characters or nothing matches.
        """
        if not text or len(text.strip()) < 5:
            return None
        total = self._visible_length(text)
        language_scores = {}
        for lang_code, pattern in self.char_patterns.items():
            hits = len(re.findall(pattern, text))
            if hits:
                weight = self.language_weights.get(lang_code, 0.5)
                language_scores[lang_code] = (hits / total) * weight
        if not language_scores:
            return None
        best_lang = max(language_scores.items(), key=lambda item: item[1])[0]
        logger.debug(
            "Character pattern detection: %s (score: %.3f)",
            best_lang, language_scores[best_lang],
        )
        return best_lang

    def detect_language(self, text: str) -> str:
        """Detect the language of ``text``, always returning a language code.

        Strategies are tried in order and the first answer wins:

        1. Script-based detection
        2. Character pattern matching
        3. langdetect (if installed)
        4. Simple character fallback
        5. Default to Hindi (``'hi'``)

        Empty or blank text returns ``'hi'`` immediately.
        """
        if not text or not text.strip():
            return 'hi'  # Default to Hindi
        cleaned_text = text.strip()

        strategies = (
            ("Script-based detection: %s", self.detect_language_from_script),
            ("Pattern-based detection: %s", self.detect_with_character_patterns),
            ("langdetect result: %s", self.detect_with_langdetect),
            ("Fallback detection: %s", self._fallback_detection),
        )
        for message, strategy in strategies:
            language = strategy(cleaned_text)
            if language:
                logger.debug(message, language)
                return language

        # Final fallback: Default to Hindi
        logger.debug("Using default language: Hindi")
        return 'hi'

    def _fallback_detection(self, text: str) -> Optional[str]:
        """Return the first language whose sample characters occur in ``text``.

        Languages are tried in the fixed order of ``_FALLBACK_CHARS``;
        ``None`` is returned when none of the sample characters is present.
        """
        for lang_code, sample_chars in self._FALLBACK_CHARS:
            if any(char in text for char in sample_chars):
                return lang_code
        return None

    def get_supported_languages(self) -> List[str]:
        """Return all supported language codes in sorted order."""
        return sorted(self.supported_languages)

    def get_language_confidence(self, text: str, language: str) -> float:
        """Return a heuristic confidence in [0.0, 1.0] that ``text`` is ``language``.

        * 0.0 for empty text or an unsupported language code.
        * 0.1 when no known script is found in the text.
        * Otherwise 0.8 if ``language`` is written in the detected script,
          else 0.3, plus up to 0.2 for the share of characters matching the
          language's entry in ``char_patterns`` (if it has one).
        """
        if not text or language not in self.supported_languages:
            return 0.0
        script = self.detect_script(text)
        if not script:
            return 0.1
        if language in self.script_to_languages.get(script, []):
            base_confidence = 0.8
        else:
            base_confidence = 0.3
        pattern = self.char_patterns.get(language)
        if pattern is not None:
            hits = len(re.findall(pattern, text))
            if hits:
                base_confidence += min(hits / self._visible_length(text), 0.2)
        return min(base_confidence, 1.0)
# Module-level wrapper retained so existing callers keep working
def detect_language(text: str) -> str:
    """Detect the language of ``text`` using a fresh IndicLanguageDetector."""
    return IndicLanguageDetector().detect_language(text)
# Detection paired with a confidence estimate
def detect_language_with_confidence(text: str) -> Tuple[str, float]:
    """Return ``(language_code, confidence)`` for ``text``.

    The confidence is computed for the language that was detected, using
    the same detector instance.
    """
    indic = IndicLanguageDetector()
    code = indic.detect_language(text)
    return code, indic.get_language_confidence(text, code)
# Helper for checking detection output against a known answer
def validate_language_detection(text: str, expected_language: str) -> Dict[str, Any]:
    """Run detection on ``text`` and compare it with ``expected_language``.

    Returns:
        A dict with the keys ``text``, ``expected``, ``detected``,
        ``confidence`` (for the detected language), ``correct`` (whether
        detected equals expected) and ``script`` (the detected script).
    """
    indic = IndicLanguageDetector()
    found = indic.detect_language(text)
    score = indic.get_language_confidence(text, found)

    report: Dict[str, Any] = {}
    report['text'] = text
    report['expected'] = expected_language
    report['detected'] = found
    report['confidence'] = score
    report['correct'] = found == expected_language
    report['script'] = indic.detect_script(text)
    return report
# Display names for language codes, used in API responses
LANGUAGE_NAMES = {
    'hi': 'Hindi',
    'bn': 'Bengali',
    'te': 'Telugu',
    'mr': 'Marathi',
    'ta': 'Tamil',
    'ur': 'Urdu',
    'gu': 'Gujarati',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'or': 'Odia',
    'pa': 'Punjabi',
    'as': 'Assamese',
    'mai': 'Maithili',
    'sa': 'Sanskrit',
    'ne': 'Nepali',
    'ks': 'Kashmiri',
    'sd': 'Sindhi',
    'brx': 'Bodo',
    'doi': 'Dogri',
    'kok': 'Konkani',
    'mni': 'Manipuri',
    'sat': 'Santali',
}


def get_language_name(language_code: str) -> str:
    """Return the display name for ``language_code`` (case-insensitive).

    Codes missing from ``LANGUAGE_NAMES`` yield ``'Unknown'``.
    """
    normalized = language_code.lower()
    if normalized in LANGUAGE_NAMES:
        return LANGUAGE_NAMES[normalized]
    return 'Unknown'
if __name__ == "__main__":
    # Manual smoke test: run detection on one sample sentence per language
    # and print the outcome next to the expected code.
    samples = {
        'hi': 'नमस्ते, आप कैसे हैं? यह हिंदी भाषा का परीक्षण है।',
        'bn': 'নমস্কার, আপনি কেমন আছেন? এটি বাংলা ভাষার পরীক্ষা।',
        'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்? இது தமிழ் மொழியின் சோதனை.',
        'te': 'నమస్కారం, మీరు ఎలా ఉన్నారు? ఇది తెలుగు భాష పరీక్ష.',
        'ml': 'നമസ്കാരം, നിങ്ങൾ എങ്ങനെയുണ്ട്? ഇത് മലയാളം ഭാഷയുടെ പരീക്ഷണം.',
        'ur': 'السلام علیکم، آپ کیسے ہیں؟ یہ اردو زبان کا امتحان ہے۔',
        'gu': 'નમસ્તે, તમે કેમ છો? આ ગુજરાતી ભાષાની કસોટી છે.',
        'kn': 'ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ? ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ.',
    }
    print("Testing IndicLanguageDetector:")
    print("=" * 60)
    detector = IndicLanguageDetector()
    for code, sample in samples.items():
        outcome = validate_language_detection(sample, code)
        verdict = '✅' if outcome['correct'] else '❌'
        print(f"\n{code.upper()} ({get_language_name(code)}):")
        print(f"Text: {sample}")
        print(f"Expected: {outcome['expected']}")
        print(f"Detected: {outcome['detected']}")
        print(f"Confidence: {outcome['confidence']:.3f}")
        print(f"Correct: {verdict}")
        print(f"Script: {outcome['script']}")
    supported = detector.get_supported_languages()
    print(f"\nSupported Languages: {len(supported)}")
    print(f"Languages: {', '.join(supported)}")