File size: 16,976 Bytes
5b79694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
"""
Enhanced language detection for ALL Indian languages
Optimized for IndicWhisper ASR pipeline with comprehensive script detection
"""
import re
from typing import Optional, Dict, List, Tuple, Any
import logging

# Safe langdetect import with fallback
try:
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0
    LANGDETECT_AVAILABLE = True
except ImportError:
    LANGDETECT_AVAILABLE = False

logger = logging.getLogger(__name__)


class IndicLanguageDetector:
    """Heuristic language detector for Indian languages.

    Detection is primarily script based: the Unicode block with the highest
    share of characters in the text selects a script, and the script selects
    a language.  Scripts shared by several languages (Devanagari, Bengali,
    Arabic) are disambiguated with small marker heuristics.  Character
    pattern scoring, ``langdetect`` (when installed) and a simple character
    fallback are used only when no known script is found.

    All language codes returned are members of ``supported_languages``.
    """

    # Devanagari disambiguation markers.  These are heuristics, not proofs:
    # they only look for evidence that the text is NOT Hindi.
    # ळ is used in Marathi but is rare in Hindi.  (झ and ञ are ordinary Hindi
    # letters - e.g. समझ, ज्ञान - so they must not be treated as Marathi-only.)
    _MARATHI_MARKERS = re.compile(r'ळ')
    # Frequent Nepali function words / suffixes that do not occur in Hindi.
    _NEPALI_MARKERS = re.compile(r'हरू|हुन्छ|छैन|तपाईं')
    # Conjunct clusters counted as a (weak) Sanskrit signal.  This must be an
    # alternation: inside [...] the "|" is a literal and every letter would
    # match on its own.
    _SANSKRIT_CONJUNCTS = re.compile(r'क्ष|त्र|ज्ञ|श्र')
    # U+09F0 (ৰ) and U+09F1 (ৱ) are the Assamese-only letters of the Bengali
    # block; খ, গ and ঘ are shared with Bengali and carry no signal.
    _ASSAMESE_MARKERS = re.compile(r'[ৰৱ]')

    def __init__(self):
        # Extended Unicode ranges for all Indian scripts
        self.script_patterns: Dict[str, str] = {
            'devanagari': r'[\u0900-\u097f]',     # Hindi, Marathi, Sanskrit, Nepali, Bodo, Dogri, Konkani, Maithili
            'bengali': r'[\u0980-\u09ff]',        # Bengali, Assamese
            'tamil': r'[\u0b80-\u0bff]',          # Tamil
            'telugu': r'[\u0c00-\u0c7f]',         # Telugu
            'malayalam': r'[\u0d00-\u0d7f]',      # Malayalam
            'kannada': r'[\u0c80-\u0cff]',        # Kannada
            'gujarati': r'[\u0a80-\u0aff]',       # Gujarati
            'punjabi': r'[\u0a00-\u0a7f]',        # Punjabi (Gurmukhi)
            'odia': r'[\u0b00-\u0b7f]',           # Odia
            'arabic': r'[\u0600-\u06ff]',         # Urdu, Kashmiri, Sindhi
            'olchiki': r'[\u1c50-\u1c7f]',        # Santali
            'meitei': r'[\uabc0-\uabff]',         # Manipuri
        }

        # Script to candidate languages; the first entry is the default
        # returned when no more specific heuristic applies.
        self.script_to_languages: Dict[str, List[str]] = {
            'devanagari': ['hi', 'mr', 'ne', 'mai', 'sa', 'brx', 'doi', 'kok'],
            'bengali': ['bn', 'as'],
            'tamil': ['ta'],
            'telugu': ['te'],
            'malayalam': ['ml'],
            'kannada': ['kn'],
            'gujarati': ['gu'],
            'punjabi': ['pa'],
            'odia': ['or'],
            'arabic': ['ur', 'ks', 'sd'],
            'olchiki': ['sat'],
            'meitei': ['mni'],
        }

        # All supported Indian languages (22 official + others)
        self.supported_languages = {
            'hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'or', 'pa', 'as',
            'mai', 'sa', 'ne', 'ks', 'sd', 'brx', 'doi', 'kok', 'mni', 'sat'
        }

        # Per-language sets of frequent characters used for scoring
        self.char_patterns: Dict[str, str] = {
            # Bengali characters
            'bn': r'[হবদকগপতনমলরস]',
            # Tamil characters
            'ta': r'[தகநலமபவசரன]',
            # Telugu characters
            'te': r'[తకనలమపవసరణ]',
            # Malayalam characters
            'ml': r'[തകനലമപവസരണ]',
            # Kannada characters
            'kn': r'[ತಕನಲಮಪವಸರಣ]',
            # Gujarati characters
            'gu': r'[તકનલમપવસરણ]',
            # Punjabi characters
            'pa': r'[ਤਕਨਲਮਪਵਸਰਣ]',
            # Odia characters
            'or': r'[ତକନଲମପଵସରଣ]',
            # Assamese characters
            'as': r'[তৰৱখগঘচছজঝ]',
            # Urdu characters
            'ur': r'[اردوپہکتنلمسرع]',
        }

        # Multipliers applied to character-pattern scores; languages that
        # are not listed fall back to 0.5 in detect_with_character_patterns.
        self.language_weights: Dict[str, float] = {
            'hi': 1.0,   # Hindi (highest priority for Devanagari)
            'bn': 0.9,   # Bengali
            'te': 0.9,   # Telugu
            'mr': 0.8,   # Marathi
            'ta': 0.9,   # Tamil
            'ur': 0.8,   # Urdu
            'gu': 0.8,   # Gujarati
            'kn': 0.8,   # Kannada
            'ml': 0.8,   # Malayalam
            'or': 0.7,   # Odia
            'pa': 0.7,   # Punjabi
            'as': 0.7,   # Assamese
        }

    @staticmethod
    def _density(match_count: int, text: str) -> float:
        """Return ``match_count`` as a fraction of the non-space characters.

        Only the ASCII space is excluded from the denominator; punctuation
        and other whitespace still count.  Returns 0.0 for text that has no
        non-space characters instead of dividing by zero.
        """
        non_space = len(text.replace(' ', ''))
        return match_count / non_space if non_space else 0.0

    def detect_script(self, text: str) -> Optional[str]:
        """Return the name of the dominant script in ``text``.

        Each script in ``script_patterns`` is scored by the share of
        non-space characters that fall in its Unicode range; the best
        scoring script wins (first one defined on a tie).  Returns ``None``
        for empty/blank text or when no known script occurs at all.
        """
        if not text or not text.strip():
            return None

        script_scores: Dict[str, float] = {}
        for script, pattern in self.script_patterns.items():
            hits = len(re.findall(pattern, text))
            if hits:
                script_scores[script] = self._density(hits, text)

        if not script_scores:
            return None
        return max(script_scores, key=script_scores.get)

    def detect_language_from_script(self, text: str) -> Optional[str]:
        """Map the dominant script of ``text`` to its most likely language.

        Returns ``None`` when no known script is detected.  Single-language
        scripts map directly; Devanagari, Bengali and Arabic scripts are
        passed to their disambiguation helpers.
        """
        script = self.detect_script(text)
        if not script or script not in self.script_to_languages:
            return None

        possible_languages = self.script_to_languages[script]

        # For single language scripts, return immediately
        if len(possible_languages) == 1:
            return possible_languages[0]

        # For multi-language scripts, use marker heuristics
        if script == 'devanagari':
            return self._detect_devanagari_language(text, possible_languages)
        if script == 'bengali':
            return self._detect_bengali_script_language(text, possible_languages)
        if script == 'arabic':
            return self._detect_arabic_script_language(text, possible_languages)

        # Default to first (most common) language for that script
        return possible_languages[0]

    def _detect_devanagari_language(self, text: str, candidates: List[str]) -> str:
        """Pick between Hindi, Marathi, Nepali and Sanskrit for Devanagari text.

        Checks run in order Marathi, Nepali, Sanskrit; Hindi is returned
        when none of the markers is present.  ``candidates`` is accepted for
        signature symmetry with the other helpers and is not consulted.
        """
        if self._MARATHI_MARKERS.search(text):
            return 'mr'

        if self._NEPALI_MARKERS.search(text):
            return 'ne'

        # Require several conjuncts: a single क्ष or त्र is common in Hindi too.
        if len(self._SANSKRIT_CONJUNCTS.findall(text)) > 2:
            return 'sa'

        # Default to Hindi for Devanagari
        return 'hi'

    def _detect_bengali_script_language(self, text: str, candidates: List[str]) -> str:
        """Return ``'as'`` if an Assamese-only letter occurs, else ``'bn'``.

        ``candidates`` is not consulted.
        """
        if self._ASSAMESE_MARKERS.search(text):
            return 'as'

        # Default to Bengali
        return 'bn'

    def _detect_arabic_script_language(self, text: str, candidates: List[str]) -> str:
        """Return the language for Arabic-script text (currently always Urdu).

        Kashmiri and Sindhi are listed as candidates for this script but no
        distinguishing patterns are implemented; ``candidates`` is unused.
        """
        return 'ur'

    def detect_with_langdetect(self, text: str) -> Optional[str]:
        """Detect the language with the optional ``langdetect`` package.

        Returns the detected code only if it is in ``supported_languages``.
        Returns ``None`` when langdetect is not installed, the stripped text
        is shorter than 10 characters, langdetect reports another language,
        or any error occurs (errors are logged as warnings, never raised).
        """
        if not LANGDETECT_AVAILABLE:
            logger.warning("langdetect not available, skipping")
            return None

        try:
            if len(text.strip()) < 10:
                return None

            detected = detect(text)
        except Exception as e:
            # Deliberately broad: this is a best-effort fallback and must not
            # break the detection pipeline.
            logger.warning("langdetect failed: %s", e)
            return None

        # Only return if it's a supported Indian language
        if detected in self.supported_languages:
            logger.debug("langdetect successful: %s", detected)
            return detected

        logger.debug("langdetect returned non-Indian language: %s", detected)
        return None

    def detect_with_character_patterns(self, text: str) -> Optional[str]:
        """Score ``text`` against ``char_patterns`` and return the best language.

        A language's score is the share of non-space characters matching its
        pattern, multiplied by its entry in ``language_weights`` (0.5 when
        absent).  Returns ``None`` for text whose stripped length is below 5
        or when no pattern matches.
        """
        if not text or len(text.strip()) < 5:
            return None

        language_scores: Dict[str, float] = {}
        for lang_code, pattern in self.char_patterns.items():
            hits = len(re.findall(pattern, text))
            if hits:
                weight = self.language_weights.get(lang_code, 0.5)
                language_scores[lang_code] = self._density(hits, text) * weight

        if not language_scores:
            return None

        best_lang = max(language_scores, key=language_scores.get)
        logger.debug(
            "Character pattern detection: %s (score: %.3f)",
            best_lang, language_scores[best_lang],
        )
        return best_lang

    def detect_language(self, text: str) -> str:
        """Detect the language of ``text``; never returns ``None``.

        Strategies are tried in order and the first result wins:
        1. Script-based detection (most reliable for Indic)
        2. Character pattern matching
        3. langdetect (if installed)
        4. Simple character fallback
        5. Default to Hindi (``'hi'``), also used for empty/blank input
        """
        if not text or not text.strip():
            return 'hi'  # Default to Hindi

        cleaned_text = text.strip()

        # Strategy 1: Script-based detection (most reliable for Indic)
        script_lang = self.detect_language_from_script(cleaned_text)
        if script_lang:
            logger.debug("Script-based detection: %s", script_lang)
            return script_lang

        # Strategy 2: Character pattern matching
        pattern_lang = self.detect_with_character_patterns(cleaned_text)
        if pattern_lang:
            logger.debug("Pattern-based detection: %s", pattern_lang)
            return pattern_lang

        # Strategy 3: langdetect fallback (if available)
        langdetect_result = self.detect_with_langdetect(cleaned_text)
        if langdetect_result:
            logger.debug("langdetect result: %s", langdetect_result)
            return langdetect_result

        # Strategy 4: Fallback based on common characters
        fallback_lang = self._fallback_detection(cleaned_text)
        if fallback_lang:
            logger.debug("Fallback detection: %s", fallback_lang)
            return fallback_lang

        # Final fallback: Default to Hindi
        logger.debug("Using default language: Hindi")
        return 'hi'

    def _fallback_detection(self, text: str) -> Optional[str]:
        """Return the first language whose sample characters occur in ``text``.

        Languages are checked in a fixed order; returns ``None`` when none
        of the sample characters is present.
        """
        samples = (
            ('হবদকগপ', "bn"),   # Bengali chars
            ('தகநலம', "ta"),    # Tamil chars
            ('తకనలమ', "te"),    # Telugu chars
            ('തകനലമ', "ml"),    # Malayalam chars
            ('ತಕನಲಮ', "kn"),    # Kannada chars
            ('તકનલમ', "gu"),    # Gujarati chars
            ('ਤਕਨਲਮ', "pa"),    # Punjabi chars
            ('ତକନଲମ', "or"),    # Odia chars
            ('اردوپہک', "ur"),   # Urdu chars
        )
        for chars, lang_code in samples:
            if any(char in text for char in chars):
                return lang_code

        return None

    def get_supported_languages(self) -> List[str]:
        """Return the supported language codes in sorted order."""
        return sorted(self.supported_languages)

    def get_language_confidence(self, text: str, language: str) -> float:
        """Return a heuristic confidence in [0.0, 1.0] that ``text`` is ``language``.

        0.0 for empty text or an unsupported language; 0.1 when no known
        script is found.  Otherwise 0.8 if ``language`` is a candidate for
        the detected script (0.3 if not), plus up to 0.2 for matches against
        the language's character pattern, capped at 1.0.
        """
        if not text or language not in self.supported_languages:
            return 0.0

        script = self.detect_script(text)
        if not script:
            return 0.1

        # Check if language matches detected script
        if language in self.script_to_languages.get(script, []):
            base_confidence = 0.8
        else:
            base_confidence = 0.3

        # Boost confidence with character pattern matching
        pattern = self.char_patterns.get(language)
        if pattern is not None:
            hits = len(re.findall(pattern, text))
            if hits:
                base_confidence += min(self._density(hits, text), 0.2)

        return min(base_confidence, 1.0)


# Standalone function for backward compatibility
def detect_language(text: str) -> str:
    """Return the language code for ``text`` using a fresh detector.

    Module-level convenience wrapper, kept for backward compatibility; it
    builds an ``IndicLanguageDetector`` on every call and delegates to its
    ``detect_language`` method.
    """
    return IndicLanguageDetector().detect_language(text)


# Enhanced detection with confidence
def detect_language_with_confidence(text: str) -> Tuple[str, float]:
    """Return ``(language_code, confidence)`` for ``text``.

    The code comes from ``IndicLanguageDetector.detect_language`` and the
    confidence from ``get_language_confidence`` on the same detector.
    """
    engine = IndicLanguageDetector()
    code = engine.detect_language(text)
    return code, engine.get_language_confidence(text, code)


# Validation and testing functions
def validate_language_detection(text: str, expected_language: str) -> Dict[str, Any]:
    """Run detection on ``text`` and compare it with ``expected_language``.

    Returns a dict with the keys ``text``, ``expected``, ``detected``,
    ``confidence``, ``correct`` (detected == expected) and ``script``.
    """
    engine = IndicLanguageDetector()
    guess = engine.detect_language(text)

    report: Dict[str, Any] = {
        'text': text,
        'expected': expected_language,
        'detected': guess,
    }
    report['confidence'] = engine.get_language_confidence(text, guess)
    report['correct'] = guess == expected_language
    report['script'] = engine.detect_script(text)
    return report


# Language metadata for API responses
# Language code -> human-readable English name, one pair per language.
LANGUAGE_NAMES = dict((
    ('hi', 'Hindi'),
    ('bn', 'Bengali'),
    ('te', 'Telugu'),
    ('mr', 'Marathi'),
    ('ta', 'Tamil'),
    ('ur', 'Urdu'),
    ('gu', 'Gujarati'),
    ('kn', 'Kannada'),
    ('ml', 'Malayalam'),
    ('or', 'Odia'),
    ('pa', 'Punjabi'),
    ('as', 'Assamese'),
    ('mai', 'Maithili'),
    ('sa', 'Sanskrit'),
    ('ne', 'Nepali'),
    ('ks', 'Kashmiri'),
    ('sd', 'Sindhi'),
    ('brx', 'Bodo'),
    ('doi', 'Dogri'),
    ('kok', 'Konkani'),
    ('mni', 'Manipuri'),
    ('sat', 'Santali'),
))


def get_language_name(language_code: str) -> str:
    """Return the English name for ``language_code`` (case-insensitive).

    Codes missing from ``LANGUAGE_NAMES`` yield ``'Unknown'``.
    """
    normalized = language_code.lower()
    if normalized in LANGUAGE_NAMES:
        return LANGUAGE_NAMES[normalized]
    return 'Unknown'


if __name__ == "__main__":
    # Manual smoke test: run detection on one sample sentence per language
    # and print the validation report for each.
    samples = {
        'hi': 'नमस्ते, आप कैसे हैं? यह हिंदी भाषा का परीक्षण है।',
        'bn': 'নমস্কার, আপনি কেমন আছেন? এটি বাংলা ভাষার পরীক্ষা।',
        'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்? இது தமிழ் மொழியின் சோதனை.',
        'te': 'నమస్కారం, మీరు ఎలా ఉన్నారు? ఇది తెలుగు భాష పరీక్ష.',
        'ml': 'നമസ്കാരം, നിങ്ങൾ എങ്ങനെയുണ്ട്? ഇത് മലയാളം ഭാഷയുടെ പരീക്ഷണം.',
        'ur': 'السلام علیکم، آپ کیسے ہیں؟ یہ اردو زبان کا امتحان ہے۔',
        'gu': 'નમસ્તે, તમે કેમ છો? આ ગુજરાતી ભાષાની કસોટી છે.',
        'kn': 'ನಮಸ್ಕಾರ, ನೀವು ಹೇಗಿದ್ದೀರಿ? ಇದು ಕನ್ನಡ ಭಾಷೆಯ ಪರೀಕ್ಷೆ.',
    }

    print("Testing IndicLanguageDetector:")
    print("=" * 60)

    detector = IndicLanguageDetector()

    for lang_code, sample in samples.items():
        report = validate_language_detection(sample, lang_code)
        verdict = '✅' if report['correct'] else '❌'

        print(f"\n{lang_code.upper()} ({get_language_name(lang_code)}):")
        print(f"Text: {sample}")
        print(f"Expected: {report['expected']}")
        print(f"Detected: {report['detected']}")
        print(f"Confidence: {report['confidence']:.3f}")
        print(f"Correct: {verdict}")
        print(f"Script: {report['script']}")

    # The supported-language list is needed twice below; fetch it once.
    supported = detector.get_supported_languages()
    print(f"\nSupported Languages: {len(supported)}")
    print(f"Languages: {', '.join(supported)}")