"""Complete normalizers for ALL 22+ scheduled Indian languages.

Enhanced with proper script-specific normalization and IndicWhisper
compatibility. Integrated with the IndicNLP library for production use.
"""
import re
import unicodedata
from typing import Dict, Optional, Union
import logging

# Enhanced IndicNLP integration (optional third-party dependency).
try:
    from indic_nlp import common
    from indic_nlp.normalize.indic_normalize import IndicNormalizerFactory
    INDIC_NLP_AVAILABLE = True
except ImportError:
    INDIC_NLP_AVAILABLE = False

logger = logging.getLogger(__name__)

# Devanagari virama (halant, U+094D).  OCR/ASR pipelines occasionally emit it
# inside text of *other* Indic scripts; the script-specific normalizers below
# rewrite it to the script's own virama.
_DEVANAGARI_VIRAMA = '\u094d'


class BaseIndicNormalizer:
    """Enhanced base class for Indic language normalizers with IndicNLP integration."""

    def __init__(self, language_code: str):
        self.language_code = language_code
        # Common patterns
        self.extra_spaces = re.compile(r'\s+')
        # NOTE: collapses both danda (।) and double danda (॥) to a single danda.
        self.punctuation_normalize = re.compile(r'[।॥]')
        # Initialize IndicNLP normalizer if available; failure is non-fatal.
        self.indic_normalizer = None
        if INDIC_NLP_AVAILABLE:
            try:
                factory = IndicNormalizerFactory()
                self.indic_normalizer = factory.get_normalizer(language_code, remove_nuktas=False)
            except Exception as e:
                logger.warning(f"Could not initialize IndicNLP normalizer for {language_code}: {e}")

    def normalize(self, text: str) -> str:
        """Enhanced normalization with IndicNLP integration.

        Returns "" for empty/whitespace-only input; otherwise applies
        (optional) IndicNLP normalization, NFC composition, whitespace
        collapsing and danda normalization.
        """
        if not text or not text.strip():
            return ""

        # First try IndicNLP normalization if available (never for English).
        if self.indic_normalizer and self.language_code != 'en':
            try:
                text = self.indic_normalizer.normalize(text)
            except Exception as e:
                logger.warning(f"IndicNLP normalization failed for {self.language_code}: {e}")

        # Canonical composition so script-specific patterns see composed forms.
        text = unicodedata.normalize('NFC', text)

        # Basic cleanup
        text = text.strip()
        text = self.extra_spaces.sub(' ', text)
        text = self.punctuation_normalize.sub('।', text)
        return text


def _virama_fix_pattern(script_range: str) -> 're.Pattern':
    """Compile a pattern matching a script letter followed by a stray
    Devanagari virama.

    BUG FIX: the original code built ``'([' + script_range + '])'`` where
    ``script_range`` is *already* a bracketed class, yielding a character
    class containing a literal ``[`` plus a mandatory literal ``]`` in the
    text — the pattern could never match normal text.
    """
    return re.compile('(' + script_range + ')' + _DEVANAGARI_VIRAMA)


# DRAVIDIAN LANGUAGES
class MalayalamNormalizer(BaseIndicNormalizer):
    """Malayalam-specific normalizer preserving chillu forms and complex conjuncts."""

    def __init__(self):
        super().__init__('ml')
        self.malayalam_range = r'[\u0d00-\u0d7f]'
        # Atomic chillu code point -> decomposed (base + virama + ZWJ) sequence.
        # (Critical for proper Malayalam rendering.)
        self.chillu_forms = {
            '\u0d7a': 'ണ്\u200d',  # chillu nn
            '\u0d7b': 'ന്\u200d',  # chillu n
            '\u0d7c': 'ര്\u200d',  # chillu rr
            '\u0d7d': 'ല്\u200d',  # chillu l
            '\u0d7e': 'ള്\u200d',  # chillu ll
            '\u0d7f': 'ക്\u200d',  # chillu k
        }
        # Kept for API compatibility; the identity substitution that used it
        # in the original was a no-op and has been removed.
        self.conjunct_pattern = re.compile(
            '(' + self.malayalam_range + ')\u0d4d(' + self.malayalam_range + ')'
        )

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Fold decomposed chillu sequences into their atomic chillu characters
        # (the original loop's variable names were swapped, but the direction
        # of the replacement — decomposed -> atomic — is preserved here).
        for chillu_char, decomposed in self.chillu_forms.items():
            text = text.replace(decomposed, chillu_char)
        return text


class TamilNormalizer(BaseIndicNormalizer):
    """Tamil-specific normalizer with proper pulli handling."""

    def __init__(self):
        super().__init__('ta')
        self.tamil_range = r'[\u0b80-\u0bff]'
        # BUG FIX: original searched for the *Devanagari* virama inside a
        # malformed character class.  Now matches a Tamil letter followed by
        # a stray Devanagari virama, to be rewritten as the Tamil pulli.
        self.pulli_pattern = _virama_fix_pattern(self.tamil_range)

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Rewrite stray Devanagari virama after a Tamil letter to pulli (்).
        text = self.pulli_pattern.sub('\\1\u0bcd', text)
        # (The original vowel-sign substitution replaced each match with
        # itself — a no-op — and has been removed.)
        return text


class TeluguNormalizer(BaseIndicNormalizer):
    """Telugu-specific normalizer with proper halant handling."""

    def __init__(self):
        super().__init__('te')
        self.telugu_range = r'[\u0c00-\u0c7f]'
        # BUG FIX: see _virama_fix_pattern.
        self.halant_pattern = _virama_fix_pattern(self.telugu_range)

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Rewrite stray Devanagari virama after a Telugu letter to ్ (U+0C4D).
        text = self.halant_pattern.sub('\\1\u0c4d', text)
        return text


class KannadaNormalizer(BaseIndicNormalizer):
    """Kannada-specific normalizer with script preservation."""

    def __init__(self):
        super().__init__('kn')
        self.kannada_range = r'[\u0c80-\u0cff]'
        # BUG FIX: see _virama_fix_pattern.
        self.halant_pattern = _virama_fix_pattern(self.kannada_range)

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Rewrite stray Devanagari virama after a Kannada letter to ್ (U+0CCD).
        text = self.halant_pattern.sub('\\1\u0ccd', text)
        return text


# INDO-ARYAN LANGUAGES (Devanagari Script)
class HindiNormalizer(BaseIndicNormalizer):
    """Enhanced Hindi/Devanagari normalizer for IndicWhisper."""

    def __init__(self):
        super().__init__('hi')
        self.devanagari_range = r'[\u0900-\u097f]'
        # Kept for API compatibility; the identity substitutions that used
        # these in the original were no-ops and have been removed.
        self.vowel_signs = re.compile(
            '(' + self.devanagari_range + ')([ािीुूृेैोौ])'
        )
        self.conjunct_pattern = re.compile(
            '(' + self.devanagari_range + ')\u094d(' + self.devanagari_range + ')'
        )

    def normalize(self, text: str) -> str:
        # Devanagari conjuncts, matras and nukta combinations are already in
        # canonical form after the base class's NFC step; the original
        # conjunct/matra/nukta substitutions replaced each match with itself
        # (no-ops) and have been removed.
        return super().normalize(text)


class MarathiNormalizer(BaseIndicNormalizer):
    """Marathi-specific Devanagari normalizer."""

    def __init__(self):
        super().__init__('mr')
        self.devanagari_range = r'[\u0900-\u097f]'

    def normalize(self, text: str) -> str:
        # The original looped over Marathi conjuncts (क्ष, त्र, ज्ञ, श्र) doing
        # re.sub(conjunct, conjunct, text) — a pure no-op, removed here.
        return super().normalize(text)


class SanskritNormalizer(BaseIndicNormalizer):
    """Sanskrit normalizer with classical Devanagari handling."""

    def __init__(self):
        super().__init__('sa')


class NepaliNormalizer(BaseIndicNormalizer):
    """Nepali normalizer using Devanagari script."""

    def __init__(self):
        super().__init__('ne')


# EASTERN INDO-ARYAN
class BengaliNormalizer(BaseIndicNormalizer):
    """Enhanced Bengali normalizer with proper script handling."""

    def __init__(self):
        super().__init__('bn')
        self.bengali_range = r'[\u0980-\u09ff]'
        # BUG FIX: see _virama_fix_pattern.
        self.halant_pattern = _virama_fix_pattern(self.bengali_range)
        # Kept for API compatibility; its identity substitution was a no-op.
        self.vowel_signs = re.compile(
            '(' + self.bengali_range + ')([ািীুূৃেৈোৌ])'
        )

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Rewrite stray Devanagari virama after a Bengali letter to ্ (U+09CD).
        text = self.halant_pattern.sub('\\1\u09cd', text)
        # (The original vowel-sign and conjunct substitutions replaced each
        # match with itself — no-ops — and have been removed.)
        return text


class AssameeseNormalizer(BaseIndicNormalizer):
    """Assamese normalizer (Bengali script variant).

    NOTE: class name spelling ("Assameese") kept for backward compatibility;
    prefer the ``AssameseNormalizer`` alias defined below.
    """

    def __init__(self):
        super().__init__('as')
        self.assamese_range = r'[\u0980-\u09ff]'
        # BUG FIX: see _virama_fix_pattern.
        self.halant_pattern = _virama_fix_pattern(self.assamese_range)

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        text = self.halant_pattern.sub('\\1\u09cd', text)
        return text


# Correctly spelled alias (backward-compatible addition).
AssameseNormalizer = AssameeseNormalizer


class OdiaNormalizer(BaseIndicNormalizer):
    """Odia normalizer with proper script handling."""

    def __init__(self):
        super().__init__('or')
        self.odia_range = r'[\u0b00-\u0b7f]'
        # BUG FIX: see _virama_fix_pattern.
        self.halant_pattern = _virama_fix_pattern(self.odia_range)

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Rewrite stray Devanagari virama after an Odia letter to ୍ (U+0B4D).
        text = self.halant_pattern.sub('\\1\u0b4d', text)
        return text


# WESTERN INDO-ARYAN
class GujaratiNormalizer(BaseIndicNormalizer):
    """Gujarati normalizer with proper script handling."""

    def __init__(self):
        super().__init__('gu')
        self.gujarati_range = r'[\u0a80-\u0aff]'
        # BUG FIX: see _virama_fix_pattern.
        self.halant_pattern = _virama_fix_pattern(self.gujarati_range)

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Rewrite stray Devanagari virama after a Gujarati letter to ્ (U+0ACD).
        text = self.halant_pattern.sub('\\1\u0acd', text)
        return text


class PunjabiNormalizer(BaseIndicNormalizer):
    """Punjabi normalizer for Gurmukhi script."""

    def __init__(self):
        super().__init__('pa')
        self.punjabi_range = r'[\u0a00-\u0a7f]'

    def normalize(self, text: str) -> str:
        # The original vowel-sign substitution replaced each match with
        # itself (a no-op) and has been removed.
        return super().normalize(text)


class SindhiNormalizer(BaseIndicNormalizer):
    """Sindhi normalizer (Arabic script)."""

    def __init__(self):
        super().__init__('sd')


# PERSO-ARABIC SCRIPT
class UrduNormalizer(BaseIndicNormalizer):
    """Enhanced Urdu normalizer for Arabic script."""

    def __init__(self):
        super().__init__('ur')
        self.arabic_range = r'[\u0600-\u06ff]'
        self.urdu_range = r'[\u0620-\u065f\u06a0-\u06ef]'


class KashmiriNormalizer(BaseIndicNormalizer):
    """Kashmiri normalizer (Arabic script)."""

    def __init__(self):
        super().__init__('ks')


# TIBETO-BURMAN AND OTHERS
class BodoNormalizer(BaseIndicNormalizer):
    """Bodo normalizer (Devanagari script)."""

    def __init__(self):
        super().__init__('brx')


class SantaliNormalizer(BaseIndicNormalizer):
    """Santali normalizer (Ol Chiki script)."""

    def __init__(self):
        super().__init__('sat')
        self.olchiki_range = r'[\u1c50-\u1c7f]'


class ManipuriNormalizer(BaseIndicNormalizer):
    """Manipuri/Meitei normalizer (Meitei Mayek script)."""

    def __init__(self):
        super().__init__('mni')
        self.meitei_range = r'[\uabc0-\uabff]'


class DogriNormalizer(BaseIndicNormalizer):
    """Dogri normalizer (Devanagari script)."""

    def __init__(self):
        super().__init__('doi')


class KonkaniNormalizer(BaseIndicNormalizer):
    """Konkani normalizer (Devanagari script)."""

    def __init__(self):
        super().__init__('kok')


class MaithiliNormalizer(BaseIndicNormalizer):
    """Maithili normalizer (Devanagari script)."""

    def __init__(self):
        super().__init__('mai')


# COMPLETE NORMALIZER MAPPING
NORMALIZERS = {
    'ml': MalayalamNormalizer,
    'ta': TamilNormalizer,
    'te': TeluguNormalizer,
    'kn': KannadaNormalizer,
    'hi': HindiNormalizer,
    'mr': MarathiNormalizer,
    'sa': SanskritNormalizer,
    'ne': NepaliNormalizer,
    'brx': BodoNormalizer,
    'doi': DogriNormalizer,
    'kok': KonkaniNormalizer,
    'mai': MaithiliNormalizer,
    'bn': BengaliNormalizer,
    'as': AssameeseNormalizer,
    'or': OdiaNormalizer,
    'gu': GujaratiNormalizer,
    'pa': PunjabiNormalizer,
    'sd': SindhiNormalizer,
    'ur': UrduNormalizer,
    'ks': KashmiriNormalizer,
    'sat': SantaliNormalizer,
    'mni': ManipuriNormalizer,
    'en': BaseIndicNormalizer,
}


def get_normalizer(language_code: str) -> BaseIndicNormalizer:
    """Get the appropriate normalizer with enhanced error handling.

    Falls back to ``BaseIndicNormalizer`` for unknown codes and defaults
    to Hindi when no code is given.
    """
    if not language_code:
        return BaseIndicNormalizer('hi')

    normalizer_class = NORMALIZERS.get(language_code.lower(), BaseIndicNormalizer)
    try:
        # The base class takes the language code; subclasses hard-code theirs.
        if normalizer_class is BaseIndicNormalizer:
            return normalizer_class(language_code)
        return normalizer_class()
    except Exception as e:
        logger.warning(f"Normalizer initialization failed for {language_code}: {e}")
        return BaseIndicNormalizer(language_code)


# Standalone functions for backward compatibility
def normalize_hindi(text: str) -> str:
    """Standalone Hindi normalization function."""
    return HindiNormalizer().normalize(text)


def normalize_bengali(text: str) -> str:
    """Standalone Bengali normalization function."""
    return BengaliNormalizer().normalize(text)


def normalize_tamil(text: str) -> str:
    """Standalone Tamil normalization function."""
    return TamilNormalizer().normalize(text)


def normalize_telugu(text: str) -> str:
    """Standalone Telugu normalization function."""
    return TeluguNormalizer().normalize(text)


def normalize_malayalam(text: str) -> str:
    """Standalone Malayalam normalization function."""
    return MalayalamNormalizer().normalize(text)


def normalize_kannada(text: str) -> str:
    """Standalone Kannada normalization function."""
    return KannadaNormalizer().normalize(text)


def normalize_gujarati(text: str) -> str:
    """Standalone Gujarati normalization function."""
    return GujaratiNormalizer().normalize(text)


def normalize_punjabi(text: str) -> str:
    """Standalone Punjabi normalization function."""
    return PunjabiNormalizer().normalize(text)


def normalize_marathi(text: str) -> str:
    """Standalone Marathi normalization function."""
    return MarathiNormalizer().normalize(text)


def normalize_odia(text: str) -> str:
    """Standalone Odia normalization function."""
    return OdiaNormalizer().normalize(text)


def normalize_urdu(text: str) -> str:
    """Standalone Urdu normalization function."""
    return UrduNormalizer().normalize(text)


# Language metadata
LANGUAGE_INFO = {
    'hi': {'name': 'Hindi', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '600M+'},
    'bn': {'name': 'Bengali', 'script': 'Bengali', 'family': 'Indo-Aryan', 'speakers': '300M+'},
    'te': {'name': 'Telugu', 'script': 'Telugu', 'family': 'Dravidian', 'speakers': '95M+'},
    'mr': {'name': 'Marathi', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '90M+'},
    'ta': {'name': 'Tamil', 'script': 'Tamil', 'family': 'Dravidian', 'speakers': '80M+'},
    'ur': {'name': 'Urdu', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '70M+'},
    'gu': {'name': 'Gujarati', 'script': 'Gujarati', 'family': 'Indo-Aryan', 'speakers': '60M+'},
    'kn': {'name': 'Kannada', 'script': 'Kannada', 'family': 'Dravidian', 'speakers': '50M+'},
    'ml': {'name': 'Malayalam', 'script': 'Malayalam', 'family': 'Dravidian', 'speakers': '40M+'},
    'or': {'name': 'Odia', 'script': 'Odia', 'family': 'Indo-Aryan', 'speakers': '40M+'},
    'pa': {'name': 'Punjabi', 'script': 'Gurmukhi', 'family': 'Indo-Aryan', 'speakers': '35M+'},
    'as': {'name': 'Assamese', 'script': 'Bengali', 'family': 'Indo-Aryan', 'speakers': '15M+'},
    'mai': {'name': 'Maithili', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '13M+'},
    'sa': {'name': 'Sanskrit', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': 'Classical'},
    'ne': {'name': 'Nepali', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '17M+'},
    'ks': {'name': 'Kashmiri', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '7M+'},
    'sd': {'name': 'Sindhi', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '3M+'},
    'brx': {'name': 'Bodo', 'script': 'Devanagari', 'family': 'Tibeto-Burman', 'speakers': '1.5M+'},
    'doi': {'name': 'Dogri', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '2.5M+'},
    'kok': {'name': 'Konkani', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '2M+'},
    'mni': {'name': 'Manipuri', 'script': 'Meitei Mayek', 'family': 'Tibeto-Burman', 'speakers': '1.8M+'},
    'sat': {'name': 'Santali', 'script': 'Ol Chiki', 'family': 'Austroasiatic', 'speakers': '7M+'},
    'en': {'name': 'English', 'script': 'Latin', 'family': 'Germanic', 'speakers': 'Global'},
}


def get_language_info(language_code: str) -> Dict[str, str]:
    """Get comprehensive language information (Unknowns for unmapped codes)."""
    return LANGUAGE_INFO.get(language_code.lower(), {
        'name': 'Unknown', 'script': 'Unknown',
        'family': 'Unknown', 'speakers': 'Unknown'
    })


def get_supported_languages() -> Dict[str, str]:
    """Get list of all supported languages."""
    return {code: info['name'] for code, info in LANGUAGE_INFO.items()}


if __name__ == "__main__":
    # Test normalization
    test_texts = {
        'hi': 'नमस्ते, आप कैसे हैं?',
        'bn': 'নমস্কার, আপনি কেমন আছেন?',
        'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?',
    }
    print("Testing Enhanced Normalizers:")
    print("=" * 50)
    for lang_code, text in test_texts.items():
        normalizer = get_normalizer(lang_code)
        normalized = normalizer.normalize(text)
        print(f"\n{lang_code.upper()}: {normalized}")