"""
Normalizers for all 22 scheduled Indian languages (plus English).

Provides script-specific normalization on top of Unicode NFC, targets
IndicWhisper compatibility, and uses the Indic NLP Library when it is
installed.
"""

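# Illustrative usage (the importable module name `normalizers` is an assumption,
# not something this file fixes):
#
#     from normalizers import get_normalizer, get_language_info
#
#     normalizer = get_normalizer('hi')
#     clean = normalizer.normalize('नमस्ते,   आप   कैसे हैं?')
#     info = get_language_info('hi')  # {'name': 'Hindi', 'script': 'Devanagari', ...}
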
import re
import unicodedata
import logging
from typing import Dict

# The Indic NLP Library ships as the `indicnlp` package (pip: indic-nlp-library).
try:
    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    INDIC_NLP_AVAILABLE = True
except ImportError:
    INDIC_NLP_AVAILABLE = False

logger = logging.getLogger(__name__)

class BaseIndicNormalizer:
    """Base class for Indic language normalizers, with optional IndicNLP integration"""

    def __init__(self, language_code: str):
        self.language_code = language_code

        # Whitespace collapsing and danda unification (॥ -> ।).
        self.extra_spaces = re.compile(r'\s+')
        self.punctuation_normalize = re.compile(r'[।॥]')

        # Use the Indic NLP Library normalizer for this language when available.
        self.indic_normalizer = None
        if INDIC_NLP_AVAILABLE:
            try:
                factory = IndicNormalizerFactory()
                self.indic_normalizer = factory.get_normalizer(language_code, remove_nuktas=False)
            except Exception as e:
                logger.warning(f"Could not initialize IndicNLP normalizer for {language_code}: {e}")

    def normalize(self, text: str) -> str:
        """Apply the IndicNLP pass (if available), NFC, whitespace, and danda normalization"""
        if not text or not text.strip():
            return ""

        # Script-aware normalization from the Indic NLP Library (skipped for English).
        if self.indic_normalizer and self.language_code != 'en':
            try:
                text = self.indic_normalizer.normalize(text)
            except Exception as e:
                logger.warning(f"IndicNLP normalization failed for {self.language_code}: {e}")

        # Canonical Unicode composition.
        text = unicodedata.normalize('NFC', text)

        # Collapse whitespace and unify danda/double danda.
        text = text.strip()
        text = self.extra_spaces.sub(' ', text)
        text = self.punctuation_normalize.sub('।', text)

        return text

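# Illustrative base behavior (assuming the Indic NLP Library is not installed,
# so only the NFC / whitespace / danda passes run):
#     BaseIndicNormalizer('hi').normalize('  नमस्ते   दुनिया ॥ ')  ->  'नमस्ते दुनिया ।'
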
class MalayalamNormalizer(BaseIndicNormalizer):
    """Malayalam-specific normalizer that canonicalizes chillu letters and conjuncts"""

    def __init__(self):
        super().__init__('ml')
        # Bare code-point range (no brackets) so it can be embedded in character classes.
        self.malayalam_range = r'\u0d00-\u0d7f'

        # Atomic chillu code points mapped to their legacy "consonant + virama + ZWJ" spellings.
        self.chillu_forms = {
            '\u0d7a': 'ണ്\u200d',  # chillu nn
            '\u0d7b': 'ന്\u200d',  # chillu n
            '\u0d7c': 'ര്\u200d',  # chillu rr
            '\u0d7d': 'ല്\u200d',  # chillu l
            '\u0d7e': 'ള്\u200d',  # chillu ll
            '\u0d7f': 'ക്\u200d',  # chillu k
        }

        self.conjunct_pattern = re.compile(r'([' + self.malayalam_range + r'])്([' + self.malayalam_range + r'])')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace legacy chillu sequences with the atomic chillu characters.
        for atomic, sequence in self.chillu_forms.items():
            text = text.replace(sequence, atomic)

        # Identity rewrite of consonant + virama + consonant conjuncts; kept as a
        # hook for conjunct-specific handling.
        text = self.conjunct_pattern.sub(r'\1്\2', text)

        return text

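# Illustrative: 'അവന്\u200d' (NA + virama + ZWJ) becomes 'അവൻ' (atomic chillu n),
# assuming the optional IndicNLP pass leaves the ZWJ in place.
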
class TamilNormalizer(BaseIndicNormalizer):
    """Tamil-specific normalizer with proper pulli handling"""

    def __init__(self):
        super().__init__('ta')
        self.tamil_range = r'\u0b80-\u0bff'
        # Matches a Tamil letter followed by a stray Devanagari virama (U+094D).
        self.pulli_pattern = re.compile(r'([' + self.tamil_range + r'])' + '\u094d')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace a mis-encoded Devanagari virama with the Tamil pulli (U+0BCD).
        text = self.pulli_pattern.sub(r'\1்', text)

        # Identity pass over consonant + dependent-vowel pairs; kept as a hook for
        # vowel-sign-specific handling.
        text = re.sub(r'([' + self.tamil_range + r'])([ாிீுூெேைொோௌ])', r'\1\2', text)

        return text

class TeluguNormalizer(BaseIndicNormalizer):
    """Telugu-specific normalizer with proper halant handling"""

    def __init__(self):
        super().__init__('te')
        self.telugu_range = r'\u0c00-\u0c7f'
        # Matches a Telugu letter followed by a stray Devanagari virama (U+094D).
        self.halant_pattern = re.compile(r'([' + self.telugu_range + r'])' + '\u094d')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace a mis-encoded Devanagari virama with the Telugu virama (U+0C4D).
        text = self.halant_pattern.sub(r'\1్', text)

        # Identity pass over consonant + dependent-vowel pairs (hook for future handling).
        text = re.sub(r'([' + self.telugu_range + r'])([ాిీుూెేైొోౌ])', r'\1\2', text)

        return text

class KannadaNormalizer(BaseIndicNormalizer):
    """Kannada-specific normalizer with script preservation"""

    def __init__(self):
        super().__init__('kn')
        self.kannada_range = r'\u0c80-\u0cff'
        # Matches a Kannada letter followed by a stray Devanagari virama (U+094D).
        self.halant_pattern = re.compile(r'([' + self.kannada_range + r'])' + '\u094d')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace a mis-encoded Devanagari virama with the Kannada virama (U+0CCD).
        text = self.halant_pattern.sub(r'\1್', text)

        # Identity pass over consonant + dependent-vowel pairs (hook for future handling).
        text = re.sub(r'([' + self.kannada_range + r'])([ಾಿೀುೂೆೇೈೊೋೌ])', r'\1\2', text)

        return text

class HindiNormalizer(BaseIndicNormalizer):
    """Hindi/Devanagari normalizer for IndicWhisper"""

    def __init__(self):
        super().__init__('hi')
        self.devanagari_range = r'\u0900-\u097f'

        self.vowel_signs = re.compile(r'([' + self.devanagari_range + r'])([ािीुूृेैोौ])')
        self.conjunct_pattern = re.compile(r'([' + self.devanagari_range + r'])्([' + self.devanagari_range + r'])')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # The passes below are identity rewrites of conjuncts, vowel-sign pairs, and
        # nukta consonants; they are retained as hooks for Hindi-specific handling.
        text = self.conjunct_pattern.sub(r'\1्\2', text)
        text = self.vowel_signs.sub(r'\1\2', text)
        text = re.sub(r'([कखगजफ])़', r'\1़', text)

        return text

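# Illustrative (assuming the IndicNLP pass makes no further changes, the observable
# effect currently comes from the base pass):
#     HindiNormalizer().normalize('क्या   हाल   है॥')  ->  'क्या हाल है।'
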
class MarathiNormalizer(BaseIndicNormalizer):
    """Marathi-specific Devanagari normalizer"""

    def __init__(self):
        super().__init__('mr')
        self.devanagari_range = r'\u0900-\u097f'

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Identity substitution over common conjuncts; retained as a hook for
        # Marathi-specific conjunct handling.
        marathi_conjuncts = ['क्ष', 'त्र', 'ज्ञ', 'श्र']
        for conjunct in marathi_conjuncts:
            text = re.sub(conjunct, conjunct, text)

        return text

class SanskritNormalizer(BaseIndicNormalizer):
    """Sanskrit normalizer (classical Devanagari); relies on the base behavior"""

    def __init__(self):
        super().__init__('sa')


class NepaliNormalizer(BaseIndicNormalizer):
    """Nepali normalizer using Devanagari script; relies on the base behavior"""

    def __init__(self):
        super().__init__('ne')

class BengaliNormalizer(BaseIndicNormalizer):
    """Enhanced Bengali normalizer with proper script handling"""

    def __init__(self):
        super().__init__('bn')
        self.bengali_range = r'\u0980-\u09ff'
        # Matches a Bengali letter followed by a stray Devanagari virama (U+094D).
        self.halant_pattern = re.compile(r'([' + self.bengali_range + r'])' + '\u094d')
        self.vowel_signs = re.compile(r'([' + self.bengali_range + r'])([ািীুূৃেৈোৌ])')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace a mis-encoded Devanagari virama with the Bengali hasant (U+09CD).
        text = self.halant_pattern.sub(r'\1্', text)

        # Identity passes over vowel-sign pairs and conjunct sequences (hooks for
        # future Bengali-specific handling).
        text = self.vowel_signs.sub(r'\1\2', text)
        text = re.sub(r'([' + self.bengali_range + r'])্([' + self.bengali_range + r'])', r'\1্\2', text)

        return text

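# Illustrative: a Bengali letter followed by a stray Devanagari virama, e.g.
# 'ক\u094d', is rewritten to 'ক\u09cd' (Bengali hasant) by the halant pass.
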
class AssameseNormalizer(BaseIndicNormalizer):
    """Assamese normalizer (Bengali script variant)"""

    def __init__(self):
        super().__init__('as')
        self.assamese_range = r'\u0980-\u09ff'
        # Matches a letter in the Bengali/Assamese block followed by a stray
        # Devanagari virama (U+094D).
        self.halant_pattern = re.compile(r'([' + self.assamese_range + r'])' + '\u094d')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Replace a mis-encoded Devanagari virama with the Bengali-script hasant (U+09CD).
        text = self.halant_pattern.sub(r'\1্', text)
        return text


# Backward-compatible alias for the original (misspelled) class name.
AssameeseNormalizer = AssameseNormalizer

class OdiaNormalizer(BaseIndicNormalizer):
    """Odia normalizer with proper script handling"""

    def __init__(self):
        super().__init__('or')
        self.odia_range = r'\u0b00-\u0b7f'
        # Matches an Odia letter followed by a stray Devanagari virama (U+094D).
        self.halant_pattern = re.compile(r'([' + self.odia_range + r'])' + '\u094d')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace a mis-encoded Devanagari virama with the Odia virama (U+0B4D).
        text = self.halant_pattern.sub(r'\1୍', text)

        # Identity pass over consonant + dependent-vowel pairs (hook for future handling).
        text = re.sub(r'([' + self.odia_range + r'])([ାିୀୁୂୃେୈୋୌ])', r'\1\2', text)

        return text

class GujaratiNormalizer(BaseIndicNormalizer):
    """Gujarati normalizer with proper script handling"""

    def __init__(self):
        super().__init__('gu')
        self.gujarati_range = r'\u0a80-\u0aff'
        # Matches a Gujarati letter followed by a stray Devanagari virama (U+094D).
        self.halant_pattern = re.compile(r'([' + self.gujarati_range + r'])' + '\u094d')

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Replace a mis-encoded Devanagari virama with the Gujarati virama (U+0ACD).
        text = self.halant_pattern.sub(r'\1્', text)

        # Identity pass over consonant + dependent-vowel pairs (hook for future handling).
        text = re.sub(r'([' + self.gujarati_range + r'])([ાિીુૂેૈોૌ])', r'\1\2', text)

        return text

class PunjabiNormalizer(BaseIndicNormalizer):
    """Punjabi normalizer for Gurmukhi script"""

    def __init__(self):
        super().__init__('pa')
        self.punjabi_range = r'\u0a00-\u0a7f'

    def normalize(self, text: str) -> str:
        text = super().normalize(text)

        # Identity pass over consonant + dependent-vowel pairs (hook for future handling).
        text = re.sub(r'([' + self.punjabi_range + r'])([ਾਿੀੁੂੇੈੋੌ])', r'\1\2', text)

        return text

class SindhiNormalizer(BaseIndicNormalizer):
    """Sindhi normalizer (Perso-Arabic script); relies on the base behavior"""

    def __init__(self):
        super().__init__('sd')


class UrduNormalizer(BaseIndicNormalizer):
    """Urdu normalizer (Perso-Arabic script); currently relies on the base behavior"""

    def __init__(self):
        super().__init__('ur')
        # Character ranges kept for reference; not used by the current passes.
        self.arabic_range = r'\u0600-\u06ff'
        self.urdu_range = r'\u0620-\u065f\u06a0-\u06ef'


class KashmiriNormalizer(BaseIndicNormalizer):
    """Kashmiri normalizer (Perso-Arabic script); relies on the base behavior"""

    def __init__(self):
        super().__init__('ks')

class BodoNormalizer(BaseIndicNormalizer):
    """Bodo normalizer (Devanagari script)"""

    def __init__(self):
        super().__init__('brx')


class SantaliNormalizer(BaseIndicNormalizer):
    """Santali normalizer (Ol Chiki script)"""

    def __init__(self):
        super().__init__('sat')
        # Ol Chiki block, kept for reference; not used by the current passes.
        self.olchiki_range = r'\u1c50-\u1c7f'


class ManipuriNormalizer(BaseIndicNormalizer):
    """Manipuri/Meitei normalizer (Meitei Mayek script)"""

    def __init__(self):
        super().__init__('mni')
        # Meitei Mayek block, kept for reference; not used by the current passes.
        self.meitei_range = r'\uabc0-\uabff'


class DogriNormalizer(BaseIndicNormalizer):
    """Dogri normalizer (Devanagari script)"""

    def __init__(self):
        super().__init__('doi')


class KonkaniNormalizer(BaseIndicNormalizer):
    """Konkani normalizer (Devanagari script)"""

    def __init__(self):
        super().__init__('kok')


class MaithiliNormalizer(BaseIndicNormalizer):
    """Maithili normalizer (Devanagari script)"""

    def __init__(self):
        super().__init__('mai')

NORMALIZERS = {
    'ml': MalayalamNormalizer, 'ta': TamilNormalizer, 'te': TeluguNormalizer, 'kn': KannadaNormalizer,
    'hi': HindiNormalizer, 'mr': MarathiNormalizer, 'sa': SanskritNormalizer, 'ne': NepaliNormalizer,
    'brx': BodoNormalizer, 'doi': DogriNormalizer, 'kok': KonkaniNormalizer, 'mai': MaithiliNormalizer,
    'bn': BengaliNormalizer, 'as': AssameseNormalizer, 'or': OdiaNormalizer,
    'gu': GujaratiNormalizer, 'pa': PunjabiNormalizer, 'sd': SindhiNormalizer,
    'ur': UrduNormalizer, 'ks': KashmiriNormalizer,
    'sat': SantaliNormalizer, 'mni': ManipuriNormalizer,
    'en': BaseIndicNormalizer,
}

def get_normalizer(language_code: str) -> BaseIndicNormalizer:
    """Return the normalizer for a language code, falling back to the base class"""
    if not language_code:
        # No language code given: default to Hindi.
        return BaseIndicNormalizer('hi')

    normalizer_class = NORMALIZERS.get(language_code.lower(), BaseIndicNormalizer)

    try:
        if normalizer_class is BaseIndicNormalizer:
            return normalizer_class(language_code)
        return normalizer_class()
    except Exception as e:
        logger.warning(f"Normalizer initialization failed for {language_code}: {e}")
        return BaseIndicNormalizer(language_code)

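# Illustrative lookup behavior:
#     get_normalizer('ta')  -> TamilNormalizer instance
#     get_normalizer('xx')  -> BaseIndicNormalizer('xx')  (unknown codes fall back to the base class)
#     get_normalizer('')    -> BaseIndicNormalizer('hi')  (empty input defaults to Hindi)
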
def normalize_hindi(text: str) -> str:
    """Standalone Hindi normalization function"""
    normalizer = HindiNormalizer()
    return normalizer.normalize(text)


def normalize_bengali(text: str) -> str:
    """Standalone Bengali normalization function"""
    normalizer = BengaliNormalizer()
    return normalizer.normalize(text)


def normalize_tamil(text: str) -> str:
    """Standalone Tamil normalization function"""
    normalizer = TamilNormalizer()
    return normalizer.normalize(text)


def normalize_telugu(text: str) -> str:
    """Standalone Telugu normalization function"""
    normalizer = TeluguNormalizer()
    return normalizer.normalize(text)


def normalize_malayalam(text: str) -> str:
    """Standalone Malayalam normalization function"""
    normalizer = MalayalamNormalizer()
    return normalizer.normalize(text)


def normalize_kannada(text: str) -> str:
    """Standalone Kannada normalization function"""
    normalizer = KannadaNormalizer()
    return normalizer.normalize(text)


def normalize_gujarati(text: str) -> str:
    """Standalone Gujarati normalization function"""
    normalizer = GujaratiNormalizer()
    return normalizer.normalize(text)


def normalize_punjabi(text: str) -> str:
    """Standalone Punjabi normalization function"""
    normalizer = PunjabiNormalizer()
    return normalizer.normalize(text)


def normalize_marathi(text: str) -> str:
    """Standalone Marathi normalization function"""
    normalizer = MarathiNormalizer()
    return normalizer.normalize(text)


def normalize_odia(text: str) -> str:
    """Standalone Odia normalization function"""
    normalizer = OdiaNormalizer()
    return normalizer.normalize(text)


def normalize_urdu(text: str) -> str:
    """Standalone Urdu normalization function"""
    normalizer = UrduNormalizer()
    return normalizer.normalize(text)

LANGUAGE_INFO = {
    'hi': {'name': 'Hindi', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '600M+'},
    'bn': {'name': 'Bengali', 'script': 'Bengali', 'family': 'Indo-Aryan', 'speakers': '300M+'},
    'te': {'name': 'Telugu', 'script': 'Telugu', 'family': 'Dravidian', 'speakers': '95M+'},
    'mr': {'name': 'Marathi', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '90M+'},
    'ta': {'name': 'Tamil', 'script': 'Tamil', 'family': 'Dravidian', 'speakers': '80M+'},
    'ur': {'name': 'Urdu', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '70M+'},
    'gu': {'name': 'Gujarati', 'script': 'Gujarati', 'family': 'Indo-Aryan', 'speakers': '60M+'},
    'kn': {'name': 'Kannada', 'script': 'Kannada', 'family': 'Dravidian', 'speakers': '50M+'},
    'ml': {'name': 'Malayalam', 'script': 'Malayalam', 'family': 'Dravidian', 'speakers': '40M+'},
    'or': {'name': 'Odia', 'script': 'Odia', 'family': 'Indo-Aryan', 'speakers': '40M+'},
    'pa': {'name': 'Punjabi', 'script': 'Gurmukhi', 'family': 'Indo-Aryan', 'speakers': '35M+'},
    'as': {'name': 'Assamese', 'script': 'Bengali', 'family': 'Indo-Aryan', 'speakers': '15M+'},
    'mai': {'name': 'Maithili', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '13M+'},
    'sa': {'name': 'Sanskrit', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': 'Classical'},
    'ne': {'name': 'Nepali', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '17M+'},
    'ks': {'name': 'Kashmiri', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '7M+'},
    'sd': {'name': 'Sindhi', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '3M+'},
    'brx': {'name': 'Bodo', 'script': 'Devanagari', 'family': 'Tibeto-Burman', 'speakers': '1.5M+'},
    'doi': {'name': 'Dogri', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '2.5M+'},
    'kok': {'name': 'Konkani', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '2M+'},
    'mni': {'name': 'Manipuri', 'script': 'Meitei Mayek', 'family': 'Tibeto-Burman', 'speakers': '1.8M+'},
    'sat': {'name': 'Santali', 'script': 'Ol Chiki', 'family': 'Austroasiatic', 'speakers': '7M+'},
    'en': {'name': 'English', 'script': 'Latin', 'family': 'Germanic', 'speakers': 'Global'},
}

def get_language_info(language_code: str) -> Dict[str, str]:
    """Get comprehensive language information"""
    return LANGUAGE_INFO.get(language_code.lower(), {
        'name': 'Unknown', 'script': 'Unknown', 'family': 'Unknown', 'speakers': 'Unknown'
    })


def get_supported_languages() -> Dict[str, str]:
    """Get list of all supported languages"""
    return {code: info['name'] for code, info in LANGUAGE_INFO.items()}

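# Illustrative return values:
#     get_language_info('ml')          -> {'name': 'Malayalam', 'script': 'Malayalam',
#                                          'family': 'Dravidian', 'speakers': '40M+'}
#     get_supported_languages()['sat'] -> 'Santali'
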
if __name__ == "__main__":
    test_texts = {
        'hi': 'नमस्ते, आप कैसे हैं?',
        'bn': 'নমস্কার, আপনি কেমন আছেন?',
        'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?',
    }

    print("Testing Enhanced Normalizers:")
    print("=" * 50)

    for lang_code, text in test_texts.items():
        normalizer = get_normalizer(lang_code)
        normalized = normalizer.normalize(text)
        print(f"\n{lang_code.upper()}: {normalized}")
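
    # Additional illustrative checks of the fallback and metadata helpers.
    fallback = get_normalizer('xx')  # deliberately unknown code -> base-class fallback
    print(f"\nFallback normalizer class: {type(fallback).__name__}")
    print(f"Hindi metadata: {get_language_info('hi')}")
    print(f"Supported languages: {len(get_supported_languages())}")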
|