# ASR-NEW/normalizers.py
"""
Complete normalizers for ALL 22+ scheduled Indian languages
Enhanced with proper script-specific normalization and IndicWhisper compatibility
Integrated with IndicNLP library for production use
"""
import re
import unicodedata
from typing import Dict, Optional, Union
import logging
# Optional IndicNLP Library integration (the package installs as `indicnlp`)
try:
    from indicnlp import common  # noqa: F401
    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    INDIC_NLP_AVAILABLE = True
except ImportError:
    INDIC_NLP_AVAILABLE = False
logger = logging.getLogger(__name__)
class BaseIndicNormalizer:
"""Enhanced base class for Indic language normalizers with IndicNLP integration"""
def __init__(self, language_code: str):
self.language_code = language_code
# Common patterns
self.extra_spaces = re.compile(r'\s+')
self.punctuation_normalize = re.compile(r'[।॥]')
        # Initialize the IndicNLP normalizer if the library is available
        # (skip English, which IndicNLP does not handle)
        self.indic_normalizer = None
        if INDIC_NLP_AVAILABLE and language_code != 'en':
try:
factory = IndicNormalizerFactory()
self.indic_normalizer = factory.get_normalizer(language_code, remove_nuktas=False)
except Exception as e:
logger.warning(f"Could not initialize IndicNLP normalizer for {language_code}: {e}")
def normalize(self, text: str) -> str:
"""Enhanced normalization with IndicNLP integration"""
if not text or not text.strip():
return ""
# First try IndicNLP normalization if available
if self.indic_normalizer and self.language_code != 'en':
try:
text = self.indic_normalizer.normalize(text)
except Exception as e:
logger.warning(f"IndicNLP normalization failed for {self.language_code}: {e}")
# Apply NFC normalization (canonical composition)
text = unicodedata.normalize('NFC', text)
# Basic cleanup
text = text.strip()
text = self.extra_spaces.sub(' ', text)
text = self.punctuation_normalize.sub('।', text)
return text
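# Note on the base cleanup (illustrative; assumes the optional IndicNLP pass
# leaves plain text and dandas untouched):
#   BaseIndicNormalizer('hi').normalize('  राम   ॥  ')  ->  'राम ।'
# i.e. NFC composition, whitespace collapsing, and ॥ -> । folding apply even
# when the IndicNLP library is not installed.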
# DRAVIDIAN LANGUAGES
class MalayalamNormalizer(BaseIndicNormalizer):
    """Malayalam-specific normalizer preserving chillu forms and complex conjuncts"""
    def __init__(self):
        super().__init__('ml')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.malayalam_range = r'\u0d00-\u0d7f'
        # Atomic chillu letters and their legacy <consonant + virama + ZWJ> encodings
        self.chillu_forms = {
            '\u0d7a': 'ണ്\u200d',  # chillu nn
            '\u0d7b': 'ന്\u200d',  # chillu n
            '\u0d7c': 'ര്\u200d',  # chillu rr
            '\u0d7d': 'ല്\u200d',  # chillu l
            '\u0d7e': 'ള്\u200d',  # chillu ll
            '\u0d7f': 'ക്\u200d',  # chillu k
        }
        self.conjunct_pattern = re.compile(r'([' + self.malayalam_range + r'])്([' + self.malayalam_range + r'])')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Canonicalize legacy chillu encodings to the atomic chillu code points
        for chillu_char, legacy_sequence in self.chillu_forms.items():
            text = text.replace(legacy_sequence, chillu_char)
        # Conjuncts joined by the virama (്) are kept intact (identity rewrite)
        text = self.conjunct_pattern.sub(r'\1്\2', text)
        return text
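# Illustrative effect of the chillu canonicalization above when the optional
# IndicNLP pass is not installed (IndicNLP may itself strip or rewrite ZWJ):
#   MalayalamNormalizer().normalize('അവന്\u200d')  ->  'അവൻ'
#   (legacy ന + ് + ZWJ collapses to the atomic chillu ൻ, U+0D7B)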
class TamilNormalizer(BaseIndicNormalizer):
    """Tamil-specific normalizer with proper pulli handling"""
    def __init__(self):
        super().__init__('ta')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.tamil_range = r'\u0b80-\u0bff'
        # Matches a Tamil consonant followed by a stray Devanagari virama (U+094D)
        self.pulli_pattern = re.compile(r'([' + self.tamil_range + r'])\u094d')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Tamil pulli (், U+0BCD)
        text = self.pulli_pattern.sub(r'\1்', text)
        # Tamil consonant + vowel-sign pairs are kept intact (identity rewrite)
        text = re.sub(r'([' + self.tamil_range + r'])([ாிீுூெேைொோௌ])', r'\1\2', text)
        return text
class TeluguNormalizer(BaseIndicNormalizer):
    """Telugu-specific normalizer with proper halant handling"""
    def __init__(self):
        super().__init__('te')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.telugu_range = r'\u0c00-\u0c7f'
        # Matches a Telugu consonant followed by a stray Devanagari virama (U+094D)
        self.halant_pattern = re.compile(r'([' + self.telugu_range + r'])\u094d')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Telugu halant (్, U+0C4D)
        text = self.halant_pattern.sub(r'\1్', text)
        # Telugu consonant + vowel-sign pairs are kept intact (identity rewrite)
        text = re.sub(r'([' + self.telugu_range + r'])([ాిీుూెేైొోౌ])', r'\1\2', text)
        return text
class KannadaNormalizer(BaseIndicNormalizer):
    """Kannada-specific normalizer with script preservation"""
    def __init__(self):
        super().__init__('kn')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.kannada_range = r'\u0c80-\u0cff'
        # Matches a Kannada consonant followed by a stray Devanagari virama (U+094D)
        self.halant_pattern = re.compile(r'([' + self.kannada_range + r'])\u094d')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Kannada halant (್, U+0CCD)
        text = self.halant_pattern.sub(r'\1್', text)
        # Kannada consonant + vowel-sign pairs are kept intact (identity rewrite)
        text = re.sub(r'([' + self.kannada_range + r'])([ಾಿೀುೂೆೇೈೊೋೌ])', r'\1\2', text)
        return text
# INDO-ARYAN LANGUAGES (Devanagari Script)
class HindiNormalizer(BaseIndicNormalizer):
    """Enhanced Hindi/Devanagari normalizer for IndicWhisper"""
    def __init__(self):
        super().__init__('hi')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.devanagari_range = r'\u0900-\u097f'
        # Devanagari vowel signs (matras)
        self.vowel_signs = re.compile(r'([' + self.devanagari_range + r'])([ािीुूृेैोौ])')
        self.conjunct_pattern = re.compile(r'([' + self.devanagari_range + r'])्([' + self.devanagari_range + r'])')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Conjuncts joined by the halant (्) are kept intact (identity rewrite)
        text = self.conjunct_pattern.sub(r'\1्\2', text)
        # Vowel signs (matras) are kept intact (identity rewrite)
        text = self.vowel_signs.sub(r'\1\2', text)
        # Nukta sequences (क़, ख़, ग़, ज़, फ़) are kept as consonant + ़
        text = re.sub(r'([कखगजफ])़', r'\1़', text)
        return text
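# Unicode note (illustrative): the NFC step in the base class decomposes the
# precomposed nukta letters U+0958-U+095F (they are composition exclusions), so
# e.g. 'क़' (U+0958) ends up as 'क' + '़' (U+0915 U+093C); the nukta rewrite
# above leaves that two-code-point pair intact.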
class MarathiNormalizer(BaseIndicNormalizer):
    """Marathi-specific Devanagari normalizer"""
    def __init__(self):
        super().__init__('mr')
        # Code-point range only (no brackets)
        self.devanagari_range = r'\u0900-\u097f'
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Common Devanagari conjuncts (क्ष, त्र, ज्ञ, श्र) survive the base
        # normalization unchanged; no Marathi-specific rewriting is required.
        return text
class SanskritNormalizer(BaseIndicNormalizer):
"""Sanskrit normalizer with classical Devanagari handling"""
def __init__(self):
super().__init__('sa')
class NepaliNormalizer(BaseIndicNormalizer):
"""Nepali normalizer using Devanagari script"""
def __init__(self):
super().__init__('ne')
# EASTERN INDO-ARYAN
class BengaliNormalizer(BaseIndicNormalizer):
    """Enhanced Bengali normalizer with proper script handling"""
    def __init__(self):
        super().__init__('bn')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.bengali_range = r'\u0980-\u09ff'
        # Matches a Bengali consonant followed by a stray Devanagari virama (U+094D)
        self.halant_pattern = re.compile(r'([' + self.bengali_range + r'])\u094d')
        self.vowel_signs = re.compile(r'([' + self.bengali_range + r'])([ািীুূৃেৈোৌ])')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Bengali hasant (্, U+09CD)
        text = self.halant_pattern.sub(r'\1্', text)
        # Bengali vowel signs are kept intact (identity rewrite)
        text = self.vowel_signs.sub(r'\1\2', text)
        # Conjuncts joined by the hasant (্) are kept intact (identity rewrite)
        text = re.sub(r'([' + self.bengali_range + r'])্([' + self.bengali_range + r'])', r'\1্\2', text)
        return text
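# Illustrative effect of the stray-virama mapping above (hypothetical mixed-script
# input, assuming the optional IndicNLP pass leaves the Devanagari virama in place):
#   BengaliNormalizer().normalize('ক\u094d')  ->  'ক\u09cd'  (i.e. 'ক্')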
class AssameseNormalizer(BaseIndicNormalizer):
    """Assamese normalizer (Bengali script variant)"""
    def __init__(self):
        super().__init__('as')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.assamese_range = r'\u0980-\u09ff'
        # Matches an Assamese consonant followed by a stray Devanagari virama (U+094D)
        self.halant_pattern = re.compile(r'([' + self.assamese_range + r'])\u094d')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Bengali-script hasant (্, U+09CD)
        text = self.halant_pattern.sub(r'\1্', text)
        return text
class OdiaNormalizer(BaseIndicNormalizer):
    """Odia normalizer with proper script handling"""
    def __init__(self):
        super().__init__('or')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.odia_range = r'\u0b00-\u0b7f'
        # Matches an Odia consonant followed by a stray Devanagari virama (U+094D)
        self.halant_pattern = re.compile(r'([' + self.odia_range + r'])\u094d')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Odia halant (୍, U+0B4D)
        text = self.halant_pattern.sub(r'\1୍', text)
        # Odia consonant + vowel-sign pairs are kept intact (identity rewrite)
        text = re.sub(r'([' + self.odia_range + r'])([ାିୀୁୂୃେୈୋୌ])', r'\1\2', text)
        return text
# WESTERN INDO-ARYAN
class GujaratiNormalizer(BaseIndicNormalizer):
    """Gujarati normalizer with proper script handling"""
    def __init__(self):
        super().__init__('gu')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.gujarati_range = r'\u0a80-\u0aff'
        # Matches a Gujarati consonant followed by a stray Devanagari virama (U+094D)
        self.halant_pattern = re.compile(r'([' + self.gujarati_range + r'])\u094d')
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Map a stray Devanagari virama to the Gujarati halant (્, U+0ACD)
        text = self.halant_pattern.sub(r'\1્', text)
        # Gujarati consonant + vowel-sign pairs are kept intact (identity rewrite)
        text = re.sub(r'([' + self.gujarati_range + r'])([ાિીુૂેૈોૌ])', r'\1\2', text)
        return text
class PunjabiNormalizer(BaseIndicNormalizer):
    """Punjabi normalizer for Gurmukhi script"""
    def __init__(self):
        super().__init__('pa')
        # Code-point range only (no brackets); wrapped in [...] where used
        self.punjabi_range = r'\u0a00-\u0a7f'
    def normalize(self, text: str) -> str:
        text = super().normalize(text)
        # Gurmukhi consonant + vowel-sign pairs are kept intact (identity rewrite)
        text = re.sub(r'([' + self.punjabi_range + r'])([ਾਿੀੁੂੇੈੋੌ])', r'\1\2', text)
        return text
class SindhiNormalizer(BaseIndicNormalizer):
"""Sindhi normalizer (Arabic script)"""
def __init__(self):
super().__init__('sd')
# PERSO-ARABIC SCRIPT
class UrduNormalizer(BaseIndicNormalizer):
"""Enhanced Urdu normalizer for Arabic script"""
def __init__(self):
super().__init__('ur')
self.arabic_range = r'[\u0600-\u06ff]'
self.urdu_range = r'[\u0620-\u065f\u06a0-\u06ef]'
class KashmiriNormalizer(BaseIndicNormalizer):
"""Kashmiri normalizer (Arabic script)"""
def __init__(self):
super().__init__('ks')
# TIBETO-BURMAN AND OTHERS
class BodoNormalizer(BaseIndicNormalizer):
"""Bodo normalizer (Devanagari script)"""
def __init__(self):
super().__init__('brx')
class SantaliNormalizer(BaseIndicNormalizer):
"""Santali normalizer (Ol Chiki script)"""
def __init__(self):
super().__init__('sat')
self.olchiki_range = r'[\u1c50-\u1c7f]'
class ManipuriNormalizer(BaseIndicNormalizer):
"""Manipuri/Meitei normalizer (Meitei Mayek script)"""
def __init__(self):
super().__init__('mni')
self.meitei_range = r'[\uabc0-\uabff]'
class DogriNormalizer(BaseIndicNormalizer):
"""Dogri normalizer (Devanagari script)"""
def __init__(self):
super().__init__('doi')
class KonkaniNormalizer(BaseIndicNormalizer):
"""Konkani normalizer (Devanagari script)"""
def __init__(self):
super().__init__('kok')
class MaithiliNormalizer(BaseIndicNormalizer):
"""Maithili normalizer (Devanagari script)"""
def __init__(self):
super().__init__('mai')
# COMPLETE NORMALIZER MAPPING
NORMALIZERS = {
'ml': MalayalamNormalizer, 'ta': TamilNormalizer, 'te': TeluguNormalizer, 'kn': KannadaNormalizer,
'hi': HindiNormalizer, 'mr': MarathiNormalizer, 'sa': SanskritNormalizer, 'ne': NepaliNormalizer,
'brx': BodoNormalizer, 'doi': DogriNormalizer, 'kok': KonkaniNormalizer, 'mai': MaithiliNormalizer,
'bn': BengaliNormalizer, 'as': AssameseNormalizer, 'or': OdiaNormalizer,
'gu': GujaratiNormalizer, 'pa': PunjabiNormalizer, 'sd': SindhiNormalizer,
'ur': UrduNormalizer, 'ks': KashmiriNormalizer,
'sat': SantaliNormalizer, 'mni': ManipuriNormalizer,
'en': BaseIndicNormalizer,
}
def get_normalizer(language_code: str) -> BaseIndicNormalizer:
    """Get the appropriate normalizer for a language code, with safe fallbacks"""
    if not language_code:
        # Default to Hindi when no language code is supplied
        return BaseIndicNormalizer('hi')
    normalizer_class = NORMALIZERS.get(language_code.lower(), BaseIndicNormalizer)
    try:
        if normalizer_class is BaseIndicNormalizer:
            return normalizer_class(language_code)
        return normalizer_class()
    except Exception as e:
        logger.warning(f"Normalizer initialization failed for {language_code}: {e}")
        return BaseIndicNormalizer(language_code)
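# Illustrative dispatch behaviour (follows NORMALIZERS above):
#   get_normalizer('ta')  -> TamilNormalizer()
#   get_normalizer('TA')  -> TamilNormalizer()            (codes are lower-cased)
#   get_normalizer('xx')  -> BaseIndicNormalizer('xx')    (unknown code: generic fallback)
#   get_normalizer('')    -> BaseIndicNormalizer('hi')    (missing code: Hindi default)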
# Standalone functions for backward compatibility
def normalize_hindi(text: str) -> str:
"""Standalone Hindi normalization function"""
normalizer = HindiNormalizer()
return normalizer.normalize(text)
def normalize_bengali(text: str) -> str:
"""Standalone Bengali normalization function"""
normalizer = BengaliNormalizer()
return normalizer.normalize(text)
def normalize_tamil(text: str) -> str:
"""Standalone Tamil normalization function"""
normalizer = TamilNormalizer()
return normalizer.normalize(text)
def normalize_telugu(text: str) -> str:
"""Standalone Telugu normalization function"""
normalizer = TeluguNormalizer()
return normalizer.normalize(text)
def normalize_malayalam(text: str) -> str:
"""Standalone Malayalam normalization function"""
normalizer = MalayalamNormalizer()
return normalizer.normalize(text)
def normalize_kannada(text: str) -> str:
"""Standalone Kannada normalization function"""
normalizer = KannadaNormalizer()
return normalizer.normalize(text)
def normalize_gujarati(text: str) -> str:
"""Standalone Gujarati normalization function"""
normalizer = GujaratiNormalizer()
return normalizer.normalize(text)
def normalize_punjabi(text: str) -> str:
"""Standalone Punjabi normalization function"""
normalizer = PunjabiNormalizer()
return normalizer.normalize(text)
def normalize_marathi(text: str) -> str:
"""Standalone Marathi normalization function"""
normalizer = MarathiNormalizer()
return normalizer.normalize(text)
def normalize_odia(text: str) -> str:
"""Standalone Odia normalization function"""
normalizer = OdiaNormalizer()
return normalizer.normalize(text)
def normalize_urdu(text: str) -> str:
"""Standalone Urdu normalization function"""
normalizer = UrduNormalizer()
return normalizer.normalize(text)
# Language metadata
LANGUAGE_INFO = {
'hi': {'name': 'Hindi', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '600M+'},
'bn': {'name': 'Bengali', 'script': 'Bengali', 'family': 'Indo-Aryan', 'speakers': '300M+'},
'te': {'name': 'Telugu', 'script': 'Telugu', 'family': 'Dravidian', 'speakers': '95M+'},
'mr': {'name': 'Marathi', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '90M+'},
'ta': {'name': 'Tamil', 'script': 'Tamil', 'family': 'Dravidian', 'speakers': '80M+'},
'ur': {'name': 'Urdu', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '70M+'},
'gu': {'name': 'Gujarati', 'script': 'Gujarati', 'family': 'Indo-Aryan', 'speakers': '60M+'},
'kn': {'name': 'Kannada', 'script': 'Kannada', 'family': 'Dravidian', 'speakers': '50M+'},
'ml': {'name': 'Malayalam', 'script': 'Malayalam', 'family': 'Dravidian', 'speakers': '40M+'},
'or': {'name': 'Odia', 'script': 'Odia', 'family': 'Indo-Aryan', 'speakers': '40M+'},
'pa': {'name': 'Punjabi', 'script': 'Gurmukhi', 'family': 'Indo-Aryan', 'speakers': '35M+'},
'as': {'name': 'Assamese', 'script': 'Bengali', 'family': 'Indo-Aryan', 'speakers': '15M+'},
'mai': {'name': 'Maithili', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '13M+'},
'sa': {'name': 'Sanskrit', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': 'Classical'},
'ne': {'name': 'Nepali', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '17M+'},
'ks': {'name': 'Kashmiri', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '7M+'},
'sd': {'name': 'Sindhi', 'script': 'Arabic', 'family': 'Indo-Aryan', 'speakers': '3M+'},
'brx': {'name': 'Bodo', 'script': 'Devanagari', 'family': 'Tibeto-Burman', 'speakers': '1.5M+'},
'doi': {'name': 'Dogri', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '2.5M+'},
'kok': {'name': 'Konkani', 'script': 'Devanagari', 'family': 'Indo-Aryan', 'speakers': '2M+'},
'mni': {'name': 'Manipuri', 'script': 'Meitei Mayek', 'family': 'Tibeto-Burman', 'speakers': '1.8M+'},
'sat': {'name': 'Santali', 'script': 'Ol Chiki', 'family': 'Austroasiatic', 'speakers': '7M+'},
'en': {'name': 'English', 'script': 'Latin', 'family': 'Germanic', 'speakers': 'Global'},
}
def get_language_info(language_code: str) -> Dict[str, str]:
"""Get comprehensive language information"""
return LANGUAGE_INFO.get(language_code.lower(), {
'name': 'Unknown', 'script': 'Unknown', 'family': 'Unknown', 'speakers': 'Unknown'
})
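# Illustrative lookups:
#   get_language_info('ta')  -> {'name': 'Tamil', 'script': 'Tamil', 'family': 'Dravidian', 'speakers': '80M+'}
#   get_language_info('zz')  -> {'name': 'Unknown', 'script': 'Unknown', 'family': 'Unknown', 'speakers': 'Unknown'}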
def get_supported_languages() -> Dict[str, str]:
"""Get list of all supported languages"""
return {code: info['name'] for code, info in LANGUAGE_INFO.items()}
if __name__ == "__main__":
# Test normalization
test_texts = {
'hi': 'नमस्ते, आप कैसे हैं?',
'bn': 'নমস্কার, আপনি কেমন আছেন?',
'ta': 'வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?',
}
print("Testing Enhanced Normalizers:")
print("=" * 50)
for lang_code, text in test_texts.items():
normalizer = get_normalizer(lang_code)
normalized = normalizer.normalize(text)
print(f"\n{lang_code.upper()}: {normalized}")