""" TTS Text Preprocessing Utilities with Multilingual Support """ import re import json from typing import Dict, Set, Optional from num2words import num2words from pathlib import Path class TTSPreprocessor: """Text preprocessor for TTS providers with multilingual support""" # Preprocessing flags PREPROCESS_NUMBERS = "numbers" PREPROCESS_CURRENCY = "currency" PREPROCESS_TIME = "time" PREPROCESS_DATE = "date" PREPROCESS_CODES = "codes" PREPROCESS_PERCENTAGE = "percentage" def __init__(self, language: str = "tr"): self.language = language self.locale_data = self._load_locale(language) def _load_locale(self, language: str) -> Dict: """Load locale data from JSON file""" locale_path = Path(__file__).parent / "locales" / f"{language}.json" # Fallback to English if locale not found if not locale_path.exists(): print(f"⚠️ Locale file not found for {language}, falling back to English") locale_path = Path(__file__).parent / "locales" / "en.json" try: with open(locale_path, 'r', encoding='utf-8') as f: return json.load(f) except Exception as e: print(f"❌ Error loading locale {language}: {e}") # Return minimal default structure return { "language_code": language, "currency": {"symbols": {}, "codes": {}}, "months": {}, "numbers": { "decimal_separator": ".", "thousands_separator": ",", "decimal_word": "point" }, "small_number_threshold": 100 } def preprocess(self, text: str, flags: Set[str]) -> str: """Apply preprocessing based on flags""" if self.PREPROCESS_CURRENCY in flags: text = self._process_currency(text) if self.PREPROCESS_TIME in flags: text = self._process_time(text) if self.PREPROCESS_DATE in flags: text = self._process_date(text) if self.PREPROCESS_CODES in flags: text = self._process_codes(text) if self.PREPROCESS_PERCENTAGE in flags: text = self._process_percentage(text) # Numbers should be processed last to avoid conflicts if self.PREPROCESS_NUMBERS in flags: text = self._process_numbers(text) return text def _process_numbers(self, text: str) -> str: """Convert numbers to words based on locale""" decimal_sep = self.locale_data["numbers"]["decimal_separator"] thousands_sep = self.locale_data["numbers"]["thousands_separator"] decimal_word = self.locale_data["numbers"]["decimal_word"] threshold = self.locale_data.get("small_number_threshold", 100) def replace_number(match): num_str = match.group() # Normalize number format