Spaces:
Running
Running
""" | |
TTS Text Preprocessing Utilities with Multilingual Support
""" | |
import re | |
import json | |
from typing import Dict, Set, Optional | |
from num2words import num2words | |
from pathlib import Path | |
class TTSPreprocessor: | |
"""Text preprocessor for TTS providers with multilingual support""" | |
# Preprocessing flags | |
PREPROCESS_NUMBERS = "numbers" | |
PREPROCESS_CURRENCY = "currency" | |
PREPROCESS_TIME = "time" | |
PREPROCESS_DATE = "date" | |
PREPROCESS_CODES = "codes" | |
PREPROCESS_PERCENTAGE = "percentage" | |
def __init__(self, language: str = "tr"): | |
self.language = language | |
self.locale_data = self._load_locale(language) | |
def _load_locale(self, language: str) -> Dict: | |
"""Load locale data from JSON file""" | |
locale_path = Path(__file__).parent / "locales" / f"{language}.json" | |
# Fallback to English if locale not found | |
if not locale_path.exists(): | |
print(f"⚠️ Locale file not found for {language}, falling back to English") | |
locale_path = Path(__file__).parent / "locales" / "en.json" | |
try: | |
with open(locale_path, 'r', encoding='utf-8') as f: | |
return json.load(f) | |
except Exception as e: | |
print(f"❌ Error loading locale {language}: {e}") | |
# Return minimal default structure | |
return { | |
"language_code": language, | |
"currency": {"symbols": {}, "codes": {}}, | |
"months": {}, | |
"numbers": { | |
"decimal_separator": ".", | |
"thousands_separator": ",", | |
"decimal_word": "point" | |
}, | |
"small_number_threshold": 100 | |
} | |
def preprocess(self, text: str, flags: Set[str]) -> str:
    """Run every enabled preprocessing step over ``text``.

    Args:
        text: Input text to transform.
        flags: Collection of ``PREPROCESS_*`` flag values selecting which
            steps to apply.

    Returns:
        The text after all selected steps have been applied. Steps run in
        a fixed order: currency, time, date, codes, percentage, numbers.
    """
    # (flag, handler method name) pairs in execution order. Numbers are
    # kept last to avoid conflicts with the more specific steps.
    pipeline = (
        (self.PREPROCESS_CURRENCY, "_process_currency"),
        (self.PREPROCESS_TIME, "_process_time"),
        (self.PREPROCESS_DATE, "_process_date"),
        (self.PREPROCESS_CODES, "_process_codes"),
        (self.PREPROCESS_PERCENTAGE, "_process_percentage"),
        (self.PREPROCESS_NUMBERS, "_process_numbers"),
    )

    for flag, handler_name in pipeline:
        if flag in flags:
            # Handler is looked up only when its flag is enabled.
            text = getattr(self, handler_name)(text)
    return text
def _process_numbers(self, text: str) -> str: | |
"""Convert numbers to words based on locale""" | |
decimal_sep = self.locale_data["numbers"]["decimal_separator"] | |
thousands_sep = self.locale_data["numbers"]["thousands_separator"] | |
decimal_word = self.locale_data["numbers"]["decimal_word"] | |
threshold = self.locale_data.get("small_number_threshold", 100) | |
def replace_number(match): | |
num_str = match.group() | |
# Normalize number format |