flare / tts_preprocessor.py
ciyidogan's picture
Create tts_preprocessor.py
81e4201 verified
raw
history blame
3.11 kB
"""
TTS Text Preprocessing Utilities with Multilingual Support
"""
import re
import json
from typing import Dict, Set, Optional
from num2words import num2words
from pathlib import Path
class TTSPreprocessor:
"""Text preprocessor for TTS providers with multilingual support"""
# Preprocessing flags
PREPROCESS_NUMBERS = "numbers"
PREPROCESS_CURRENCY = "currency"
PREPROCESS_TIME = "time"
PREPROCESS_DATE = "date"
PREPROCESS_CODES = "codes"
PREPROCESS_PERCENTAGE = "percentage"
def __init__(self, language: str = "tr"):
    """Create a preprocessor bound to a single locale.

    Args:
        language: Locale identifier; it selects the
            ``locales/<language>.json`` file that ``_load_locale`` reads.
            Defaults to ``"tr"``.
    """
    self.language = language
    # Loaded once per instance. _load_locale does not raise on a missing or
    # unreadable file; it falls back to English or to a built-in default.
    self.locale_data = self._load_locale(language)
def _load_locale(self, language: str) -> Dict:
    """Load locale data from ``locales/<language>.json`` next to this module.

    If the file for ``language`` does not exist, ``locales/en.json`` is tried
    instead. If the chosen file cannot be opened or parsed, or its top-level
    JSON value is not an object, a minimal default structure is returned so
    callers always receive a dict.

    Args:
        language: Locale identifier used to build the JSON file name.

    Returns:
        The parsed locale dict, or the minimal default structure (with
        ``language_code`` set to ``language``) when loading fails.
    """
    locales_dir = Path(__file__).parent / "locales"
    locale_path = locales_dir / f"{language}.json"

    # Fallback to English if locale not found
    if not locale_path.exists():
        print(f"⚠️ Locale file not found for {language}, falling back to English")
        locale_path = locales_dir / "en.json"

    try:
        with open(locale_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        # Deliberate best-effort: any read/parse failure degrades to defaults.
        print(f"❌ Error loading locale {language}: {e}")
    else:
        if isinstance(data, dict):
            return data
        # Valid JSON but not an object (e.g. a list or null): callers index
        # the result by key, so treat it like a failed load.
        print(
            f"❌ Error loading locale {language}: "
            f"expected a JSON object, got {type(data).__name__}"
        )

    # Return minimal default structure
    return {
        "language_code": language,
        "currency": {"symbols": {}, "codes": {}},
        "months": {},
        "numbers": {
            "decimal_separator": ".",
            "thousands_separator": ",",
            "decimal_word": "point"
        },
        "small_number_threshold": 100
    }
def preprocess(self, text: str, flags: Set[str]) -> str:
    """Apply the preprocessing steps selected by ``flags`` to ``text``.

    Steps run in a fixed order regardless of how ``flags`` is ordered:
    currency, time, date, codes, percentage, and finally numbers.

    Args:
        text: Input text to normalize.
        flags: Collection of ``PREPROCESS_*`` values naming the steps to
            run. ``None`` or an empty collection leaves ``text`` unchanged.

    Returns:
        The text after every selected step has been applied.
    """
    # Nothing requested (also guards against flags=None, which would
    # otherwise raise TypeError on the membership tests below).
    if not flags:
        return text

    if self.PREPROCESS_CURRENCY in flags:
        text = self._process_currency(text)

    if self.PREPROCESS_TIME in flags:
        text = self._process_time(text)

    if self.PREPROCESS_DATE in flags:
        text = self._process_date(text)

    if self.PREPROCESS_CODES in flags:
        text = self._process_codes(text)

    if self.PREPROCESS_PERCENTAGE in flags:
        text = self._process_percentage(text)

    # Numbers should be processed last to avoid conflicts
    if self.PREPROCESS_NUMBERS in flags:
        text = self._process_numbers(text)

    return text
def _process_numbers(self, text: str) -> str:
"""Convert numbers to words based on locale"""
decimal_sep = self.locale_data["numbers"]["decimal_separator"]
thousands_sep = self.locale_data["numbers"]["thousands_separator"]
decimal_word = self.locale_data["numbers"]["decimal_word"]
threshold = self.locale_data.get("small_number_threshold", 100)
def replace_number(match):
num_str = match.group()
# Normalize number format