Spaces:
Running
Running
File size: 3,114 Bytes
81e4201 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
"""
TTS Text Preprocessing Utilities with Multilingual Support
"""
import re
import json
from typing import Dict, Set, Optional
from num2words import num2words
from pathlib import Path
class TTSPreprocessor:
"""Text preprocessor for TTS providers with multilingual support"""
# Preprocessing flags
PREPROCESS_NUMBERS = "numbers"
PREPROCESS_CURRENCY = "currency"
PREPROCESS_TIME = "time"
PREPROCESS_DATE = "date"
PREPROCESS_CODES = "codes"
PREPROCESS_PERCENTAGE = "percentage"
def __init__(self, language: str = "tr"):
self.language = language
self.locale_data = self._load_locale(language)
def _load_locale(self, language: str) -> Dict:
"""Load locale data from JSON file"""
locale_path = Path(__file__).parent / "locales" / f"{language}.json"
# Fallback to English if locale not found
if not locale_path.exists():
print(f"⚠️ Locale file not found for {language}, falling back to English")
locale_path = Path(__file__).parent / "locales" / "en.json"
try:
with open(locale_path, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"❌ Error loading locale {language}: {e}")
# Return minimal default structure
return {
"language_code": language,
"currency": {"symbols": {}, "codes": {}},
"months": {},
"numbers": {
"decimal_separator": ".",
"thousands_separator": ",",
"decimal_word": "point"
},
"small_number_threshold": 100
}
def preprocess(self, text: str, flags: Set[str]) -> str:
"""Apply preprocessing based on flags"""
if self.PREPROCESS_CURRENCY in flags:
text = self._process_currency(text)
if self.PREPROCESS_TIME in flags:
text = self._process_time(text)
if self.PREPROCESS_DATE in flags:
text = self._process_date(text)
if self.PREPROCESS_CODES in flags:
text = self._process_codes(text)
if self.PREPROCESS_PERCENTAGE in flags:
text = self._process_percentage(text)
# Numbers should be processed last to avoid conflicts
if self.PREPROCESS_NUMBERS in flags:
text = self._process_numbers(text)
return text
def _process_numbers(self, text: str) -> str:
"""Convert numbers to words based on locale"""
decimal_sep = self.locale_data["numbers"]["decimal_separator"]
thousands_sep = self.locale_data["numbers"]["thousands_separator"]
decimal_word = self.locale_data["numbers"]["decimal_word"]
threshold = self.locale_data.get("small_number_threshold", 100)
def replace_number(match):
num_str = match.group()
# Normalize number format |