Spaces:
Paused
Paused
Create tts_preprocessor.py
Browse files- tts_preprocessor.py +87 -0
tts_preprocessor.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
TTS Text Preprocessing Utilities with Multilingual Support
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
import json
|
| 7 |
+
from typing import Dict, Set, Optional
|
| 8 |
+
from num2words import num2words
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
class TTSPreprocessor:
|
| 12 |
+
"""Text preprocessor for TTS providers with multilingual support"""
|
| 13 |
+
|
| 14 |
+
# Preprocessing flags: string values callers place in the ``flags`` set
# passed to ``preprocess`` to switch individual steps on.
PREPROCESS_NUMBERS = "numbers"        # enables _process_numbers (applied last)
PREPROCESS_CURRENCY = "currency"      # enables _process_currency
PREPROCESS_TIME = "time"              # enables _process_time
PREPROCESS_DATE = "date"              # enables _process_date
PREPROCESS_CODES = "codes"            # enables _process_codes
PREPROCESS_PERCENTAGE = "percentage"  # enables _process_percentage
|
| 21 |
+
|
| 22 |
+
def __init__(self, language: str = "tr"):
|
| 23 |
+
self.language = language
|
| 24 |
+
self.locale_data = self._load_locale(language)
|
| 25 |
+
|
| 26 |
+
def _load_locale(self, language: str) -> Dict:
|
| 27 |
+
"""Load locale data from JSON file"""
|
| 28 |
+
locale_path = Path(__file__).parent / "locales" / f"{language}.json"
|
| 29 |
+
|
| 30 |
+
# Fallback to English if locale not found
|
| 31 |
+
if not locale_path.exists():
|
| 32 |
+
print(f"⚠️ Locale file not found for {language}, falling back to English")
|
| 33 |
+
locale_path = Path(__file__).parent / "locales" / "en.json"
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
with open(locale_path, 'r', encoding='utf-8') as f:
|
| 37 |
+
return json.load(f)
|
| 38 |
+
except Exception as e:
|
| 39 |
+
print(f"❌ Error loading locale {language}: {e}")
|
| 40 |
+
# Return minimal default structure
|
| 41 |
+
return {
|
| 42 |
+
"language_code": language,
|
| 43 |
+
"currency": {"symbols": {}, "codes": {}},
|
| 44 |
+
"months": {},
|
| 45 |
+
"numbers": {
|
| 46 |
+
"decimal_separator": ".",
|
| 47 |
+
"thousands_separator": ",",
|
| 48 |
+
"decimal_word": "point"
|
| 49 |
+
},
|
| 50 |
+
"small_number_threshold": 100
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
def preprocess(self, text: str, flags: Set[str]) -> str:
    """Run the selected preprocessing steps over ``text``.

    Args:
        text: Input text to transform.
        flags: Set of ``PREPROCESS_*`` flag values choosing which steps run.
            Values that match no known flag are ignored.

    Returns:
        The text after every selected step has been applied, always in the
        order currency, time, date, codes, percentage, numbers.
    """
    # Ordered (flag, handler name) pairs. Numbers are processed last to
    # avoid conflicts with the more specific steps. Handlers are looked up
    # by name only when their flag is set.
    pipeline = (
        (self.PREPROCESS_CURRENCY, "_process_currency"),
        (self.PREPROCESS_TIME, "_process_time"),
        (self.PREPROCESS_DATE, "_process_date"),
        (self.PREPROCESS_CODES, "_process_codes"),
        (self.PREPROCESS_PERCENTAGE, "_process_percentage"),
        (self.PREPROCESS_NUMBERS, "_process_numbers"),
    )

    for flag, handler_name in pipeline:
        if flag in flags:
            text = getattr(self, handler_name)(text)

    return text
|
| 76 |
+
|
| 77 |
+
def _process_numbers(self, text: str) -> str:
|
| 78 |
+
"""Convert numbers to words based on locale"""
|
| 79 |
+
decimal_sep = self.locale_data["numbers"]["decimal_separator"]
|
| 80 |
+
thousands_sep = self.locale_data["numbers"]["thousands_separator"]
|
| 81 |
+
decimal_word = self.locale_data["numbers"]["decimal_word"]
|
| 82 |
+
threshold = self.locale_data.get("small_number_threshold", 100)
|
| 83 |
+
|
| 84 |
+
def replace_number(match):
|
| 85 |
+
num_str = match.group()
|
| 86 |
+
|
| 87 |
+
# Normalize number format
|