Spaces:

UcsTurkey
/

flare

Running

File size: 3,114 Bytes

81e4201

"""
TTS Text Preprocessing Utilities with Multilingual Support
"""

import re
import json
from typing import Dict, Set, Optional
from num2words import num2words
from pathlib import Path

class TTSPreprocessor:
    """Text preprocessor for TTS providers with multilingual support.

    An instance is bound to a single language code, and the locale data for
    that language is loaded when the instance is constructed. Callers select
    which transformations run by passing a set of the ``PREPROCESS_*`` flag
    values to ``preprocess``.
    """
    
    # Preprocessing flags: string values a caller places in the ``flags`` set
    # given to ``preprocess``; each one enables the step named alongside it.
    PREPROCESS_NUMBERS = "numbers"  # enables _process_numbers (always run last)
    PREPROCESS_CURRENCY = "currency"  # enables _process_currency
    PREPROCESS_TIME = "time"  # enables _process_time
    PREPROCESS_DATE = "date"  # enables _process_date
    PREPROCESS_CODES = "codes"  # enables _process_codes
    PREPROCESS_PERCENTAGE = "percentage"  # enables _process_percentage
    
    def __init__(self, language: str = "tr"):
        self.language = language
        self.locale_data = self._load_locale(language)
        
    def _load_locale(self, language: str) -> Dict:
        """Load locale data from JSON file"""
        locale_path = Path(__file__).parent / "locales" / f"{language}.json"
        
        # Fallback to English if locale not found
        if not locale_path.exists():
            print(f"⚠️ Locale file not found for {language}, falling back to English")
            locale_path = Path(__file__).parent / "locales" / "en.json"
            
        try:
            with open(locale_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"❌ Error loading locale {language}: {e}")
            # Return minimal default structure
            return {
                "language_code": language,
                "currency": {"symbols": {}, "codes": {}},
                "months": {},
                "numbers": {
                    "decimal_separator": ".",
                    "thousands_separator": ",",
                    "decimal_word": "point"
                },
                "small_number_threshold": 100
            }
    
    def preprocess(self, text: str, flags: Set[str]) -> str:
        """Apply preprocessing based on flags"""
        
        if self.PREPROCESS_CURRENCY in flags:
            text = self._process_currency(text)
            
        if self.PREPROCESS_TIME in flags:
            text = self._process_time(text)
            
        if self.PREPROCESS_DATE in flags:
            text = self._process_date(text)
            
        if self.PREPROCESS_CODES in flags:
            text = self._process_codes(text)
            
        if self.PREPROCESS_PERCENTAGE in flags:
            text = self._process_percentage(text)
            
        # Numbers should be processed last to avoid conflicts
        if self.PREPROCESS_NUMBERS in flags:
            text = self._process_numbers(text)
            
        return text
    
    def _process_numbers(self, text: str) -> str:
        """Convert numbers to words based on locale"""
        decimal_sep = self.locale_data["numbers"]["decimal_separator"]
        thousands_sep = self.locale_data["numbers"]["thousands_separator"]
        decimal_word = self.locale_data["numbers"]["decimal_word"]
        threshold = self.locale_data.get("small_number_threshold", 100)
        
        def replace_number(match):
            num_str = match.group()
            
            # Normalize number format