Spaces:

UcsTurkey
/

flare

Building

App Files Files Community

ciyidogan committed on Jun 18

Commit

17a90d6

verified ·

1 Parent(s): 81e4201

Update tts_preprocessor.py

Browse files

Files changed (1) hide show

tts_preprocessor.py +137 -1

tts_preprocessor.py CHANGED Viewed

@@ -84,4 +84,140 @@ class TTSPreprocessor:
         def replace_number(match):
             num_str = match.group()
-            # Normalize number format

         def replace_number(match):
             num_str = match.group()
+            # Normalize number format
+            if self.language == "tr":
+                # Turkish: 1.234,56 -> 1234.56
+                num_str = num_str.replace('.', '').replace(',', '.')
+            else:
+                # English: 1,234.56 -> 1234.56
+                num_str = num_str.replace(',', '')
+            try:
+                num = float(num_str)
+                if num.is_integer():
+                    num = int(num)
+                # Keep small numbers as is based on threshold
+                if isinstance(num, int) and 0 <= num <= threshold:
+                    return str(num)
+                # Convert large numbers to words
+                if isinstance(num, int):
+                    try:
+                        return num2words(num, lang=self.language)
+                    except NotImplementedError:
+                        # Fallback to English if language not supported
+                        return num2words(num, lang='en')
+                else:
+                    # Handle decimal
+                    integer_part = int(num)
+                    decimal_part = int((num - integer_part) * 100)
+                    try:
+                        int_words = num2words(integer_part, lang=self.language)
+                        dec_words = num2words(decimal_part, lang=self.language)
+                        return f"{int_words} {decimal_word} {dec_words}"
+                    except NotImplementedError:
+                        # Fallback
+                        int_words = num2words(integer_part, lang='en')
+                        dec_words = num2words(decimal_part, lang='en')
+                        return f"{int_words} {decimal_word} {dec_words}"
+            except:
+                return num_str
+        # Match numbers with locale-specific format
+        if self.language == "tr":
+            pattern = r'\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\b|\b\d+(?:,\d+)?\b'
+        else:
+            pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b'
+        return re.sub(pattern, replace_number, text)
+    def _process_currency(self, text: str) -> str:
+        """Replace currency symbols and trailing currency codes with locale words.
+
+        Reads ``self.locale_data["currency"]``. Every key of its ``"symbols"``
+        mapping is replaced by its word padded with one space on each side.
+        Then every ``<digits><optional whitespace><code>`` occurrence, with the
+        code taken from the ``"codes"`` mapping and matched case-insensitively,
+        becomes ``<digits> <word>``.
+
+        Args:
+            text: Text to process.
+
+        Returns:
+            The text with the configured symbols and codes substituted; the
+            text is returned unchanged when the locale has no currency data.
+        """
+        currency_data = self.locale_data.get("currency", {})
+        # Symbols are plain substrings, so str.replace needs no escaping.
+        for symbol, word in currency_data.get("symbols", {}).items():
+            text = text.replace(symbol, f" {word} ")
+        for code, word in currency_data.get("codes", {}).items():
+            # Escape the code so regex metacharacters coming from locale data
+            # are matched literally instead of acting as wildcards.
+            pattern = rf'(\d+)\s*{re.escape(code)}\b'
+            # A callable replacement inserts the word verbatim; a template
+            # string would interpret backslashes in the word as escapes.
+            text = re.sub(
+                pattern,
+                lambda match, word=word: f"{match.group(1)} {word}",
+                text,
+                flags=re.IGNORECASE,
+            )
+        return text
+    def _process_time(self, text: str) -> str:
+        """Rewrite ``H:MM`` / ``HH:MM`` clock times for speech.
+
+        A time is one or two digits, a colon and two digits, with no further
+        digit directly before or after it (so ``123:456`` is left alone).
+        Behaviour depends on ``self.locale_data["time"]["format"]``
+        (default ``"word"``):
+
+        * ``"word"``: hour and minute are converted with ``num2words`` in
+          ``self.language``. A zero minute yields the hour word only; otherwise
+          both words are joined by the locale ``"separator"`` (default a single
+          space). If ``num2words`` raises ``NotImplementedError`` the digits
+          are emitted as ``"<hour> <minute>"``.
+        * any other value: the digits are emitted as ``"<hour> <minute>"``.
+
+        Args:
+            text: Text to process.
+
+        Returns:
+            The text with every matched time replaced.
+        """
+        time_settings = self.locale_data.get("time", {})
+        time_format = time_settings.get("format", "word")
+        # Looked up once; it does not change between matches.
+        separator = time_settings.get("separator", " ")
+        def replace_time(match):
+            hour, minute = match.groups()
+            if time_format != "word":
+                return f"{hour} {minute}"
+            try:
+                hour_word = num2words(int(hour), lang=self.language)
+                minute_int = int(minute)
+                if minute_int == 0:
+                    # Full hour: the minute part is not spoken.
+                    return hour_word
+                minute_word = num2words(minute_int, lang=self.language)
+            except NotImplementedError:
+                # Language unsupported by num2words: fall back to the digits.
+                return f"{hour} {minute}"
+            return f"{hour_word}{separator}{minute_word}"
+        # Digit lookarounds stop the pattern from matching a slice of a longer
+        # digit run such as "123:456".
+        pattern = r'(?<!\d)(\d{1,2}):(\d{2})(?!\d)'
+        return re.sub(pattern, replace_time, text)
+    def _process_date(self, text: str) -> str:
+        """Rewrite ISO ``YYYY-MM-DD`` dates in the locale's preferred layout.
+
+        The two-digit month string is looked up in ``self.locale_data["months"]``
+        and kept as-is when no name is found. The layout comes from
+        ``self.locale_data["date"]["format"]`` (default ``"YYYY-MM-DD"``):
+
+        * contains ``"DD MMMM YYYY"``  -> ``"<day> <month> <year>"``
+        * contains ``"MMMM DD, YYYY"`` -> ``"<month> <day>, <year>"``
+        * anything else               -> the date is left untouched
+
+        The day is written without a leading zero.
+
+        Args:
+            text: Text to process.
+
+        Returns:
+            The text with every matched ISO date reformatted.
+        """
+        month_names = self.locale_data.get("months", {})
+        layout = self.locale_data.get("date", {}).get("format", "YYYY-MM-DD")
+        def spell_out(found):
+            yyyy, mm, dd = found.groups()
+            # Fall back to the numeric month when the locale has no name for it.
+            name = month_names.get(mm, mm)
+            if "DD MMMM YYYY" in layout:
+                return f"{int(dd)} {name} {yyyy}"
+            if "MMMM DD, YYYY" in layout:
+                return f"{name} {int(dd)}, {yyyy}"
+            # Unrecognised layout: keep the ISO text exactly as written.
+            return found.group()
+        return re.sub(r'(\d{4})-(\d{2})-(\d{2})', spell_out, text)
+    def _process_codes(self, text: str) -> str:
+        """Spell out codes such as PNRs or flight numbers character by character.
+
+        A code is a whole word made of 2-5 uppercase letters ``A-Z`` directly
+        followed by 2-5 digits; its characters are separated by single spaces
+        (``"TK1234"`` -> ``"T K 1 2 3 4"``). No locale data is consulted.
+
+        Args:
+            text: Text to process.
+
+        Returns:
+            The text with every matched code spaced out.
+        """
+        code_pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
+        # Joining a string with ' ' puts one space between each character.
+        return re.sub(code_pattern, lambda found: ' '.join(found.group()), text)
+    def _process_percentage(self, text: str) -> str:
+        """Replace percent signs next to numbers with the locale's wording.
+
+        Reads ``"prefix"`` and ``"suffix"`` from
+        ``self.locale_data["percentage"]`` (both default to ``""``):
+
+        * With a non-empty prefix, ``%<digits>`` (optional whitespace after
+          the sign) becomes ``"<prefix> <digits>"``; the suffix is not used.
+        * Otherwise ``<digits>%`` (optional whitespace before the sign)
+          becomes ``"<digits> <suffix>"``. With an empty suffix this still
+          removes the sign and leaves a single space after the digits.
+
+        Args:
+            text: Text to process.
+
+        Returns:
+            The text with every matched percentage rewritten.
+        """
+        percentage = self.locale_data.get("percentage", {})
+        prefix = percentage.get("prefix", "")
+        suffix = percentage.get("suffix", "")
+        # Callable replacements insert the locale words verbatim; a template
+        # string would parse backslashes in them as escapes and could raise
+        # re.error on locale data such as r"per\cent".
+        if prefix:
+            return re.sub(r'%\s*(\d+)', lambda m: f"{prefix} {m.group(1)}", text)
+        return re.sub(r'(\d+)\s*%', lambda m: f"{m.group(1)} {suffix}", text)