Spaces:

UcsTurkey
/

flare

Paused

App Files Files Community

ciyidogan committed on Jun 24

Commit

c461a97

verified ·

1 Parent(s): 3486391

Update tts_preprocessor.py

Browse files

Files changed (1) hide show

tts_preprocessor.py +87 -52

tts_preprocessor.py CHANGED Viewed

@@ -134,52 +134,77 @@ class TTSPreprocessor:
         return re.sub(pattern, replace_number, text)
     def _process_currency(self, text: str) -> str:
         """Process currency symbols and amounts based on locale"""
         currency_data = self.locale_data.get("currency", {})
-        # Replace currency symbols
-        for symbol, word in currency_data.get("symbols", {}).items():
             text = text.replace(symbol, f" {word} ")
         # Process currency codes
-        for code, word in currency_data.get("codes", {}).items():
-            pattern = rf'(\d+)\s*{code}\b'
             text = re.sub(pattern, rf'\1 {word}', text, flags=re.IGNORECASE)
         return text
-    def _process_time(self, text: str) -> str:
-        """Process time formats based on locale"""
-        time_format = self.locale_data.get("time", {}).get("format", "word")
-        def replace_time(match):
-            hour, minute = match.groups()
-            hour_int = int(hour)
-            minute_int = int(minute)
-            if time_format == "word":
-                try:
-                    hour_word = num2words(hour_int, lang=self.language)
-                    minute_word = num2words(minute_int, lang=self.language) if minute_int > 0 else ""
-                    if minute_int == 0:
-                        return hour_word
-                    else:
-                        separator = self.locale_data.get("time", {}).get("separator", " ")
-                        return f"{hour_word}{separator}{minute_word}"
-                except NotImplementedError:
-                    return f"{hour} {minute}"
-            else:
-                return f"{hour} {minute}"
-        pattern = r'(\d{1,2}):(\d{2})'
-        return re.sub(pattern, replace_time, text)
     def _process_date(self, text: str) -> str:
         """Process date formats based on locale"""
         months = self.locale_data.get("months", {})
-        date_format = self.locale_data.get("date", {}).get("format", "YYYY-MM-DD")
         # Convert ISO format dates
         def replace_date(match):
@@ -187,9 +212,11 @@ class TTSPreprocessor:
             month_name = months.get(month, month)
             # Format based on locale preference
-            if "DD MMMM YYYY" in date_format:
                 return f"{int(day)} {month_name} {year}"
-            elif "MMMM DD, YYYY" in date_format:
                 return f"{month_name} {int(day)}, {year}"
             else:
                 return match.group()
@@ -197,27 +224,35 @@ class TTSPreprocessor:
         pattern = r'(\d{4})-(\d{2})-(\d{2})'
         return re.sub(pattern, replace_date, text)
-    def _process_codes(self, text: str) -> str:
-        """Process codes like PNR, flight numbers - language agnostic"""
-        def spell_code(match):
-            code = match.group()
-            return ' '.join(code)
-        # Match uppercase letters followed by numbers
-        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
-        return re.sub(pattern, spell_code, text)
-    def _process_percentage(self, text: str) -> str:
-        """Process percentage symbols based on locale"""
-        percentage = self.locale_data.get("percentage", {})
-        prefix = percentage.get("prefix", "")
-        suffix = percentage.get("suffix", "")
-        if prefix:
-            pattern = r'%\s*(\d+)'
-            replacement = rf'{prefix} \1'
         else:
-            pattern = r'(\d+)\s*%'
-            replacement = rf'\1 {suffix}'
-        return re.sub(pattern, replacement, text)

         return re.sub(pattern, replace_number, text)
+    def _process_codes(self, text: str) -> str:
+        """Process codes like PNR, flight numbers - language agnostic"""
+        def spell_code(match):
+            code = match.group()
+            return ' '.join(code)
+        # Match uppercase letters followed by numbers
+        pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
+        return re.sub(pattern, spell_code, text)
     def _process_currency(self, text: str) -> str:
         """Process currency symbols and amounts based on locale"""
         currency_data = self.locale_data.get("currency", {})
+        if not isinstance(currency_data, dict):
+            return text
+        symbol = currency_data.get("symbol", "")
+        word = currency_data.get("word", "")
+        code = currency_data.get("code", "")
+        position = currency_data.get("position", "before")
+        if symbol and word:
+            # Replace standalone symbols
             text = text.replace(symbol, f" {word} ")
+            # Replace symbol with amount
+            if position == "before":
+                # $100 -> 100 dollar
+                pattern = rf'{re.escape(symbol)}\s*(\d+(?:[.,]\d+)?)'
+                text = re.sub(pattern, rf'\1 {word}', text)
+            else:
+                # 100₺ -> 100 lira
+                pattern = rf'(\d+(?:[.,]\d+)?)\s*{re.escape(symbol)}'
+                text = re.sub(pattern, rf'\1 {word}', text)
         # Process currency codes
+        if code and word:
+            pattern = rf'(\d+(?:[.,]\d+)?)\s*{code}\b'
             text = re.sub(pattern, rf'\1 {word}', text, flags=re.IGNORECASE)
         return text
+    def _process_percentage(self, text: str) -> str:
+        """Process percentage symbols based on locale"""
+        percentage = self.locale_data.get("percentage", {})
+        if not isinstance(percentage, dict):
+            return text
+        word = percentage.get("word", "percent")
+        position = percentage.get("position", "after")
+        if position == "before":
+            # %50 -> yüzde 50
+            pattern = r'%\s*(\d+(?:[.,]\d+)?)'
+            replacement = rf'{word} \1'
+        else:
+            # 50% -> 50 percent
+            pattern = r'(\d+(?:[.,]\d+)?)\s*%'
+            replacement = rf'\1 {word}'
+        return re.sub(pattern, replacement, text)
     def _process_date(self, text: str) -> str:
         """Process date formats based on locale"""
         months = self.locale_data.get("months", {})
+        date_format = self.locale_data.get("date_format", "YYYY-MM-DD")
+        if not isinstance(months, dict):
+            return text
         # Convert ISO format dates
         def replace_date(match):
             month_name = months.get(month, month)
             # Format based on locale preference
+            if "DD.MM.YYYY" in date_format:
+                # Turkish format with month name
                 return f"{int(day)} {month_name} {year}"
+            elif "MM/DD/YYYY" in date_format:
+                # US format with month name
                 return f"{month_name} {int(day)}, {year}"
             else:
                 return match.group()
         pattern = r'(\d{4})-(\d{2})-(\d{2})'
         return re.sub(pattern, replace_date, text)
+    def _process_time(self, text: str) -> str:
+        """Process time formats based on locale"""
+        time_data = self.locale_data.get("time", {})
+        if not isinstance(time_data, dict):
+            time_format = "word"
+            separator = " "
         else:
+            time_format = time_data.get("format", "word")
+            separator = time_data.get("separator", " ")
+        def replace_time(match):
+            hour, minute = match.groups()
+            hour_int = int(hour)
+            minute_int = int(minute)
+            if time_format == "word":
+                try:
+                    hour_word = num2words(hour_int, lang=self.language)
+                    minute_word = num2words(minute_int, lang=self.language) if minute_int > 0 else ""
+                    if minute_int == 0:
+                        return hour_word
+                    else:
+                        return f"{hour_word}{separator}{minute_word}"
+                except NotImplementedError:
+                    return f"{hour} {minute}"
+            else:
+                return f"{hour} {minute}"
+        pattern = r'(\d{1,2}):(\d{2})'
+        return re.sub(pattern, replace_time, text)