ciyidogan committed on
Commit
17a90d6
·
verified ·
1 Parent(s): 81e4201

Update tts_preprocessor.py

Browse files
Files changed (1) hide show
  1. tts_preprocessor.py +137 -1
tts_preprocessor.py CHANGED
@@ -84,4 +84,140 @@ class TTSPreprocessor:
84
  def replace_number(match):
85
  num_str = match.group()
86
 
87
- # Normalize number format
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def replace_number(match):
85
  num_str = match.group()
86
 
87
+ # Normalize number format
88
+ if self.language == "tr":
89
+ # Turkish: 1.234,56 -> 1234.56
90
+ num_str = num_str.replace('.', '').replace(',', '.')
91
+ else:
92
+ # English: 1,234.56 -> 1234.56
93
+ num_str = num_str.replace(',', '')
94
+
95
+ try:
96
+ num = float(num_str)
97
+ if num.is_integer():
98
+ num = int(num)
99
+
100
+ # Keep small numbers as is based on threshold
101
+ if isinstance(num, int) and 0 <= num <= threshold:
102
+ return str(num)
103
+
104
+ # Convert large numbers to words
105
+ if isinstance(num, int):
106
+ try:
107
+ return num2words(num, lang=self.language)
108
+ except NotImplementedError:
109
+ # Fallback to English if language not supported
110
+ return num2words(num, lang='en')
111
+ else:
112
+ # Handle decimal
113
+ integer_part = int(num)
114
+ decimal_part = int((num - integer_part) * 100)
115
+
116
+ try:
117
+ int_words = num2words(integer_part, lang=self.language)
118
+ dec_words = num2words(decimal_part, lang=self.language)
119
+ return f"{int_words} {decimal_word} {dec_words}"
120
+ except NotImplementedError:
121
+ # Fallback
122
+ int_words = num2words(integer_part, lang='en')
123
+ dec_words = num2words(decimal_part, lang='en')
124
+ return f"{int_words} {decimal_word} {dec_words}"
125
+
126
+ except:
127
+ return num_str
128
+
129
+ # Match numbers with locale-specific format
130
+ if self.language == "tr":
131
+ pattern = r'\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\b|\b\d+(?:,\d+)?\b'
132
+ else:
133
+ pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b'
134
+
135
+ return re.sub(pattern, replace_number, text)
136
+
137
+ def _process_currency(self, text: str) -> str:
138
+ """Process currency symbols and amounts based on locale"""
139
+ currency_data = self.locale_data.get("currency", {})
140
+
141
+ # Replace currency symbols
142
+ for symbol, word in currency_data.get("symbols", {}).items():
143
+ text = text.replace(symbol, f" {word} ")
144
+
145
+ # Process currency codes
146
+ for code, word in currency_data.get("codes", {}).items():
147
+ pattern = rf'(\d+)\s*{code}\b'
148
+ text = re.sub(pattern, rf'\1 {word}', text, flags=re.IGNORECASE)
149
+
150
+ return text
151
+
152
+ def _process_time(self, text: str) -> str:
153
+ """Process time formats based on locale"""
154
+ time_format = self.locale_data.get("time", {}).get("format", "word")
155
+
156
+ def replace_time(match):
157
+ hour, minute = match.groups()
158
+ hour_int = int(hour)
159
+ minute_int = int(minute)
160
+
161
+ if time_format == "word":
162
+ try:
163
+ hour_word = num2words(hour_int, lang=self.language)
164
+ minute_word = num2words(minute_int, lang=self.language) if minute_int > 0 else ""
165
+
166
+ if minute_int == 0:
167
+ return hour_word
168
+ else:
169
+ separator = self.locale_data.get("time", {}).get("separator", " ")
170
+ return f"{hour_word}{separator}{minute_word}"
171
+ except NotImplementedError:
172
+ return f"{hour} {minute}"
173
+ else:
174
+ return f"{hour} {minute}"
175
+
176
+ pattern = r'(\d{1,2}):(\d{2})'
177
+ return re.sub(pattern, replace_time, text)
178
+
179
+ def _process_date(self, text: str) -> str:
180
+ """Process date formats based on locale"""
181
+ months = self.locale_data.get("months", {})
182
+ date_format = self.locale_data.get("date", {}).get("format", "YYYY-MM-DD")
183
+
184
+ # Convert ISO format dates
185
+ def replace_date(match):
186
+ year, month, day = match.groups()
187
+ month_name = months.get(month, month)
188
+
189
+ # Format based on locale preference
190
+ if "DD MMMM YYYY" in date_format:
191
+ return f"{int(day)} {month_name} {year}"
192
+ elif "MMMM DD, YYYY" in date_format:
193
+ return f"{month_name} {int(day)}, {year}"
194
+ else:
195
+ return match.group()
196
+
197
+ pattern = r'(\d{4})-(\d{2})-(\d{2})'
198
+ return re.sub(pattern, replace_date, text)
199
+
200
+ def _process_codes(self, text: str) -> str:
201
+ """Process codes like PNR, flight numbers - language agnostic"""
202
+ def spell_code(match):
203
+ code = match.group()
204
+ return ' '.join(code)
205
+
206
+ # Match uppercase letters followed by numbers
207
+ pattern = r'\b[A-Z]{2,5}\d{2,5}\b'
208
+ return re.sub(pattern, spell_code, text)
209
+
210
+ def _process_percentage(self, text: str) -> str:
211
+ """Process percentage symbols based on locale"""
212
+ percentage = self.locale_data.get("percentage", {})
213
+ prefix = percentage.get("prefix", "")
214
+ suffix = percentage.get("suffix", "")
215
+
216
+ if prefix:
217
+ pattern = r'%\s*(\d+)'
218
+ replacement = rf'{prefix} \1'
219
+ else:
220
+ pattern = r'(\d+)\s*%'
221
+ replacement = rf'\1 {suffix}'
222
+
223
+ return re.sub(pattern, replacement, text)