feliksius committed on
Commit
eef12d5
·
verified ·
1 Parent(s): 6e13740

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -8
app.py CHANGED
@@ -86,20 +86,63 @@ def detect_language(text: str) -> str:
86
  return "en"
87
 
88
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
89
- """Replace protected terms with placeholders."""
90
  modified_text = text
91
  replacements = {}
 
92
  for i, term in enumerate(protected_terms):
93
- placeholder = f"__PROTECTED_{i}__"
 
94
  replacements[placeholder] = term
95
- modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, modified_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  return modified_text, replacements
97
 
98
  def restore_terms(text: str, replacements: dict) -> str:
99
- """Restore protected terms in the translated text."""
100
  restored_text = text
101
- for placeholder, term in replacements.items():
102
- restored_text = restored_text.replace(placeholder, term)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  return restored_text
104
 
105
  # FastAPI endpoints
@@ -147,13 +190,25 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
147
 
148
  # Protect terms before translation
149
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
 
 
 
150
 
151
- # Perform translation
152
- result = translator(modified_text, max_length=512, num_beams=4)
 
 
 
 
 
 
 
153
  translated_text = result[0]["translation_text"]
 
154
 
155
  # Restore protected terms
156
  final_text = restore_terms(translated_text, replacements)
 
157
 
158
  return TranslationResponse(
159
  translated_text=final_text,
@@ -254,6 +309,7 @@ def create_gradio_interface():
254
  gr.Examples(
255
  examples=[
256
  ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
 
257
  ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
258
  ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
259
  ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
 
86
  return "en"
87
 
88
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
89
+ """Replace protected terms with placeholders using more robust patterns."""
90
  modified_text = text
91
  replacements = {}
92
+
93
  for i, term in enumerate(protected_terms):
94
+ # Create a unique placeholder
95
+ placeholder = f"PROTECTEDTERM{i}PLACEHOLDER"
96
  replacements[placeholder] = term
97
+
98
+ # Use multiple patterns to catch the term
99
+ patterns = [
100
+ # Exact match with word boundaries
101
+ r'\b' + re.escape(term) + r'\b',
102
+ # Case insensitive match
103
+ r'(?i)\b' + re.escape(term) + r'\b',
104
+ # Match with potential spaces/punctuation
105
+ re.escape(term).replace(r'\ ', r'\s+'),
106
+ ]
107
+
108
+ for pattern in patterns:
109
+ if re.search(pattern, modified_text):
110
+ modified_text = re.sub(pattern, placeholder, modified_text)
111
+ logger.debug(f"Protected term '{term}' replaced with '{placeholder}'")
112
+ break
113
+
114
  return modified_text, replacements
115
 
116
  def restore_terms(text: str, replacements: dict) -> str:
117
+ """Restore protected terms in the translated text with fuzzy matching."""
118
  restored_text = text
119
+
120
+ for placeholder, original_term in replacements.items():
121
+ # Direct replacement
122
+ if placeholder in restored_text:
123
+ restored_text = restored_text.replace(placeholder, original_term)
124
+ logger.debug(f"Restored '{placeholder}' to '{original_term}'")
125
+ else:
126
+ # Try to find partial matches or corrupted placeholders
127
+ # Sometimes translation models might alter the placeholder slightly
128
+ words = restored_text.split()
129
+ for i, word in enumerate(words):
130
+ # Check if word contains part of our placeholder
131
+ if "PROTECTEDTERM" in word and "PLACEHOLDER" in word:
132
+ words[i] = original_term
133
+ logger.debug(f"Fuzzy restored corrupted placeholder '{word}' to '{original_term}'")
134
+ # Also check for common corruptions
135
+ elif word.upper().replace(".", "").replace(",", "") == placeholder.upper():
136
+ words[i] = original_term
137
+ logger.debug(f"Restored corrupted '{word}' to '{original_term}'")
138
+
139
+ restored_text = " ".join(words)
140
+
141
+ # Clean up any remaining artifacts (dots, extra spaces)
142
+ restored_text = re.sub(r'\s*\.\s*\.\s*\.\s*\.+', '', restored_text) # Remove multiple dots
143
+ restored_text = re.sub(r'\s+', ' ', restored_text) # Normalize spaces
144
+ restored_text = restored_text.strip()
145
+
146
  return restored_text
147
 
148
  # FastAPI endpoints
 
190
 
191
  # Protect terms before translation
192
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
193
+ logger.debug(f"Original text: '{text}'")
194
+ logger.debug(f"Modified text: '{modified_text}'")
195
+ logger.debug(f"Replacements: {replacements}")
196
 
197
+ # Perform translation with more conservative settings
198
+ result = translator(
199
+ modified_text,
200
+ max_length=512,
201
+ num_beams=2, # Reduced from 4 to be more conservative
202
+ do_sample=False,
203
+ early_stopping=True,
204
+ no_repeat_ngram_size=2
205
+ )
206
  translated_text = result[0]["translation_text"]
207
+ logger.debug(f"Raw translation: '{translated_text}'")
208
 
209
  # Restore protected terms
210
  final_text = restore_terms(translated_text, replacements)
211
+ logger.debug(f"Final text after restoration: '{final_text}'")
212
 
213
  return TranslationResponse(
214
  translated_text=final_text,
 
309
  gr.Examples(
310
  examples=[
311
  ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
312
+ ["ฉันเลือกทานอาหารที่ดีต่อสุขภาพร่างกายเพื่อเป็นส่วนหนึ่งในการสนับสนุน 2030 Aspirations", "th"],
313
  ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
314
  ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
315
  ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],