feliksius committed on
Commit
1a58b56
·
verified ·
1 Parent(s): eef12d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -64
app.py CHANGED
@@ -86,63 +86,20 @@ def detect_language(text: str) -> str:
86
  return "en"
87
 
88
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
89
- """Replace protected terms with placeholders using more robust patterns."""
90
  modified_text = text
91
  replacements = {}
92
-
93
  for i, term in enumerate(protected_terms):
94
- # Create a unique placeholder
95
- placeholder = f"PROTECTEDTERM{i}PLACEHOLDER"
96
  replacements[placeholder] = term
97
-
98
- # Use multiple patterns to catch the term
99
- patterns = [
100
- # Exact match with word boundaries
101
- r'\b' + re.escape(term) + r'\b',
102
- # Case insensitive match
103
- r'(?i)\b' + re.escape(term) + r'\b',
104
- # Match with potential spaces/punctuation
105
- re.escape(term).replace(r'\ ', r'\s+'),
106
- ]
107
-
108
- for pattern in patterns:
109
- if re.search(pattern, modified_text):
110
- modified_text = re.sub(pattern, placeholder, modified_text)
111
- logger.debug(f"Protected term '{term}' replaced with '{placeholder}'")
112
- break
113
-
114
  return modified_text, replacements
115
 
116
  def restore_terms(text: str, replacements: dict) -> str:
117
- """Restore protected terms in the translated text with fuzzy matching."""
118
  restored_text = text
119
-
120
- for placeholder, original_term in replacements.items():
121
- # Direct replacement
122
- if placeholder in restored_text:
123
- restored_text = restored_text.replace(placeholder, original_term)
124
- logger.debug(f"Restored '{placeholder}' to '{original_term}'")
125
- else:
126
- # Try to find partial matches or corrupted placeholders
127
- # Sometimes translation models might alter the placeholder slightly
128
- words = restored_text.split()
129
- for i, word in enumerate(words):
130
- # Check if word contains part of our placeholder
131
- if "PROTECTEDTERM" in word and "PLACEHOLDER" in word:
132
- words[i] = original_term
133
- logger.debug(f"Fuzzy restored corrupted placeholder '{word}' to '{original_term}'")
134
- # Also check for common corruptions
135
- elif word.upper().replace(".", "").replace(",", "") == placeholder.upper():
136
- words[i] = original_term
137
- logger.debug(f"Restored corrupted '{word}' to '{original_term}'")
138
-
139
- restored_text = " ".join(words)
140
-
141
- # Clean up any remaining artifacts (dots, extra spaces)
142
- restored_text = re.sub(r'\s*\.\s*\.\s*\.\s*\.+', '', restored_text) # Remove multiple dots
143
- restored_text = re.sub(r'\s+', ' ', restored_text) # Normalize spaces
144
- restored_text = restored_text.strip()
145
-
146
  return restored_text
147
 
148
  # FastAPI endpoints
@@ -190,25 +147,13 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
190
 
191
  # Protect terms before translation
192
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
193
- logger.debug(f"Original text: '{text}'")
194
- logger.debug(f"Modified text: '{modified_text}'")
195
- logger.debug(f"Replacements: {replacements}")
196
 
197
- # Perform translation with more conservative settings
198
- result = translator(
199
- modified_text,
200
- max_length=512,
201
- num_beams=2, # Reduced from 4 to be more conservative
202
- do_sample=False,
203
- early_stopping=True,
204
- no_repeat_ngram_size=2
205
- )
206
  translated_text = result[0]["translation_text"]
207
- logger.debug(f"Raw translation: '{translated_text}'")
208
 
209
  # Restore protected terms
210
  final_text = restore_terms(translated_text, replacements)
211
- logger.debug(f"Final text after restoration: '{final_text}'")
212
 
213
  return TranslationResponse(
214
  translated_text=final_text,
@@ -309,7 +254,6 @@ def create_gradio_interface():
309
  gr.Examples(
310
  examples=[
311
  ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
312
- ["ฉันเลือกทานอาหารที่ดีต่อสุขภาพร่างกายเพื่อเป็นส่วนหนึ่งในการสนับสนุน 2030 Aspirations", "th"],
313
  ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
314
  ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
315
  ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
 
86
  return "en"
87
 
88
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
89
+ """Replace protected terms with placeholders."""
90
  modified_text = text
91
  replacements = {}
 
92
  for i, term in enumerate(protected_terms):
93
+ placeholder = f"__PROTECTED_{i}__"
 
94
  replacements[placeholder] = term
95
+ modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, modified_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  return modified_text, replacements
97
 
98
  def restore_terms(text: str, replacements: dict) -> str:
99
+ """Restore protected terms in the translated text."""
100
  restored_text = text
101
+ for placeholder, term in replacements.items():
102
+ restored_text = restored_text.replace(placeholder, term)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  return restored_text
104
 
105
  # FastAPI endpoints
 
147
 
148
  # Protect terms before translation
149
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
 
 
 
150
 
151
+ # Perform translation
152
+ result = translator(modified_text, max_length=512, num_beams=4)
 
 
 
 
 
 
 
153
  translated_text = result[0]["translation_text"]
 
154
 
155
  # Restore protected terms
156
  final_text = restore_terms(translated_text, replacements)
 
157
 
158
  return TranslationResponse(
159
  translated_text=final_text,
 
254
  gr.Examples(
255
  examples=[
256
  ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
 
257
  ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
258
  ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
259
  ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],