Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -86,20 +86,63 @@ def detect_language(text: str) -> str:
|
|
86 |
return "en"
|
87 |
|
88 |
def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
|
89 |
-
"""Replace protected terms with placeholders."""
|
90 |
modified_text = text
|
91 |
replacements = {}
|
|
|
92 |
for i, term in enumerate(protected_terms):
|
93 |
-
|
|
|
94 |
replacements[placeholder] = term
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
return modified_text, replacements
|
97 |
|
98 |
def restore_terms(text: str, replacements: dict) -> str:
|
99 |
-
"""Restore protected terms in the translated text."""
|
100 |
restored_text = text
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
return restored_text
|
104 |
|
105 |
# FastAPI endpoints
|
@@ -147,13 +190,25 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
|
|
147 |
|
148 |
# Protect terms before translation
|
149 |
modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
|
|
|
|
|
|
|
150 |
|
151 |
-
# Perform translation
|
152 |
-
result = translator(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
translated_text = result[0]["translation_text"]
|
|
|
154 |
|
155 |
# Restore protected terms
|
156 |
final_text = restore_terms(translated_text, replacements)
|
|
|
157 |
|
158 |
return TranslationResponse(
|
159 |
translated_text=final_text,
|
@@ -254,6 +309,7 @@ def create_gradio_interface():
|
|
254 |
gr.Examples(
|
255 |
examples=[
|
256 |
["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
|
|
|
257 |
["こんにちは、はじめまして。Griffith大学での研究が進んで��ます。", "ja"],
|
258 |
["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
|
259 |
["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
|
|
|
86 |
return "en"
|
87 |
|
88 |
def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
    """Replace protected terms with placeholders using more robust patterns.

    Each term is swapped for a token of the form ``PROTECTEDTERM<i>PLACEHOLDER``
    (``i`` is the term's index in ``protected_terms``) so that a translation
    step can carry it through untouched.

    Args:
        text: The source text to scan.
        protected_terms: Terms that must survive translation verbatim.

    Returns:
        A ``(modified_text, replacements)`` tuple. ``replacements`` maps each
        placeholder that was actually inserted to its original term; terms
        that do not occur in ``text`` are not listed, so a later restore step
        never has to guess about placeholders that were never emitted.
    """
    modified_text = text
    replacements = {}

    for i, term in enumerate(protected_terms):
        # An empty term would compile to a pattern that matches at every
        # word boundary and flood the text with placeholders.
        if not term:
            continue

        # Create a unique placeholder
        placeholder = f"PROTECTEDTERM{i}PLACEHOLDER"
        escaped_term = re.escape(term)

        # Patterns are tried from strictest to loosest; the first one that
        # matches wins and the remaining ones are skipped.
        patterns = [
            # Exact match with word boundaries
            r'\b' + escaped_term + r'\b',
            # Case insensitive match
            r'(?i)\b' + escaped_term + r'\b',
            # No word boundaries and flexible whitespace; this is the pattern
            # that catches terms glued to CJK/Thai characters, where \b does
            # not fire because both neighbours are word characters.
            escaped_term.replace(r'\ ', r'\s+'),
        ]

        for pattern in patterns:
            modified_text, hit_count = re.subn(pattern, placeholder, modified_text)
            if hit_count:
                # Record the mapping only when the placeholder is really in
                # the text.
                replacements[placeholder] = term
                logger.debug("Protected term '%s' replaced with '%s'", term, placeholder)
                break

    return modified_text, replacements
|
115 |
|
116 |
def restore_terms(text: str, replacements: dict) -> str:
    """Restore protected terms in the translated text with fuzzy matching.

    Restoration happens in two passes:

    1. Every placeholder in ``replacements`` is replaced by its term,
       ignoring letter case (translation output may re-case the token).
    2. Remaining tokens that still look like
       ``PROTECTEDTERM<index>PLACEHOLDER`` but have whitespace, dots or commas
       injected around the index are repaired. The index inside the token
       selects the term, so a damaged placeholder is never restored to a
       different term. A token whose index was lost is restored only when
       exactly one placeholder is still unaccounted for; otherwise it is left
       as-is rather than guessed.

    Only the placeholder itself is replaced, so adjacent punctuation is kept.
    Finally, runs of four or more dots are removed and whitespace is
    normalised.

    Args:
        text: Translated text that may contain placeholders.
        replacements: Mapping of placeholder -> original term.

    Returns:
        The text with protected terms restored and artifacts cleaned up.
    """
    restored_text = text
    terms_by_key = {}      # upper-cased placeholder -> term, for indexed lookup
    unmatched_terms = []   # terms whose placeholder was not found verbatim

    for placeholder, original_term in replacements.items():
        # An empty placeholder would match between every character.
        if not placeholder:
            continue

        terms_by_key[placeholder.upper()] = original_term
        # A callable replacement keeps backslashes in the term literal.
        restored_text, hit_count = re.subn(
            re.escape(placeholder),
            lambda _match, term=original_term: term,
            restored_text,
            flags=re.IGNORECASE,
        )
        if hit_count:
            logger.debug("Restored '%s' to '%s'", placeholder, original_term)
        else:
            unmatched_terms.append(original_term)

    def _repair(match):
        """Map a damaged placeholder token back to its term, if unambiguous."""
        index = match.group(1)
        key = f"PROTECTEDTERM{index}PLACEHOLDER"
        if key in terms_by_key:
            term = terms_by_key[key]
        elif not index and len(unmatched_terms) == 1:
            # The index was lost, but only one placeholder is missing.
            term = unmatched_terms[0]
        else:
            return match.group(0)
        logger.debug(
            "Fuzzy restored corrupted placeholder '%s' to '%s'", match.group(0), term
        )
        return term

    # Sometimes translation models alter the placeholder slightly.
    restored_text = re.sub(
        r'PROTECTEDTERM[\s.,]*(\d*)[\s.,]*PLACEHOLDER',
        _repair,
        restored_text,
        flags=re.IGNORECASE,
    )

    # Clean up any remaining artifacts (dots, extra spaces)
    restored_text = re.sub(r'\s*\.\s*\.\s*\.\s*\.+', '', restored_text)  # Remove multiple dots
    restored_text = re.sub(r'\s+', ' ', restored_text)  # Normalize spaces
    return restored_text.strip()
|
147 |
|
148 |
# FastAPI endpoints
|
|
|
190 |
|
191 |
# Protect terms before translation
|
192 |
modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
|
193 |
+
logger.debug(f"Original text: '{text}'")
|
194 |
+
logger.debug(f"Modified text: '{modified_text}'")
|
195 |
+
logger.debug(f"Replacements: {replacements}")
|
196 |
|
197 |
+
# Perform translation with more conservative settings
|
198 |
+
result = translator(
|
199 |
+
modified_text,
|
200 |
+
max_length=512,
|
201 |
+
num_beams=2, # Reduced from 4 to be more conservative
|
202 |
+
do_sample=False,
|
203 |
+
early_stopping=True,
|
204 |
+
no_repeat_ngram_size=2
|
205 |
+
)
|
206 |
translated_text = result[0]["translation_text"]
|
207 |
+
logger.debug(f"Raw translation: '{translated_text}'")
|
208 |
|
209 |
# Restore protected terms
|
210 |
final_text = restore_terms(translated_text, replacements)
|
211 |
+
logger.debug(f"Final text after restoration: '{final_text}'")
|
212 |
|
213 |
return TranslationResponse(
|
214 |
translated_text=final_text,
|
|
|
309 |
gr.Examples(
|
310 |
examples=[
|
311 |
["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
|
312 |
+
["ฉันเลือกทานอาหารที่ดีต่อสุขภาพร่างกายเพื่อเป็นส่วนหนึ่งในการสนับสนุน 2030 Aspirations", "th"],
|
313 |
["こんにちは、はじめまして。Griffith大学での研究が進んで��ます。", "ja"],
|
314 |
["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
|
315 |
["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
|