Spaces:

feliksius
/

Translator

Runtime error

App Files Files Community

feliksius committed 25 days ago

Commit

eef12d5

verified ·

1 Parent(s): 6e13740

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -8

app.py CHANGED Viewed

@@ -86,20 +86,63 @@ def detect_language(text: str) -> str:
         return "en"
 def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
-    """Replace protected terms with placeholders."""
     modified_text = text
     replacements = {}
     for i, term in enumerate(protected_terms):
-        placeholder = f"__PROTECTED_{i}__"
         replacements[placeholder] = term
-        modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, modified_text, flags=re.IGNORECASE)
     return modified_text, replacements
 def restore_terms(text: str, replacements: dict) -> str:
-    """Restore protected terms in the translated text."""
     restored_text = text
-    for placeholder, term in replacements.items():
-        restored_text = restored_text.replace(placeholder, term)
     return restored_text
 # FastAPI endpoints
@@ -147,13 +190,25 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
         # Protect terms before translation
         modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
-        # Perform translation
-        result = translator(modified_text, max_length=512, num_beams=4)
         translated_text = result[0]["translation_text"]
         # Restore protected terms
         final_text = restore_terms(translated_text, replacements)
         return TranslationResponse(
             translated_text=final_text,
@@ -254,6 +309,7 @@ def create_gradio_interface():
             gr.Examples(
                 examples=[
                     ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
                     ["こんにちは、はじめまして。Griffith大学での研究が進んで��ます。", "ja"],
                     ["你好，很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
                     ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],

         return "en"
 def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
     """Replace every occurrence of each protected term with a placeholder.

     A term is matched case-insensitively, with any run of whitespace
     accepted wherever the term contains whitespace. A match must not be
     directly preceded or followed by an ASCII letter, digit or underscore,
     so a term is never replaced inside a longer Latin word, yet it is
     still found when it touches CJK/Thai characters or punctuation
     (Python's ``\\b`` treats those scripts as word characters and would
     miss e.g. ``Griffith大学``).

     Args:
         text: The text about to be sent to the translator.
         protected_terms: Terms that must survive translation unchanged.
             The list index of a term determines its placeholder name.

     Returns:
         A ``(modified_text, replacements)`` tuple. ``replacements`` maps
         each placeholder that was actually inserted into the text to its
         original term; terms that do not occur in ``text`` are omitted.
     """
     modified_text = text
     replacements = {}
     for i, term in enumerate(protected_terms):
         parts = term.split() if term else []
         if not parts:
             # An empty/blank term would match at every position.
             continue
         # Create a unique placeholder
         placeholder = f"PROTECTEDTERM{i}PLACEHOLDER"
         # One pattern covers all variants at once, so a mix of spellings
         # in the same text (different case, different spacing, adjacent to
         # non-Latin script) is protected everywhere, not just for the
         # first variant that happens to match.
         pattern = (
             r"(?<![A-Za-z0-9_])"
             + r"\s+".join(re.escape(part) for part in parts)
             + r"(?![A-Za-z0-9_])"
         )
         modified_text, hits = re.subn(
             pattern, placeholder, modified_text, flags=re.IGNORECASE
         )
         if hits:
             # Only record placeholders that really are in the text, so the
             # restore step never has to guess about absent ones.
             replacements[placeholder] = term
             logger.debug(f"Protected term '{term}' replaced with '{placeholder}'")
     return modified_text, replacements
 def restore_terms(text: str, replacements: dict) -> str:
     """Put protected terms back into translated text.

     Restoration happens in two steps:

     1. Every placeholder that appears verbatim is replaced by its term.
     2. If some placeholders were not found verbatim, the text is scanned
        for corrupted placeholders (changed case, or whitespace/punctuation
        inserted between the ``PROTECTED``/``TERM``/index/``PLACEHOLDER``
        pieces). The index embedded in the corrupted placeholder selects
        the term, so one term is never restored in place of another. If
        the index was lost, the term is restored only when exactly one
        placeholder is unaccounted for; otherwise the text is left as is.

     Only the placeholder itself is replaced, so punctuation attached to
     it is preserved.

     Args:
         text: Translator output, possibly containing placeholders.
         replacements: Mapping of placeholder -> original term.

     Returns:
         The text with terms restored, runs of four or more dots removed,
         whitespace collapsed to single spaces, and the ends stripped.
     """
     restored_text = text
     unresolved = []
     for placeholder, original_term in replacements.items():
         # Direct replacement
         if placeholder in restored_text:
             restored_text = restored_text.replace(placeholder, original_term)
             logger.debug(f"Restored '{placeholder}' to '{original_term}'")
         else:
             unresolved.append(placeholder)

     if unresolved:
         def _resolve(match):
             """Map one corrupted placeholder to its term, or keep it unchanged."""
             digits = match.group(1)
             key = f"PROTECTEDTERM{int(digits)}PLACEHOLDER" if digits else None
             if key in replacements:
                 original_term = replacements[key]
             elif len(unresolved) == 1:
                 # Index unusable, but only one term is missing: unambiguous.
                 original_term = replacements[unresolved[0]]
             else:
                 # Ambiguous: leaving the artifact is safer than a wrong term.
                 return match.group(0)
             logger.debug(
                 f"Fuzzy restored corrupted placeholder '{match.group(0)}' to '{original_term}'"
             )
             return original_term

         # A function replacement also keeps backslashes in terms literal.
         restored_text = re.sub(
             r"PROTECTED[\W_]*TERM[\W_]*(\d*)[\W_]*PLACEHOLDER",
             _resolve,
             restored_text,
             flags=re.IGNORECASE,
         )

     # Clean up any remaining artifacts (dots, extra spaces)
     restored_text = re.sub(r'\s*\.\s*\.\s*\.\s*\.+', '', restored_text)  # Remove multiple dots
     restored_text = re.sub(r'\s+', ' ', restored_text)  # Normalize spaces
     restored_text = restored_text.strip()
     return restored_text
 # FastAPI endpoints
         # Protect terms before translation
         modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
+        logger.debug(f"Original text: '{text}'")
+        logger.debug(f"Modified text: '{modified_text}'")
+        logger.debug(f"Replacements: {replacements}")
+        # Perform translation with more conservative settings
+        result = translator(
+            modified_text,
+            max_length=512,
+            num_beams=2,  # Reduced from 4 to be more conservative
+            do_sample=False,
+            early_stopping=True,
+            no_repeat_ngram_size=2
+        )
         translated_text = result[0]["translation_text"]
+        logger.debug(f"Raw translation: '{translated_text}'")
         # Restore protected terms
         final_text = restore_terms(translated_text, replacements)
+        logger.debug(f"Final text after restoration: '{final_text}'")
         return TranslationResponse(
             translated_text=final_text,
             gr.Examples(
                 examples=[
                     ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
+                    ["ฉันเลือกทานอาหารที่ดีต่อสุขภาพร่างกายเพื่อเป็นส่วนหนึ่งในการสนับสนุน 2030 Aspirations", "th"],
                     ["こんにちは、はじめまして。Griffith大学での研究が進んで��ます。", "ja"],
                     ["你好，很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
                     ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],