File size: 5,952 Bytes
acf8bfe
1fd0997
129257a
acf8bfe
129257a
54a9930
97596e3
 
 
 
fc58506
e8b7c49
5dc46ff
 
 
fc58506
acf8bfe
97596e3
 
 
 
 
acf8bfe
 
54a9930
1fd0997
a371d81
 
e8b7c49
a371d81
1fd0997
 
54a9930
 
 
97596e3
1fd0997
97596e3
 
 
 
 
 
 
 
 
 
 
 
e8b7c49
97596e3
fc58506
97596e3
129257a
97596e3
129257a
e8b7c49
97596e3
e8b7c49
97596e3
e8b7c49
56ffb39
97596e3
56ffb39
129257a
56ffb39
129257a
 
54a9930
97596e3
54a9930
 
 
 
 
97596e3
 
 
 
54a9930
 
 
97596e3
54a9930
 
 
 
e8b7c49
fc58506
56ffb39
e8b7c49
54a9930
 
e8b7c49
acf8bfe
e8b7c49
 
97596e3
acf8bfe
54a9930
56ffb39
 
97596e3
56ffb39
97596e3
 
e8b7c49
54a9930
129257a
97596e3
129257a
e8b7c49
97596e3
 
1fd0997
e8b7c49
 
 
 
 
 
54a9930
 
97596e3
54a9930
97596e3
 
 
e8b7c49
97596e3
54a9930
 
 
97596e3
e8b7c49
54a9930
e8b7c49
 
acf8bfe
e8b7c49
97596e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from fastapi import FastAPI, HTTPException
from transformers import pipeline
import langdetect
import logging
import os
from typing import Optional
import re
from functools import lru_cache
import asyncio
import logging.handlers

# Point the Hugging Face libraries at the /app/cache directory.
# NOTE(review): these assignments run after `transformers` has already been
# imported at the top of the file; if the library resolves its cache paths at
# import time they may have no effect -- confirm, and move them above that
# import if so.
os.environ["HF_HOME"] = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"

app = FastAPI()

# Send log records to a size-capped rotating file (maxBytes=1000000, one
# backup). The handler writes synchronously. With the level set to INFO, the
# logger.debug(...) calls made through this logger are not emitted.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.handlers.RotatingFileHandler("app.log", maxBytes=1000000, backupCount=1)]
)
logger = logging.getLogger(__name__)

# Source-language code -> Hugging Face model ID used to translate that
# language (each ID names an "<lang>-en" Helsinki-NLP OPUS-MT model).
MODEL_MAP = {
    "th": "Helsinki-NLP/opus-mt-th-en",
    "ja": "Helsinki-NLP/opus-mt-ja-en",
    "zh": "Helsinki-NLP/opus-mt-zh-en",
    "vi": "Helsinki-NLP/opus-mt-vi-en",
}

# Terms that are swapped for placeholders before translation and restored
# afterwards, so they come through unchanged.
PROTECTED_TERMS = ["2030 Aspirations"]

# Language code -> loaded translation pipeline; filled lazily so each model
# is loaded at most once per process.
translators = {}

def get_translator(lang: str):
    """Return the translation pipeline for ``lang``, loading it on first use.

    Loaded pipelines are kept in the module-level ``translators`` dict, so a
    model is only loaded the first time its language is requested.

    Args:
        lang: Source-language code, expected to be a key of ``MODEL_MAP``.

    Returns:
        The ``transformers`` translation pipeline for ``lang``, or ``None``
        if ``lang`` has no entry in ``MODEL_MAP``.
    """
    if lang in translators:
        return translators[lang]

    model_name = MODEL_MAP.get(lang)
    if model_name is None:
        # Unsupported language: signal it with a falsy result instead of
        # raising KeyError, so callers can reject the request cleanly.
        return None

    logger.info(f"Loading model for {lang} from {model_name}...")
    # 8-bit loading is opt-in via the USE_QUANTIZATION=1 environment variable.
    # NOTE(review): confirm 8-bit loading is supported together with the
    # CPU-only device setting below.
    use_8bit = os.getenv("USE_QUANTIZATION", "0") == "1"
    translators[lang] = pipeline(
        "translation",
        model=model_name,
        device=-1,  # Explicitly use CPU for Hugging Face Spaces (free tier)
        model_kwargs={"load_in_8bit": True} if use_8bit else {},
    )
    logger.info(f"Model for {lang} loaded successfully.")
    return translators[lang]

@lru_cache(maxsize=100)
def detect_language(text: str) -> str:
    """Guess the language of ``text`` and map it onto a supported code.

    Results are memoized for up to 100 distinct inputs.

    Args:
        text: The text whose language should be identified.

    Returns:
        ``'zh'`` for any detected code that starts with ``zh``; the detected
        code itself when it is a key of ``MODEL_MAP``; ``"en"`` otherwise,
        including when detection raises.
    """
    try:
        detected_lang = langdetect.detect(text)
        logger.debug(f"langdetect raw result: '{detected_lang}' for text: '{text[:50]}...'")
        if not detected_lang.startswith('zh'):
            # Codes without a dedicated model fall back to English.
            if detected_lang in MODEL_MAP:
                final_lang = detected_lang
            else:
                final_lang = "en"
            logger.debug(f"Final determined language: '{final_lang}'. (Based on raw detected: '{detected_lang}')")
            return final_lang
        # Every 'zh...' variant collapses onto the single 'zh' model key.
        logger.debug(f"Normalizing '{detected_lang}' to 'zh' for Mandarin.")
        return 'zh'
    except Exception as e:
        # Best effort: a detection failure is logged and treated as English.
        logger.warning(f"Language detection FAILED for text: '{text[:50]}...'. Error: {str(e)}. Defaulting to English.")
        return "en"

def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
    """Replace each protected term in ``text`` with a numbered placeholder.

    Matching is case-sensitive and only hits whole terms: a term is not
    replaced when it is glued to an ASCII letter, digit or underscore.

    Args:
        text: The text about to be translated.
        protected_terms: Terms that must come through translation unchanged.

    Returns:
        A ``(modified_text, replacements)`` tuple. ``replacements`` maps every
        placeholder (``__PROTECTED_<index>__``) to its original term, whether
        or not that term occurred in ``text``.
    """
    modified_text = text
    replacements = {}
    for i, term in enumerate(protected_terms):
        placeholder = f"__PROTECTED_{i}__"
        replacements[placeholder] = term
        # re.ASCII makes \b treat only ASCII letters/digits/underscore as word
        # characters. With the default Unicode rules, CJK and Thai letters
        # count as word characters too, so a term written directly next to
        # such text (no spaces) would never match and would not be protected.
        modified_text = re.sub(
            r'\b' + re.escape(term) + r'\b',
            placeholder,
            modified_text,
            flags=re.ASCII,
        )
    if replacements:
        logger.debug(f"Protected terms replaced: {replacements}")
    return modified_text, replacements

def restore_terms(text: str, replacements: dict) -> str:
    """Swap placeholders in ``text`` back to their original terms.

    Args:
        text: Translated text that may contain placeholders.
        replacements: Mapping of placeholder -> original term.

    Returns:
        ``text`` with every occurrence of each placeholder replaced, applied
        one placeholder at a time in the mapping's iteration order.
    """
    for token in replacements:
        text = text.replace(token, replacements[token])
    return text

@app.post("/translate")
async def translate(text: str, source_lang_override: Optional[str] = None):
    """
    Translate text to English, preserving protected terms like '2030 Aspirations'.
    Automatically detects source language or uses override.
    """
    if not text:
        raise HTTPException(status_code=400, detail="Text input is required.")

    try.lytic
    try:
        # Determine source language
        if source_lang_override and source_lang_override in MODEL_MAP:
            source_lang = source_lang_override
            logger.debug(f"Source language overridden by user to: '{source_lang_override}'.")
        else:
            source_lang = await asyncio.to_thread(detect_language, text)  # Run detection in a thread to avoid blocking
            logger.debug(f"Determined source language for translation: '{source_lang}'.")

        # If source language is English, return original text
        if source_lang == "en":
            logger.debug("Source language is English or unrecognized, returning original text.")
            return {"translated_text": text}

        # Get translator (lazy-loaded)
        translator = get_translator(source_lang)
        if not translator:
            logger.error(f"No translator found for language: '{source_lang}'.")
            raise HTTPException(
                status_code=400,
                detail=f"Translation not supported for language: {source_lang}."
            )

        # Protect terms before translation
        modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
        logger.debug(f"Text after protecting terms: '{modified_text[:50]}...'")

        # Perform translation in a thread to avoid blocking the event loop
        logger.debug(f"Translating text from '{source_lang}' to English...")
        result = await asyncio.to_thread(translator, modified_text, max_length=512, num_beams=4)
        translated_text = result[0]["translation_text"]
        logger.debug(f"Translation successful. Original: '{modified_text[:50]}...', Translated: '{translated_text[:50]}...'")

        # Restore protected terms
        final_text = restore_terms(translated_text, replacements)
        logger.debug(f"Final translated text with restored terms: '{final_text[:50]}...'")

        return {"translated_text": final_text}
    except HTTPException as e:
        raise e
    except Exception as e:
        logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
        raise HTTPException(statusinvestigate further