feliksius committed on
Commit
97596e3
·
verified ·
1 Parent(s): 54a9930

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -36
app.py CHANGED
@@ -4,6 +4,10 @@ import langdetect
4
  import logging
5
  import os
6
  from typing import Optional
 
 
 
 
7
 
8
  # Set environment variables for Hugging Face cache
9
  os.environ["HF_HOME"] = "/app/cache"
@@ -11,16 +15,16 @@ os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
11
 
12
  app = FastAPI()
13
 
14
- # Configure logging
15
- logging.basicConfig(level=logging.INFO)
 
 
 
16
  logger = logging.getLogger(__name__)
17
 
18
  # Map of supported language models
19
  MODEL_MAP = {
20
- "id": "Helsinki-NLP/opus-mt-id-en",
21
  "th": "Helsinki-NLP/opus-mt-th-en",
22
- "fr": "Helsinki-NLP/opus-mt-fr-en",
23
- "es": "Helsinki-NLP/opus-mt-es-en",
24
  "ja": "Helsinki-NLP/opus-mt-ja-en",
25
  "zh": "Helsinki-NLP/opus-mt-zh-en",
26
  "vi": "Helsinki-NLP/opus-mt-vi-en",
@@ -29,47 +33,54 @@ MODEL_MAP = {
29
  # List of terms to protect from translation
30
  PROTECTED_TERMS = ["2030 Aspirations"]
31
 
 
32
  translators = {}
33
- try:
34
- for lang, model_name in MODEL_MAP.items():
35
- logger.info(f"Loading model for {lang} from {model_name}...")
36
- translators[lang] = pipeline("translation", model=model_name)
 
 
 
 
 
 
 
 
37
  logger.info(f"Model for {lang} loaded successfully.")
38
- except Exception as e:
39
- logger.error(f"Model initialization failed: {str(e)}")
40
- raise Exception(f"Model initialization failed: {str(e)}")
41
 
 
42
  def detect_language(text: str) -> str:
 
43
  try:
44
  detected_lang = langdetect.detect(text)
45
- logger.info(f"langdetect raw result: '{detected_lang}' for text: '{text[:50]}...'")
46
  if detected_lang.startswith('zh'):
47
- logger.info(f"Normalizing '{detected_lang}' to 'zh' for Mandarin.")
48
  return 'zh'
49
  final_lang = detected_lang if detected_lang in MODEL_MAP else "en"
50
- logger.info(f"Final determined language: '{final_lang}'. (Based on raw detected: '{detected_lang}')")
51
  return final_lang
52
  except Exception as e:
53
  logger.warning(f"Language detection FAILED for text: '{text[:50]}...'. Error: {str(e)}. Defaulting to English.")
54
  return "en"
55
 
56
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
57
- """
58
- Replace protected terms with placeholders to prevent translation.
59
- Returns the modified text and a dictionary mapping placeholders to original terms.
60
- """
61
  modified_text = text
62
  replacements = {}
63
  for i, term in enumerate(protected_terms):
64
  placeholder = f"__PROTECTED_{i}__"
65
  replacements[placeholder] = term
66
- modified_text = modified_text.replace(term, placeholder)
 
 
 
67
  return modified_text, replacements
68
 
69
  def restore_terms(text: str, replacements: dict) -> str:
70
- """
71
- Restore protected terms in the translated text using the replacements dictionary.
72
- """
73
  restored_text = text
74
  for placeholder, term in replacements.items():
75
  restored_text = restored_text.replace(placeholder, term)
@@ -84,22 +95,23 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
84
  if not text:
85
  raise HTTPException(status_code=400, detail="Text input is required.")
86
 
 
87
  try:
88
  # Determine source language
89
  if source_lang_override and source_lang_override in MODEL_MAP:
90
  source_lang = source_lang_override
91
- logger.info(f"Source language overridden by user to: '{source_lang_override}'.")
92
  else:
93
- source_lang = detect_language(text)
94
- logger.info(f"Determined source language for translation: '{source_lang}'.")
95
 
96
  # If source language is English, return original text
97
  if source_lang == "en":
98
- logger.info("Source language is English or unrecognized, returning original text.")
99
  return {"translated_text": text}
100
 
101
- # Get translator
102
- translator = translators.get(source_lang)
103
  if not translator:
104
  logger.error(f"No translator found for language: '{source_lang}'.")
105
  raise HTTPException(
@@ -109,21 +121,21 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
109
 
110
  # Protect terms before translation
111
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
112
- logger.info(f"Text after protecting terms: '{modified_text[:50]}...'")
113
 
114
- # Perform translation
115
- logger.info(f"Translating text from '{source_lang}' to English...")
116
- result = translator(modified_text)
117
  translated_text = result[0]["translation_text"]
118
- logger.info(f"Translation successful. Original: '{modified_text[:50]}...', Translated: '{translated_text[:50]}...'")
119
 
120
  # Restore protected terms
121
  final_text = restore_terms(translated_text, replacements)
122
- logger.info(f"Final translated text with restored terms: '{final_text[:50]}...'")
123
 
124
  return {"translated_text": final_text}
125
  except HTTPException as e:
126
  raise e
127
  except Exception as e:
128
  logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
129
- raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
 
4
  import logging
5
  import os
6
  from typing import Optional
7
+ import re
8
+ from functools import lru_cache
9
+ import asyncio
10
+ import logging.handlers
11
 
12
  # Set environment variables for Hugging Face cache
13
  os.environ["HF_HOME"] = "/app/cache"
 
15
 
16
  app = FastAPI()
17
 
18
+ # Configure asynchronous logging with RotatingFileHandler to reduce I/O blocking
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ handlers=[logging.handlers.RotatingFileHandler("app.log", maxBytes=1000000, backupCount=1)]
22
+ )
23
  logger = logging.getLogger(__name__)
24
 
25
  # Map of supported language models
26
  MODEL_MAP = {
 
27
  "th": "Helsinki-NLP/opus-mt-th-en",
 
 
28
  "ja": "Helsinki-NLP/opus-mt-ja-en",
29
  "zh": "Helsinki-NLP/opus-mt-zh-en",
30
  "vi": "Helsinki-NLP/opus-mt-vi-en",
 
33
  # List of terms to protect from translation
34
  PROTECTED_TERMS = ["2030 Aspirations"]
35
 
36
+ # Cache for translators to avoid reloading models unnecessarily
37
  translators = {}
38
+
39
+ def get_translator(lang: str):
40
+ """Load or retrieve cached translator for the given language."""
41
+ if lang not in translators:
42
+ logger.info(f"Loading model for {lang} from {MODEL_MAP[lang]}...")
43
+ # Optimize pipeline with max_length and num_beams for faster inference
44
+ translators[lang] = pipeline(
45
+ "translation",
46
+ model=MODEL_MAP[lang],
47
+ device=-1, # Explicitly use CPU for Hugging Face Spaces (free tier)
48
+ model_kwargs={"load_in_8bit": True} if os.getenv("USE_QUANTIZATION", "0") == "1" else {}
49
+ )
50
  logger.info(f"Model for {lang} loaded successfully.")
51
+ return translators[lang]
 
 
52
 
53
+ @lru_cache(maxsize=100)
54
  def detect_language(text: str) -> str:
55
+ """Cached language detection to reduce overhead for repeated inputs."""
56
  try:
57
  detected_lang = langdetect.detect(text)
58
+ logger.debug(f"langdetect raw result: '{detected_lang}' for text: '{text[:50]}...'")
59
  if detected_lang.startswith('zh'):
60
+ logger.debug(f"Normalizing '{detected_lang}' to 'zh' for Mandarin.")
61
  return 'zh'
62
  final_lang = detected_lang if detected_lang in MODEL_MAP else "en"
63
+ logger.debug(f"Final determined language: '{final_lang}'. (Based on raw detected: '{detected_lang}')")
64
  return final_lang
65
  except Exception as e:
66
  logger.warning(f"Language detection FAILED for text: '{text[:50]}...'. Error: {str(e)}. Defaulting to English.")
67
  return "en"
68
 
69
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
70
+ """Replace protected terms with placeholders using regex for efficiency."""
 
 
 
71
  modified_text = text
72
  replacements = {}
73
  for i, term in enumerate(protected_terms):
74
  placeholder = f"__PROTECTED_{i}__"
75
  replacements[placeholder] = term
76
+ # Use regex for case-sensitive replacement in a single pass
77
+ modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, modified_text)
78
+ if replacements:
79
+ logger.debug(f"Protected terms replaced: {replacements}")
80
  return modified_text, replacements
81
 
82
  def restore_terms(text: str, replacements: dict) -> str:
83
+ """Restore protected terms in the translated text."""
 
 
84
  restored_text = text
85
  for placeholder, term in replacements.items():
86
  restored_text = restored_text.replace(placeholder, term)
 
95
  if not text:
96
  raise HTTPException(status_code=400, detail="Text input is required.")
97
 
98
+ # [garbled in capture: "try.lytic" — the original content of this added line is not recoverable]
99
  try:
100
  # Determine source language
101
  if source_lang_override and source_lang_override in MODEL_MAP:
102
  source_lang = source_lang_override
103
+ logger.debug(f"Source language overridden by user to: '{source_lang_override}'.")
104
  else:
105
+ source_lang = await asyncio.to_thread(detect_language, text) # Run detection in a thread to avoid blocking
106
+ logger.debug(f"Determined source language for translation: '{source_lang}'.")
107
 
108
  # If source language is English, return original text
109
  if source_lang == "en":
110
+ logger.debug("Source language is English or unrecognized, returning original text.")
111
  return {"translated_text": text}
112
 
113
+ # Get translator (lazy-loaded)
114
+ translator = get_translator(source_lang)
115
  if not translator:
116
  logger.error(f"No translator found for language: '{source_lang}'.")
117
  raise HTTPException(
 
121
 
122
  # Protect terms before translation
123
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
124
+ logger.debug(f"Text after protecting terms: '{modified_text[:50]}...'")
125
 
126
+ # Perform translation in a thread to avoid blocking the event loop
127
+ logger.debug(f"Translating text from '{source_lang}' to English...")
128
+ result = await asyncio.to_thread(translator, modified_text, max_length=512, num_beams=4)
129
  translated_text = result[0]["translation_text"]
130
+ logger.debug(f"Translation successful. Original: '{modified_text[:50]}...', Translated: '{translated_text[:50]}...'")
131
 
132
  # Restore protected terms
133
  final_text = restore_terms(translated_text, replacements)
134
+ logger.debug(f"Final translated text with restored terms: '{final_text[:50]}...'")
135
 
136
  return {"translated_text": final_text}
137
  except HTTPException as e:
138
  raise e
139
  except Exception as e:
140
  logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
141
+ raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")