feliksius committed on
Commit
54a9930
·
verified ·
1 Parent(s): 8559748

Update app.py

Browse files

Update protected Terms

Files changed (1) hide show
  1. app.py +44 -32
app.py CHANGED
@@ -3,62 +3,49 @@ from transformers import pipeline
3
  import langdetect
4
  import logging
5
  import os
6
- from typing import Optional # Import Optional untuk parameter di FastAPI
7
 
8
  # Set environment variables for Hugging Face cache
9
- # Ini penting agar model di-cache di lokasi yang benar di dalam container Hugging Face Space
10
  os.environ["HF_HOME"] = "/app/cache"
11
  os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
12
 
13
  app = FastAPI()
14
 
15
- # Konfigurasi logging untuk melihat pesan debug di log Space kamu
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
- # Peta model untuk setiap bahasa yang didukung
20
  MODEL_MAP = {
21
  "id": "Helsinki-NLP/opus-mt-id-en",
22
  "th": "Helsinki-NLP/opus-mt-th-en",
23
  "fr": "Helsinki-NLP/opus-mt-fr-en",
24
  "es": "Helsinki-NLP/opus-mt-es-en",
25
  "ja": "Helsinki-NLP/opus-mt-ja-en",
26
- # Entri tunggal untuk Mandarin, kita akan normalisasi deteksi bahasanya
27
  "zh": "Helsinki-NLP/opus-mt-zh-en",
28
  "vi": "Helsinki-NLP/opus-mt-vi-en",
29
  }
30
 
 
 
 
31
  translators = {}
32
  try:
33
- # Inisialisasi semua model saat aplikasi dimulai
34
  for lang, model_name in MODEL_MAP.items():
35
  logger.info(f"Loading model for {lang} from {model_name}...")
36
- # Pastikan kita menggunakan device="cpu" atau "cuda" jika GPU tersedia
37
- # Untuk Hugging Face Space gratis biasanya CPU, jadi lebih aman tidak specify device
38
  translators[lang] = pipeline("translation", model=model_name)
39
  logger.info(f"Model for {lang} loaded successfully.")
40
  except Exception as e:
41
- # Tangani kegagalan inisialisasi model
42
  logger.error(f"Model initialization failed: {str(e)}")
43
- # Hentikan aplikasi jika model gagal dimuat, karena aplikasi tidak akan berfungsi
44
  raise Exception(f"Model initialization failed: {str(e)}")
45
 
46
-
47
  def detect_language(text: str) -> str:
48
  try:
49
  detected_lang = langdetect.detect(text)
50
- # Log ini SANGAT PENTING untuk debugging! Ini menunjukkan hasil mentah dari langdetect.
51
  logger.info(f"langdetect raw result: '{detected_lang}' for text: '{text[:50]}...'")
52
-
53
- # Normalisasi untuk bahasa Mandarin:
54
- # Jika langdetect mengembalikan 'zh-cn', 'zh-tw', 'zh-hk', dll.,
55
- # kita paksa menjadi 'zh' agar sesuai dengan kunci di MODEL_MAP.
56
  if detected_lang.startswith('zh'):
57
  logger.info(f"Normalizing '{detected_lang}' to 'zh' for Mandarin.")
58
  return 'zh'
59
-
60
- # Jika bahasa terdeteksi ada di MODEL_MAP, gunakan itu.
61
- # Jika tidak, default ke 'en' (bahasa Inggris).
62
  final_lang = detected_lang if detected_lang in MODEL_MAP else "en"
63
  logger.info(f"Final determined language: '{final_lang}'. (Based on raw detected: '{detected_lang}')")
64
  return final_lang
@@ -66,18 +53,39 @@ def detect_language(text: str) -> str:
66
  logger.warning(f"Language detection FAILED for text: '{text[:50]}...'. Error: {str(e)}. Defaulting to English.")
67
  return "en"
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  @app.post("/translate")
71
  async def translate(text: str, source_lang_override: Optional[str] = None):
72
  """
73
- Menerima teks dan mengembalikannya dalam bahasa Inggris.
74
- Secara otomatis mendeteksi bahasa sumber, atau bisa di-override oleh pengguna.
75
  """
76
  if not text:
77
  raise HTTPException(status_code=400, detail="Text input is required.")
78
 
79
  try:
80
- # Tentukan bahasa sumber: gunakan override jika diberikan dan valid, kalau tidak, deteksi otomatis.
81
  if source_lang_override and source_lang_override in MODEL_MAP:
82
  source_lang = source_lang_override
83
  logger.info(f"Source language overridden by user to: '{source_lang_override}'.")
@@ -85,15 +93,13 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
85
  source_lang = detect_language(text)
86
  logger.info(f"Determined source language for translation: '{source_lang}'.")
87
 
88
- # Jika bahasa sumber sudah Bahasa Inggris, kembalikan teks aslinya
89
  if source_lang == "en":
90
  logger.info("Source language is English or unrecognized, returning original text.")
91
  return {"translated_text": text}
92
 
93
- # Dapatkan translator yang sesuai dari kamus translators
94
  translator = translators.get(source_lang)
95
-
96
- # Jika tidak ada translator yang mendukung bahasa yang terdeteksi/di-override
97
  if not translator:
98
  logger.error(f"No translator found for language: '{source_lang}'.")
99
  raise HTTPException(
@@ -101,17 +107,23 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
101
  detail=f"Translation not supported for language: {source_lang}."
102
  )
103
 
104
- # Lakukan terjemahan
 
 
 
 
105
  logger.info(f"Translating text from '{source_lang}' to English...")
106
- result = translator(text)
107
  translated_text = result[0]["translation_text"]
108
- logger.info(f"Translation successful. Original: '{text[:50]}...', Translated: '{translated_text[:50]}...'")
 
 
 
 
109
 
110
- return {"translated_text": translated_text}
111
  except HTTPException as e:
112
- # Tangani HTTPExceptions yang sudah kita definisikan sebelumnya
113
  raise e
114
  except Exception as e:
115
- # Tangani error tak terduga lainnya
116
  logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
117
  raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
 
3
  import langdetect
4
  import logging
5
  import os
6
+ from typing import Optional
7
 
8
  # Set environment variables for Hugging Face cache
 
9
  os.environ["HF_HOME"] = "/app/cache"
10
  os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
11
 
12
  app = FastAPI()
13
 
14
+ # Configure logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
+ # Map of supported language models
19
  MODEL_MAP = {
20
  "id": "Helsinki-NLP/opus-mt-id-en",
21
  "th": "Helsinki-NLP/opus-mt-th-en",
22
  "fr": "Helsinki-NLP/opus-mt-fr-en",
23
  "es": "Helsinki-NLP/opus-mt-es-en",
24
  "ja": "Helsinki-NLP/opus-mt-ja-en",
 
25
  "zh": "Helsinki-NLP/opus-mt-zh-en",
26
  "vi": "Helsinki-NLP/opus-mt-vi-en",
27
  }
28
 
29
+ # List of terms to protect from translation
30
+ PROTECTED_TERMS = ["2030 Aspirations"]
31
+
32
  translators = {}
33
  try:
 
34
  for lang, model_name in MODEL_MAP.items():
35
  logger.info(f"Loading model for {lang} from {model_name}...")
 
 
36
  translators[lang] = pipeline("translation", model=model_name)
37
  logger.info(f"Model for {lang} loaded successfully.")
38
  except Exception as e:
 
39
  logger.error(f"Model initialization failed: {str(e)}")
 
40
  raise Exception(f"Model initialization failed: {str(e)}")
41
 
 
42
  def detect_language(text: str) -> str:
43
  try:
44
  detected_lang = langdetect.detect(text)
 
45
  logger.info(f"langdetect raw result: '{detected_lang}' for text: '{text[:50]}...'")
 
 
 
 
46
  if detected_lang.startswith('zh'):
47
  logger.info(f"Normalizing '{detected_lang}' to 'zh' for Mandarin.")
48
  return 'zh'
 
 
 
49
  final_lang = detected_lang if detected_lang in MODEL_MAP else "en"
50
  logger.info(f"Final determined language: '{final_lang}'. (Based on raw detected: '{detected_lang}')")
51
  return final_lang
 
53
  logger.warning(f"Language detection FAILED for text: '{text[:50]}...'. Error: {str(e)}. Defaulting to English.")
54
  return "en"
55
 
56
def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
    """
    Replace protected terms with placeholders to prevent translation.

    Args:
        text: The source text that is about to be handed to the translator.
        protected_terms: Terms that must appear verbatim in the final output.

    Returns:
        A ``(modified_text, replacements)`` tuple. ``replacements`` maps each
        placeholder (``__PROTECTED_<index>__``, where ``<index>`` is the
        term's position in ``protected_terms``) to its original term.
    """
    modified_text = text
    replacements = {}
    # Substitute longer terms first so that a term which is a substring of
    # another one (e.g. "2030" vs. "2030 Aspirations") cannot break the
    # longer term apart before it gets its own placeholder.
    ordered_terms = sorted(
        enumerate(protected_terms),
        key=lambda pair: len(pair[1]),
        reverse=True,
    )
    for i, term in ordered_terms:
        if not term:
            # str.replace("", x) would insert x between every character.
            continue
        placeholder = f"__PROTECTED_{i}__"
        replacements[placeholder] = term
        modified_text = modified_text.replace(term, placeholder)
    return modified_text, replacements
68
+
69
def restore_terms(text: str, replacements: dict) -> str:
    """
    Put the protected terms back into translated text.

    Args:
        text: Translated text that may still contain placeholders.
        replacements: Mapping of placeholder -> original term, as returned by
            ``protect_terms``.

    Returns:
        The text with every occurrence of each placeholder swapped for its
        original term.
    """
    # Nothing was protected, so there is nothing to restore.
    if not replacements:
        return text

    output = text
    for token in replacements:
        output = output.replace(token, replacements[token])
    return output
77
 
78
  @app.post("/translate")
79
  async def translate(text: str, source_lang_override: Optional[str] = None):
80
  """
81
+ Translate text to English, preserving protected terms like '2030 Aspirations'.
82
+ Automatically detects source language or uses override.
83
  """
84
  if not text:
85
  raise HTTPException(status_code=400, detail="Text input is required.")
86
 
87
  try:
88
+ # Determine source language
89
  if source_lang_override and source_lang_override in MODEL_MAP:
90
  source_lang = source_lang_override
91
  logger.info(f"Source language overridden by user to: '{source_lang_override}'.")
 
93
  source_lang = detect_language(text)
94
  logger.info(f"Determined source language for translation: '{source_lang}'.")
95
 
96
+ # If source language is English, return original text
97
  if source_lang == "en":
98
  logger.info("Source language is English or unrecognized, returning original text.")
99
  return {"translated_text": text}
100
 
101
+ # Get translator
102
  translator = translators.get(source_lang)
 
 
103
  if not translator:
104
  logger.error(f"No translator found for language: '{source_lang}'.")
105
  raise HTTPException(
 
107
  detail=f"Translation not supported for language: {source_lang}."
108
  )
109
 
110
+ # Protect terms before translation
111
+ modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
112
+ logger.info(f"Text after protecting terms: '{modified_text[:50]}...'")
113
+
114
+ # Perform translation
115
  logger.info(f"Translating text from '{source_lang}' to English...")
116
+ result = translator(modified_text)
117
  translated_text = result[0]["translation_text"]
118
+ logger.info(f"Translation successful. Original: '{modified_text[:50]}...', Translated: '{translated_text[:50]}...'")
119
+
120
+ # Restore protected terms
121
+ final_text = restore_terms(translated_text, replacements)
122
+ logger.info(f"Final translated text with restored terms: '{final_text[:50]}...'")
123
 
124
+ return {"translated_text": final_text}
125
  except HTTPException as e:
 
126
  raise e
127
  except Exception as e:
 
128
  logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
129
  raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")