feliksius committed on
Commit
db6cecb
·
verified ·
1 Parent(s): 44bf8b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -210
app.py CHANGED
@@ -5,13 +5,12 @@ from transformers import pipeline
5
  import langdetect
6
  import logging
7
  import os
8
- from typing import Optional
9
  import re
10
  from functools import lru_cache
11
  import asyncio
12
- import threading
13
- import time
14
 
 
15
  # Create necessary directories
16
  os.makedirs("./cache", exist_ok=True)
17
  os.makedirs("./logs", exist_ok=True)
@@ -21,7 +20,7 @@ os.environ["HF_HOME"] = "./cache"
21
  os.environ["TRANSFORMERS_CACHE"] = "./cache"
22
 
23
  # Environment configuration
24
- DEVICE = -1 # Always use CPU for HF Spaces
25
  MAX_TEXT_LENGTH = int(os.getenv("MAX_TEXT_LENGTH", "5000"))
26
 
27
  # Configure logging
@@ -31,21 +30,21 @@ logging.basicConfig(
31
  )
32
  logger = logging.getLogger(__name__)
33
 
34
- # Map of supported language models
35
  MODEL_MAP = {
36
  "th": "Helsinki-NLP/opus-mt-th-en",
37
- "ja": "Helsinki-NLP/opus-mt-ja-en",
38
  "zh": "Helsinki-NLP/opus-mt-zh-en",
39
  "vi": "Helsinki-NLP/opus-mt-vi-en",
40
  }
41
 
42
- # List of terms to protect from translation
43
  PROTECTED_TERMS = ["2030 Aspirations", "Griffith"]
44
 
45
- # Cache for translators
46
- translators = {}
47
 
48
- # Pydantic models
49
  class TranslationRequest(BaseModel):
50
  text: str
51
  source_lang_override: Optional[str] = None
@@ -54,278 +53,197 @@ class TranslationResponse(BaseModel):
54
  translated_text: str
55
  source_language: Optional[str] = None
56
 
57
- # FastAPI app
58
  app = FastAPI(title="Translation Service API")
59
 
60
- def get_translator(lang: str):
61
- """Load or retrieve cached translator for the given language."""
62
- if lang not in translators:
63
- logger.info(f"Loading model for {lang}...")
 
 
64
  try:
65
- translators[lang] = pipeline(
66
- "translation",
67
- model=MODEL_MAP[lang],
68
- device=-1
69
- )
70
- logger.info(f"Model for {lang} loaded successfully.")
71
  except Exception as e:
72
- logger.error(f"Failed to load model for {lang}: {str(e)}")
73
- raise
74
- return translators[lang]
75
-
76
- @lru_cache(maxsize=100)
 
 
 
 
 
 
 
 
77
  def detect_language(text: str) -> str:
78
- """Cached language detection."""
79
  try:
80
- detected_lang = langdetect.detect(text)
 
 
81
  if detected_lang.startswith('zh'):
82
  return 'zh'
83
  return detected_lang if detected_lang in MODEL_MAP else "en"
84
  except Exception as e:
85
- logger.warning(f"Language detection failed: {str(e)}")
86
  return "en"
87
 
88
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
89
- """Replace protected terms with placeholders."""
90
- modified_text = text
91
  replacements = {}
92
  for i, term in enumerate(protected_terms):
93
  placeholder = f"__PROTECTED_{i}__"
94
- replacements[placeholder] = term
95
- modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, modified_text, flags=re.IGNORECASE)
96
- return modified_text, replacements
 
 
 
 
97
 
98
  def restore_terms(text: str, replacements: dict) -> str:
99
- """Restore protected terms in the translated text."""
100
- restored_text = text
101
  for placeholder, term in replacements.items():
102
- restored_text = restored_text.replace(placeholder, term)
103
- return restored_text
104
-
105
- # FastAPI endpoints
106
- @app.get("/")
107
- async def root():
108
- return {"message": "Translation Service API is running"}
109
-
110
- @app.get("/health")
111
- async def health_check():
112
- return {"status": "healthy", "supported_languages": list(MODEL_MAP.keys())}
113
 
114
- @app.post("/translate", response_model=TranslationResponse)
115
- async def translate_api(request: TranslationRequest):
116
- """API endpoint for translation."""
117
- return await translate(request.text, request.source_lang_override)
118
-
119
- # Core translation function
120
- async def translate(text: str, source_lang_override: Optional[str] = None):
121
- """Core translation function used by both API and Gradio."""
122
  if not text or not text.strip():
123
- raise HTTPException(status_code=400, detail="Text input is required.")
124
 
125
  if len(text) > MAX_TEXT_LENGTH:
126
  raise HTTPException(
127
  status_code=413,
128
- detail=f"Text too long. Max allowed length: {MAX_TEXT_LENGTH}."
129
  )
130
 
131
  try:
132
- # Determine source language
133
  if source_lang_override and source_lang_override in MODEL_MAP:
134
  source_lang = source_lang_override
135
  else:
136
  source_lang = detect_language(text)
137
 
138
- # If source language is English, return original text
139
  if source_lang == "en":
140
- return TranslationResponse(
141
- translated_text=text,
142
- source_language=source_lang
143
- )
144
 
145
- # Get translator
146
  translator = get_translator(source_lang)
147
 
148
- # Protect terms before translation
149
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
150
 
151
- # Perform translation
152
- result = translator(modified_text, max_length=512, num_beams=4)
 
 
 
 
153
  translated_text = result[0]["translation_text"]
154
 
155
- # Restore protected terms
156
  final_text = restore_terms(translated_text, replacements)
157
 
158
- return TranslationResponse(
159
- translated_text=final_text,
160
- source_language=source_lang
161
- )
162
-
163
  except Exception as e:
164
- logger.error(f"Translation error: {str(e)}")
165
- raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
166
-
167
- # Gradio interface functions
168
- def translate_gradio(text: str, source_lang: str = "auto"):
169
- """Gradio wrapper for translation function."""
170
- if not text.strip():
171
- return "Please enter some text to translate.", "N/A"
172
-
 
 
 
 
 
 
 
 
 
 
173
  try:
174
  source_lang_param = source_lang if source_lang != "auto" else None
175
-
176
- # Call the async function synchronously for Gradio
177
- import asyncio
178
- loop = asyncio.new_event_loop()
179
- asyncio.set_event_loop(loop)
180
-
181
- result = loop.run_until_complete(translate(text, source_lang_param))
182
-
183
  return result.translated_text, result.source_language or "Unknown"
184
-
185
  except HTTPException as e:
186
  return f"Error: {e.detail}", "Error"
187
  except Exception as e:
188
  return f"Error: {str(e)}", "Error"
189
 
190
- # Create Gradio interface
 
191
  def create_gradio_interface():
192
  with gr.Blocks(
193
  title="Multi-Language Translation Service",
194
  theme=gr.themes.Soft(),
195
- css="""
196
- .gradio-container {
197
- max-width: 1200px !important;
198
- }
199
- """
200
  ) as interface:
201
-
202
  gr.Markdown("""
203
  # 🌐 Multi-Language Translation Service
204
-
205
- Translate text from **Thai**, **Japanese**, **Chinese**, or **Vietnamese** to **English**
206
-
207
- ✨ Features: Automatic language detection • Protected terms preservation • Fast Helsinki-NLP models
208
  """)
209
 
210
  with gr.Row():
211
  with gr.Column(scale=1):
212
- text_input = gr.Textbox(
213
- label="📝 Input Text",
214
- placeholder="Enter text to translate...",
215
- lines=6,
216
- max_lines=10
217
- )
218
-
219
  with gr.Row():
220
  lang_dropdown = gr.Dropdown(
221
  choices=[
222
- ("🔍 Auto-detect", "auto"),
223
- ("🇹🇭 Thai", "th"),
224
- ("🇯🇵 Japanese", "ja"),
225
- ("🇨🇳 Chinese", "zh"),
226
- ("🇻🇳 Vietnamese", "vi")
227
  ],
228
- value="auto",
229
- label="Source Language"
230
  )
231
-
232
- translate_btn = gr.Button(
233
- "🚀 Translate",
234
- variant="primary",
235
- size="lg"
236
- )
237
-
238
  with gr.Column(scale=1):
239
- output_text = gr.Textbox(
240
- label="🎯 Translation Result",
241
- lines=6,
242
- max_lines=10,
243
- interactive=False
244
- )
245
-
246
- detected_lang = gr.Textbox(
247
- label="🔍 Detected Language",
248
- interactive=False,
249
- max_lines=1
250
- )
251
-
252
- # Examples section
253
- with gr.Row():
254
- gr.Examples(
255
- examples=[
256
- ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
257
- ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
258
- ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
259
- ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
260
- ],
261
- inputs=[text_input, lang_dropdown],
262
- outputs=[output_text, detected_lang],
263
- fn=translate_gradio,
264
- cache_examples=False,
265
- label="📋 Try these examples:"
266
- )
267
-
268
- # Event handlers
269
- translate_btn.click(
270
- fn=translate_gradio,
271
  inputs=[text_input, lang_dropdown],
272
- outputs=[output_text, detected_lang]
 
 
273
  )
274
 
275
- text_input.submit(
276
- fn=translate_gradio,
277
- inputs=[text_input, lang_dropdown],
278
- outputs=[output_text, detected_lang]
279
- )
280
 
281
- # Information accordion
282
- with gr.Accordion("ℹ️ About this service", open=False):
283
- gr.Markdown("""
284
- ### 🎯 Supported Languages:
285
- - **Thai (th)** → English
286
- - **Japanese (ja)** → English
287
- - **Chinese (zh)** → English
288
- - **Vietnamese (vi)** → English
289
-
290
- ### 🛡️ Special Features:
291
- - **Protected Terms**: Certain terms like "2030 Aspirations" and "Griffith" are preserved during translation
292
- - **Auto Detection**: Automatically detects the source language if not specified
293
- - **Fast Processing**: Uses optimized Helsinki-NLP translation models
294
-
295
- ### 🚀 How to use:
296
- 1. Paste or type your text in the input box
297
- 2. Choose source language or leave as 'Auto-detect'
298
- 3. Click 'Translate' or press Enter
299
- 4. Get your English translation instantly!
300
-
301
- ### 🔧 API Access:
302
- This service also provides REST API endpoints:
303
- - `GET /health` - Check service status
304
- - `POST /translate` - Translate text (JSON payload required)
305
- """)
306
-
307
  return interface
308
 
309
- # Start FastAPI in a separate thread
310
- def start_fastapi():
311
- import uvicorn
312
- uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
313
-
314
- # Main execution
315
- if __name__ == "__main__":
316
- # Start FastAPI server in background thread
317
- fastapi_thread = threading.Thread(target=start_fastapi, daemon=True)
318
- fastapi_thread.start()
319
-
320
- # Give FastAPI time to start
321
- time.sleep(2)
322
-
323
- # Create and launch Gradio interface
324
- demo = create_gradio_interface()
325
- demo.queue(max_size=10)
326
- demo.launch(
327
- server_name="0.0.0.0",
328
- server_port=7861,
329
- share=False,
330
- show_error=True
331
- )
 
5
  import langdetect
6
  import logging
7
  import os
8
+ from typing import Optional, Dict
9
  import re
10
  from functools import lru_cache
11
  import asyncio
 
 
12
 
13
+ # --- 1. Konfigurasi Awal (Tetap Sama) ---
14
  # Create necessary directories
15
  os.makedirs("./cache", exist_ok=True)
16
  os.makedirs("./logs", exist_ok=True)
 
20
  os.environ["TRANSFORMERS_CACHE"] = "./cache"
21
 
22
  # Environment configuration
23
+ DEVICE = -1 # Selalu CPU untuk efisiensi di banyak environment
24
  MAX_TEXT_LENGTH = int(os.getenv("MAX_TEXT_LENGTH", "5000"))
25
 
26
  # Configure logging
 
30
  )
31
  logger = logging.getLogger(__name__)
32
 
33
+ # Map model yang didukung
34
  MODEL_MAP = {
35
  "th": "Helsinki-NLP/opus-mt-th-en",
36
+ "ja": "Helsinki-NLP/opus-mt-ja-en",
37
  "zh": "Helsinki-NLP/opus-mt-zh-en",
38
  "vi": "Helsinki-NLP/opus-mt-vi-en",
39
  }
40
 
41
+ # Istilah yang dilindungi dari translasi
42
  PROTECTED_TERMS = ["2030 Aspirations", "Griffith"]
43
 
44
+ # Cache untuk translator (pipeline)
45
+ translators: Dict[str, pipeline] = {}
46
 
47
+ # --- Pydantic Models (Tetap Sama) ---
48
  class TranslationRequest(BaseModel):
49
  text: str
50
  source_lang_override: Optional[str] = None
 
53
  translated_text: str
54
  source_language: Optional[str] = None
55
 
56
+ # --- 2. Inisialisasi Aplikasi FastAPI ---
57
  app = FastAPI(title="Translation Service API")
58
 
59
+ # --- 3. OPTIMASI: Prapemuatan Model saat Startup ---
60
+ @app.on_event("startup")
61
+ async def startup_event():
62
+ """Memuat semua model translasi saat aplikasi dimulai."""
63
+ logger.info("Memulai prapemuatan model translasi...")
64
+ for lang, model_name in MODEL_MAP.items():
65
  try:
66
+ logger.info(f"Memuat model untuk bahasa: {lang} ({model_name})")
67
+ translators[lang] = pipeline("translation", model=model_name, device=DEVICE)
68
+ logger.info(f"Model untuk {lang} berhasil dimuat.")
 
 
 
69
  except Exception as e:
70
+ logger.error(f"Gagal memuat model untuk {lang}: {str(e)}")
71
+ logger.info("Semua model telah dimuat.")
72
+
73
+ def get_translator(lang: str) -> pipeline:
74
+ """Mengambil translator yang sudah dimuat dari cache."""
75
+ translator = translators.get(lang)
76
+ if not translator:
77
+ logger.error(f"Translator untuk bahasa '{lang}' tidak ditemukan. Mungkin gagal dimuat saat startup.")
78
+ raise HTTPException(status_code=500, detail=f"Model terjemahan untuk '{lang}' tidak tersedia.")
79
+ return translator
80
+
81
+ # --- Fungsi Utility (Hampir Sama, Sedikit Perbaikan) ---
82
+ @lru_cache(maxsize=128) # Cache lebih besar jika perlu
83
  def detect_language(text: str) -> str:
84
+ """Deteksi bahasa dengan cache."""
85
  try:
86
+ # Potong teks untuk deteksi yang lebih cepat jika teks sangat panjang
87
+ preview_text = text[:500]
88
+ detected_lang = langdetect.detect(preview_text)
89
  if detected_lang.startswith('zh'):
90
  return 'zh'
91
  return detected_lang if detected_lang in MODEL_MAP else "en"
92
  except Exception as e:
93
+ logger.warning(f"Deteksi bahasa gagal: {str(e)}. Mengasumsikan 'en'.")
94
  return "en"
95
 
96
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
97
+ """Mengganti istilah yang dilindungi dengan placeholder."""
 
98
  replacements = {}
99
  for i, term in enumerate(protected_terms):
100
  placeholder = f"__PROTECTED_{i}__"
101
+ # Gunakan word boundary (\b) untuk memastikan hanya kata utuh yang diganti
102
+ modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, text, flags=re.IGNORECASE)
103
+ # Hanya tambahkan ke replacement jika ada perubahan
104
+ if modified_text != text:
105
+ replacements[placeholder] = term
106
+ text = modified_text
107
+ return text, replacements
108
 
109
  def restore_terms(text: str, replacements: dict) -> str:
110
+ """Mengembalikan istilah yang dilindungi."""
 
111
  for placeholder, term in replacements.items():
112
+ text = text.replace(placeholder, term)
113
+ return text
 
 
 
 
 
 
 
 
 
114
 
115
+ # --- 4. OPTIMASI: Fungsi Inti dan Endpoint API menjadi Full Asynchronous ---
116
+ async def perform_translation(text: str, source_lang_override: Optional[str] = None) -> TranslationResponse:
117
+ """Fungsi inti translasi yang sepenuhnya async."""
 
 
 
 
 
118
  if not text or not text.strip():
119
+ raise HTTPException(status_code=400, detail="Teks input tidak boleh kosong.")
120
 
121
  if len(text) > MAX_TEXT_LENGTH:
122
  raise HTTPException(
123
  status_code=413,
124
+ detail=f"Teks terlalu panjang. Panjang maksimal yang diizinkan: {MAX_TEXT_LENGTH}."
125
  )
126
 
127
  try:
128
+ # Tentukan bahasa sumber
129
  if source_lang_override and source_lang_override in MODEL_MAP:
130
  source_lang = source_lang_override
131
  else:
132
  source_lang = detect_language(text)
133
 
134
+ # Jika bahasa sumber adalah Inggris, kembalikan teks asli
135
  if source_lang == "en":
136
+ return TranslationResponse(translated_text=text, source_language=source_lang)
 
 
 
137
 
138
+ # Ambil translator
139
  translator = get_translator(source_lang)
140
 
141
+ # Lindungi istilah sebelum translasi
142
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
143
 
144
+ # --- OPTIMASI KUNCI: Jalankan model di thread terpisah ---
145
+ # Ini mencegah pipeline yang berat memblokir event loop utama
146
+ def _translate_task():
147
+ return translator(modified_text, max_length=512, num_beams=4)
148
+
149
+ result = await asyncio.to_thread(_translate_task)
150
  translated_text = result[0]["translation_text"]
151
 
152
+ # Kembalikan istilah yang dilindungi
153
  final_text = restore_terms(translated_text, replacements)
154
 
155
+ return TranslationResponse(translated_text=final_text, source_language=source_lang)
156
+
157
+ except HTTPException as e:
158
+ raise e # Re-raise HTTPException agar status code-nya benar
 
159
  except Exception as e:
160
+ logger.error(f"Terjadi kesalahan saat translasi: {str(e)}")
161
+ raise HTTPException(status_code=500, detail=f"Proses translasi gagal: {str(e)}")
162
+
163
+ @app.post("/translate", response_model=TranslationResponse)
164
+ async def translate_api(request: TranslationRequest):
165
+ """Endpoint API untuk translasi."""
166
+ return await perform_translation(request.text, request.source_lang_override)
167
+
168
+ @app.get("/health")
169
+ async def health_check():
170
+ return {"status": "healthy", "loaded_models": list(translators.keys())}
171
+
172
+
173
+ # --- 5. OPTIMASI: Handler Gradio menjadi Asynchronous ---
174
+ async def translate_gradio(text: str, source_lang: str = "auto"):
175
+ """Wrapper Gradio yang sekarang async dan lebih efisien."""
176
+ if not text or not text.strip():
177
+ return "Masukkan teks untuk diterjemahkan.", "N/A"
178
+
179
  try:
180
  source_lang_param = source_lang if source_lang != "auto" else None
181
+ result = await perform_translation(text, source_lang_param)
 
 
 
 
 
 
 
182
  return result.translated_text, result.source_language or "Unknown"
183
+
184
  except HTTPException as e:
185
  return f"Error: {e.detail}", "Error"
186
  except Exception as e:
187
  return f"Error: {str(e)}", "Error"
188
 
189
+ # --- 6. OPTIMASI: Mount Gradio ke FastAPI ---
190
+ # Fungsi untuk membuat UI Gradio tetap sama
191
  def create_gradio_interface():
192
  with gr.Blocks(
193
  title="Multi-Language Translation Service",
194
  theme=gr.themes.Soft(),
195
+ css=".gradio-container { max-width: 1200px !important; }"
 
 
 
 
196
  ) as interface:
 
197
  gr.Markdown("""
198
  # 🌐 Multi-Language Translation Service
199
+ Terjemahkan teks dari **Thai**, **Jepang**, **Mandarin**, atau **Vietnam** ke **Inggris**.
200
+ Fitur: Deteksi bahasa otomatis Perlindungan istilah Model Helsinki-NLP yang cepat.
 
 
201
  """)
202
 
203
  with gr.Row():
204
  with gr.Column(scale=1):
205
+ text_input = gr.Textbox(label="📝 Input Text", placeholder="Enter text to translate...", lines=6, max_lines=10)
 
 
 
 
 
 
206
  with gr.Row():
207
  lang_dropdown = gr.Dropdown(
208
  choices=[
209
+ ("🔍 Auto-detect", "auto"), ("🇹🇭 Thai", "th"), ("🇯🇵 Japanese", "ja"),
210
+ ("🇨🇳 Chinese", "zh"), ("🇻🇳 Vietnamese", "vi")
 
 
 
211
  ],
212
+ value="auto", label="Source Language"
 
213
  )
214
+ translate_btn = gr.Button("🚀 Translate", variant="primary", size="lg")
215
+
 
 
 
 
 
216
  with gr.Column(scale=1):
217
+ output_text = gr.Textbox(label="🎯 Translation Result", lines=6, max_lines=10, interactive=False)
218
+ detected_lang = gr.Textbox(label="🔍 Detected Language", interactive=False, max_lines=1)
219
+
220
+ gr.Examples(
221
+ examples=[
222
+ ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
223
+ ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
224
+ ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
225
+ ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
226
+ ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  inputs=[text_input, lang_dropdown],
228
+ outputs=[output_text, detected_lang],
229
+ fn=translate_gradio, # Sekarang memanggil fungsi async secara langsung
230
+ cache_examples=False
231
  )
232
 
233
+ # Event handlers sekarang bisa langsung memanggil fungsi async
234
+ translate_btn.click(fn=translate_gradio, inputs=[text_input, lang_dropdown], outputs=[output_text, detected_lang])
235
+ text_input.submit(fn=translate_gradio, inputs=[text_input, lang_dropdown], outputs=[output_text, detected_lang])
 
 
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  return interface
238
 
239
+ # Buat UI Gradio
240
+ gradio_app = create_gradio_interface()
241
+
242
+ # Mount Gradio app ke FastAPI di path "/"
243
+ # Ini adalah cara yang benar untuk mengintegrasikan keduanya
244
+ app = gr.mount_gradio_app(app, gradio_app, path="/")
245
+
246
+
247
+ # Untuk menjalankan:
248
+ # Simpan file ini sebagai app.py dan jalankan dengan uvicorn
249
+ # > uvicorn app:app --reload --port 7860