amine_dubs committed on
Commit
a95a188
·
1 Parent(s): be03516

Switch to LibreTranslate API for translation due to model loading permission issues

Browse files
Files changed (2) hide show
  1. backend/main.py +78 -210
  2. backend/requirements.txt +2 -1
backend/main.py CHANGED
@@ -4,182 +4,57 @@ from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
  from typing import List, Optional
6
  import shutil
7
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer
8
- import torch
9
- import traceback
10
- import time # For retries
11
  import os
12
- import requests # For direct API access as final fallback
 
 
 
13
 
14
  # --- Configuration ---
15
  # Determine the base directory of the main.py script
16
- # This helps in locating templates and static files correctly, especially in Docker
17
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
18
  # Adjust paths to go one level up from backend to find templates/static
19
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
20
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
21
- UPLOAD_DIR = "/app/uploads" # Ensure this matches Dockerfile WORKDIR + uploads
 
 
 
 
 
 
 
22
 
23
  app = FastAPI()
24
 
25
  # --- Mount Static Files and Templates ---
26
- # Ensure the static directory exists (FastAPI doesn't create it)
27
- # We'll create it manually or via Docker later
28
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
29
-
30
- # Ensure the templates directory exists (FastAPI doesn't create it)
31
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
32
 
33
- # --- Model Loading Strategy ---
34
- # Define model options in order of preference
35
- MODEL_OPTIONS = [
36
- {"name": "google/flan-t5-small", "type": "flan-t5"},
37
- {"name": "Helsinki-NLP/opus-mt-en-ar", "type": "marian"},
38
- {"name": "t5-small", "type": "t5-fallback"} # Smaller, more commonly available model
39
- ]
40
-
41
- CACHE_DIR = "/app/.cache"
42
- model = None
43
- tokenizer = None
44
-
45
- # Set environment variables for cache locations
46
- os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
47
- os.environ["HF_HOME"] = CACHE_DIR
48
- print(f"Cache directories set to: {CACHE_DIR}")
49
- print(f"Environment TRANSFORMERS_CACHE: {os.environ.get('TRANSFORMERS_CACHE')}")
50
- print(f"Environment HF_HOME: {os.environ.get('HF_HOME')}")
51
-
52
- # Create cache directory with explicit permissions
53
- try:
54
- os.makedirs(CACHE_DIR, exist_ok=True)
55
- # Ensure the cache directory is writeable - set permissive permissions
56
- os.chmod(CACHE_DIR, 0o777) # Read/write/execute for all
57
- print(f"Cache directory {CACHE_DIR} created with full permissions")
58
- except Exception as e:
59
- print(f"Warning: Could not set permissions on cache dir: {e}")
60
-
61
- # Try each model in order until one loads successfully
62
- for model_option in MODEL_OPTIONS:
63
- MODEL_NAME = model_option["name"]
64
- MODEL_TYPE = model_option["type"]
65
-
66
- print(f"--- Attempting to load model: {MODEL_NAME} (Type: {MODEL_TYPE}) ---")
67
-
68
- # Try to load with retries
69
- max_retries = 3
70
- for attempt in range(max_retries):
71
- try:
72
- if MODEL_TYPE == "flan-t5" or MODEL_TYPE == "t5-fallback":
73
- print(f"Loading with AutoTokenizer/AutoModelForSeq2SeqLM (Attempt {attempt+1}/{max_retries})")
74
- tokenizer = AutoTokenizer.from_pretrained(
75
- MODEL_NAME,
76
- cache_dir=CACHE_DIR,
77
- local_files_only=False, # Force online download
78
- resume_download=True # Resume if download was interrupted
79
- )
80
-
81
- model = AutoModelForSeq2SeqLM.from_pretrained(
82
- MODEL_NAME,
83
- cache_dir=CACHE_DIR,
84
- local_files_only=False, # Force online download
85
- resume_download=True # Resume if download was interrupted
86
- )
87
- elif MODEL_TYPE == "marian":
88
- print(f"Loading with MarianTokenizer/MarianMTModel (Attempt {attempt+1}/{max_retries})")
89
- tokenizer = MarianTokenizer.from_pretrained(
90
- MODEL_NAME,
91
- cache_dir=CACHE_DIR,
92
- local_files_only=False,
93
- resume_download=True
94
- )
95
-
96
- model = MarianMTModel.from_pretrained(
97
- MODEL_NAME,
98
- cache_dir=CACHE_DIR,
99
- local_files_only=False,
100
- resume_download=True
101
- )
102
-
103
- print(f"--- Successfully loaded model: {MODEL_NAME} ---")
104
- break # Break out of retry loop if successful
105
-
106
- except Exception as e:
107
- print(f"Error loading model {MODEL_NAME} (Attempt {attempt+1}): {e}")
108
- traceback.print_exc()
109
-
110
- if attempt < max_retries - 1:
111
- wait_time = 2 * (attempt + 1) # Exponential backoff
112
- print(f"Waiting {wait_time} seconds before retry...")
113
- time.sleep(wait_time)
114
- else:
115
- print(f"Failed to load model {MODEL_NAME} after {max_retries} attempts.")
116
-
117
- if model is not None and tokenizer is not None:
118
- # If we successfully loaded a model, break out of the model options loop
119
- break
120
-
121
- # --- Fallback Translation Logic ---
122
- # If we couldn't load any model, we'll set up a simple fallback system
123
-
124
- # Define a simple dictionary for common phrases (just as a last resort)
125
  FALLBACK_PHRASES = {
126
  "hello": "مرحبا",
127
  "thank you": "شكرا لك",
128
  "goodbye": "مع السلامة",
129
  "welcome": "أهلا وسهلا",
 
 
 
 
130
  }
131
 
132
- def fallback_translate(text, source_lang):
133
- """Last resort fallback translation if all models fail to load."""
134
- print("Using emergency fallback translation (very limited capability)")
135
-
136
- # For longer text, try direct API call to a free translation service
137
- try:
138
- # Try to use LibreTranslate API as fallback (no API key needed for some instances)
139
- url = "https://translate.terraprint.co/translate"
140
-
141
- payload = {
142
- "q": text,
143
- "source": source_lang if source_lang != "auto" else "auto",
144
- "target": "ar",
145
- "format": "text"
146
- }
147
-
148
- headers = {"Content-Type": "application/json"}
149
-
150
- print("Attempting LibreTranslate API call...")
151
- response = requests.post(url, json=payload, headers=headers)
152
-
153
- if response.status_code == 200:
154
- result = response.json()
155
- print("LibreTranslate API call successful")
156
- return result.get("translatedText", f"[Translation Error: {response.text}]")
157
-
158
- except Exception as e:
159
- print(f"LibreTranslate API call failed: {e}")
160
-
161
- # If that fails too, use our minimal dictionary
162
- if text.lower() in FALLBACK_PHRASES:
163
- return FALLBACK_PHRASES[text.lower()]
164
-
165
- # For unknown text, return a message in Arabic explaining the issue
166
- return "عذراً، لم نتمكن من تحميل نموذج الترجمة. هذه ترجمة محدودة جداً." # "Sorry, we couldn't load the translation model. This is a very limited translation."
167
-
168
- # --- Helper Functions ---
169
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
170
- """Internal function to handle text translation using the loaded model or fallbacks."""
171
-
172
- # Check if we successfully loaded a model
173
- if model is None or tokenizer is None:
174
- # No model available, use fallback
175
- return fallback_translate(text, source_lang)
176
 
177
- # --- Enhanced Prompt Engineering ---
178
- # Map source language codes to full language names for better model understanding
179
  language_map = {
180
  "en": "English",
181
  "fr": "French",
182
- "es": "Spanish",
183
  "de": "German",
184
  "zh": "Chinese",
185
  "ru": "Russian",
@@ -189,74 +64,68 @@ def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar"
189
  "tr": "Turkish",
190
  "ko": "Korean",
191
  "it": "Italian"
192
- # Add more languages as needed
193
  }
194
 
195
- # Get the full language name, or use the code if not in our map
196
- source_lang_name = language_map.get(source_lang, source_lang)
 
197
 
198
- # Craft a more detailed prompt that emphasizes meaning over literal translation
199
- # and focuses on eloquence and cultural sensitivity
200
- prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
201
- Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
202
- Adapt any cultural references or idioms appropriately rather than translating literally.
203
- Ensure the translation reads naturally to a native Arabic speaker.
204
-
205
- Text to translate:
206
- {text}"""
207
-
208
- print(f"Translation Request - Source Lang: {source_lang} ({source_lang_name}), Target Lang: {target_lang}")
209
- print(f"Using Enhanced Prompt for Balagha and Cultural Sensitivity")
210
-
211
- # --- Model-specific translation logic ---
212
- try:
213
- if MODEL_TYPE in ["flan-t5", "t5-fallback"]:
214
- # Use prompt-based approach for T5 models
215
- # Tokenize the prompt
216
- inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
217
-
218
- # Generate the translation with parameters tuned for quality
219
- outputs = model.generate(
220
- **inputs,
221
- max_length=512, # Adjust based on expected output length
222
- num_beams=5, # Increased for better quality
223
- length_penalty=1.0, # Encourage slightly longer outputs for natural flow
224
- top_k=50, # More diverse word choices
225
- top_p=0.95, # Sample from higher probability tokens for fluency
226
- early_stopping=True
227
- )
228
-
229
- # Decode the generated tokens
230
- translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
231
 
232
- elif MODEL_TYPE == "marian":
233
- # Direct translation for Marian model (specialized for translation)
234
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
 
 
 
 
235
 
236
- outputs = model.generate(
237
- **inputs,
238
- max_length=512,
239
- num_beams=5,
240
- length_penalty=1.0,
241
- early_stopping=True
242
- )
243
 
244
- translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
245
- else:
246
- # Unknown model type, use fallback
247
- return fallback_translate(text, source_lang)
248
-
249
- print(f"Raw Translation Output: {translated_text}")
250
- return translated_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- except Exception as e:
253
- print(f"Error during model generation: {e}")
254
- traceback.print_exc()
255
-
256
- # If translation fails, use fallback
257
- return fallback_translate(text, source_lang)
258
 
259
- # --- Function to extract text ---
260
  async def extract_text_from_file(file: UploadFile) -> str:
261
  """Extracts text content from various file types."""
262
  # Ensure upload directory exists (though Dockerfile should create it)
@@ -382,7 +251,6 @@ async def translate_text_endpoint(
382
  print(f"Unexpected error in /translate/text: {e}")
383
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred during text translation: {e}")
384
 
385
-
386
  @app.post("/translate/document")
387
  async def translate_document_endpoint(
388
  file: UploadFile = File(...),
 
4
  from fastapi.templating import Jinja2Templates
5
  from typing import List, Optional
6
  import shutil
 
 
 
 
7
  import os
8
+ import requests
9
+ import json
10
+ import traceback
11
+ import time
12
 
13
  # --- Configuration ---
14
  # Determine the base directory of the main.py script
 
15
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
  # Adjust paths to go one level up from backend to find templates/static
17
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
18
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
19
+ UPLOAD_DIR = "/app/uploads" # Ensure this matches Dockerfile WORKDIR + uploads
20
+
21
+ # LibreTranslate API URLs - trying multiple endpoints in case one is down
22
+ TRANSLATION_APIS = [
23
+ "https://translate.terraprint.co/translate", # Primary endpoint
24
+ "https://libretranslate.de/translate", # Backup endpoint 1
25
+ "https://translate.argosopentech.com/translate" # Backup endpoint 2
26
+ ]
27
 
28
  app = FastAPI()
29
 
30
  # --- Mount Static Files and Templates ---
 
 
31
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
 
 
32
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
33
 
34
+ # --- Fallback dictionary for common phrases ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  FALLBACK_PHRASES = {
36
  "hello": "مرحبا",
37
  "thank you": "شكرا لك",
38
  "goodbye": "مع السلامة",
39
  "welcome": "أهلا وسهلا",
40
+ "yes": "نعم",
41
+ "no": "لا",
42
+ "please": "من فضلك",
43
+ "sorry": "آسف",
44
  }
45
 
46
+ # --- Translation Function ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
48
+ """
49
+ Translate text using LibreTranslate API with fallbacks and cultural adaptation.
50
+ """
51
+ print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
 
 
52
 
53
+ # Map source language codes to full language names
 
54
  language_map = {
55
  "en": "English",
56
  "fr": "French",
57
+ "es": "Spanish",
58
  "de": "German",
59
  "zh": "Chinese",
60
  "ru": "Russian",
 
64
  "tr": "Turkish",
65
  "ko": "Korean",
66
  "it": "Italian"
 
67
  }
68
 
69
+ # For very short text, check our dictionary first
70
+ if len(text.strip()) < 30 and text.lower().strip() in FALLBACK_PHRASES:
71
+ return FALLBACK_PHRASES[text.lower().strip()]
72
 
73
+ # Try each API endpoint until one works
74
+ for api_url in TRANSLATION_APIS:
75
+ try:
76
+ print(f"Attempting translation using API: {api_url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Basic payload for standard translation
79
+ payload = {
80
+ "q": text,
81
+ "source": source_lang if source_lang != "auto" else "auto",
82
+ "target": target_lang,
83
+ "format": "text"
84
+ }
85
 
86
+ headers = {"Content-Type": "application/json"}
 
 
 
 
 
 
87
 
88
+ # Make the API call
89
+ response = requests.post(api_url, json=payload, headers=headers, timeout=10)
90
+
91
+ if response.status_code == 200:
92
+ result = response.json()
93
+ translated_text = result.get("translatedText")
94
+
95
+ if translated_text:
96
+ print(f"Translation successful using {api_url}")
97
+
98
+ # For Arabic translations, apply post-processing
99
+ if target_lang == "ar":
100
+ translated_text = culturally_adapt_arabic(translated_text)
101
+
102
+ return translated_text
103
+ else:
104
+ print(f"Translation API returned empty result: {response.text}")
105
+ continue # Try next API
106
+ else:
107
+ print(f"Translation API returned error: {response.status_code}")
108
+ continue # Try next API
109
+
110
+ except Exception as e:
111
+ print(f"Error with translation API {api_url}: {e}")
112
+ continue # Try next API
113
+
114
+ # If all APIs failed, use a polite message
115
+ fallback_text = FALLBACK_PHRASES.get(text.lower().strip()) if len(text.strip()) < 30 else None
116
+
117
+ if fallback_text:
118
+ return fallback_text
119
+ else:
120
+ return "عذراً، لم نتمكن من ترجمة النص. خدمة الترجمة غير متاحة حالياً."
121
 
122
+ def culturally_adapt_arabic(text: str) -> str:
123
+ """Apply post-processing rules to enhance Arabic translation with cultural sensitivity."""
124
+ # Replace Latin question marks, semicolons, and commas with their Arabic equivalents
125
+ text = text.replace('?', '؟').replace(';', '؛').replace(',', '،')
126
+ return text
 
127
 
128
+ # --- Helper Functions ---
129
  async def extract_text_from_file(file: UploadFile) -> str:
130
  """Extracts text content from various file types."""
131
  # Ensure upload directory exists (though Dockerfile should create it)
 
251
  print(f"Unexpected error in /translate/text: {e}")
252
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred during text translation: {e}")
253
 
 
254
  @app.post("/translate/document")
255
  async def translate_document_endpoint(
256
  file: UploadFile = File(...),
backend/requirements.txt CHANGED
@@ -4,4 +4,5 @@ python-docx
4
  PyMuPDF
5
  transformers[torch]
6
  sentencepiece
7
- python-multipart # Added for FastAPI form data handling
 
 
4
  PyMuPDF
5
  transformers[torch]
6
  sentencepiece
7
+ python-multipart
8
+ requests # Added for LibreTranslate API calls (now the primary translation path)