amine_dubs committed on
Commit
7dfe957
·
1 Parent(s): 986397d

Use public HF models with custom prompt for eloquent Arabic translations

Browse files
Files changed (1) hide show
  1. backend/main.py +85 -114
backend/main.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
2
  from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
- from typing import List, Optional, Dict, Any
6
  import os
7
  import requests
8
  import json
@@ -16,13 +16,8 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
17
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
18
 
19
- # Hugging Face API configurations
20
- HF_API_URL = "https://api-inference.huggingface.co/models/t5-base"
21
- HF_HEADERS = {"Authorization": "Bearer hf_api_key_placeholder"} # Replace with your API key or remove if using a free model
22
-
23
  app = FastAPI()
24
-
25
- # --- Mount Static Files and Templates ---
26
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
27
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
28
 
@@ -42,36 +37,27 @@ LANGUAGE_MAP = {
42
  "it": "Italian"
43
  }
44
 
45
- # --- Fallback dictionary for common phrases ---
46
- FALLBACK_PHRASES = {
47
- "hello": "مرحبا",
48
- "thank you": "شكرا لك",
49
- "goodbye": "مع السلامة",
50
- "welcome": "أهلا وسهلا",
51
- "yes": "نعم",
52
- "no": "لا",
53
- "please": "من فضلك",
54
- "sorry": "آسف",
55
- }
56
 
57
  # --- Translation Function ---
58
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
59
  """
60
- Translate text using Hugging Face Inference API with prompt engineering.
61
  """
62
  if not text.strip():
63
  return ""
64
 
65
  print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
66
 
67
- # For very short text, check our dictionary first
68
- if len(text.strip()) < 20 and text.lower().strip() in FALLBACK_PHRASES:
69
- return FALLBACK_PHRASES[text.lower().strip()]
70
-
71
- # Get full language name if available
72
  source_lang_name = LANGUAGE_MAP.get(source_lang, source_lang)
73
 
74
- # Construct our prompt with instructions for eloquent Arabic translation
75
  prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
76
  Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
77
  Adapt any cultural references or idioms appropriately rather than translating literally.
@@ -80,100 +66,98 @@ Ensure the translation reads naturally to a native Arabic speaker.
80
  Text to translate:
81
  {text}"""
82
 
83
- # Try multiple models in order of preference
84
- models_to_try = [
85
- "Helsinki-NLP/opus-mt-en-ar", # specialized English-Arabic translator
86
- "facebook/nllb-200-distilled-600M", # multilingual model
87
- "t5-base", # general-purpose model that can follow instructions
88
- "google/mt5-small" # small multilingual model
89
  ]
90
 
91
- for model in models_to_try:
92
  try:
93
- print(f"Attempting translation using Hugging Face model: {model}")
94
-
95
- # Update API URL for current model
96
  api_url = f"https://api-inference.huggingface.co/models/{model}"
97
 
98
- # Prepare request payload based on model type
99
- if "opus-mt" in model:
100
- # Helsinki NMT models use direct input
101
- payload = {"inputs": text}
102
- elif "nllb" in model:
103
- # NLLB models need language tags
104
- src_lang_code = source_lang if source_lang != "auto" else "eng_Latn"
105
  payload = {
106
  "inputs": text,
107
  "parameters": {
108
- "source_lang": src_lang_code,
109
- "target_lang": "arb_Arab"
110
  }
111
  }
 
 
112
  else:
113
- # T5 and other instruction-following models use our prompt
114
  payload = {"inputs": prompt}
115
 
116
- # Make the API call
117
- response = requests.post(api_url, headers=HF_HEADERS, json=payload, timeout=30)
118
 
119
- # Handle different response formats based on model
120
  if response.status_code == 200:
121
  result = response.json()
122
-
123
- # Extract translated text based on response structure
124
  translated_text = None
 
 
125
  if isinstance(result, list) and len(result) > 0:
126
- if isinstance(result[0], dict) and "generated_text" in result[0]:
127
- translated_text = result[0]["generated_text"]
128
- elif isinstance(result[0], dict) and "translation_text" in result[0]:
129
- translated_text = result[0]["translation_text"]
130
  else:
131
  translated_text = str(result[0])
132
- elif isinstance(result, dict) and "generated_text" in result:
133
- translated_text = result["generated_text"]
134
-
135
  if translated_text:
136
  print(f"Translation successful using {model}")
137
- # Apply post-processing
138
  return culturally_adapt_arabic(translated_text)
139
- else:
140
- print(f"Unexpected response format: {response.text}")
141
- continue # Try next model
142
  else:
143
- print(f"API error: {response.status_code}, {response.text}")
144
- continue # Try next model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
 
 
 
146
  except Exception as e:
147
- print(f"Error with model {model}: {e}")
148
- continue # Try next model
149
 
150
- # If all models failed, try LibreTranslate as a backup
151
- try:
152
- print("Attempting LibreTranslate API as backup")
153
- libre_api = "https://translate.terraprint.co/translate"
154
- payload = {
155
- "q": text,
156
- "source": source_lang if source_lang != "auto" else "auto",
157
- "target": target_lang,
158
- "format": "text"
159
- }
160
-
161
- response = requests.post(libre_api, json=payload, timeout=10)
162
- if response.status_code == 200:
163
- result = response.json()
164
- translated_text = result.get("translatedText")
165
- if translated_text:
166
- return culturally_adapt_arabic(translated_text)
167
- except Exception as e:
168
- print(f"LibreTranslate backup failed: {e}")
169
 
170
- # All translation attempts failed, use fallback
171
- fallback_text = FALLBACK_PHRASES.get(text.lower().strip()) if len(text.strip()) < 20 else None
172
 
173
- if fallback_text:
174
- return fallback_text
175
- else:
176
- return "عذراً، لم نتمكن من ترجمة النص. خدمة الترجمة غير متاحة حالياً."
177
 
178
  def culturally_adapt_arabic(text: str) -> str:
179
  """Apply post-processing rules to enhance Arabic translation with cultural sensitivity."""
@@ -184,7 +168,7 @@ def culturally_adapt_arabic(text: str) -> str:
184
  # --- Helper Functions ---
185
  async def extract_text_from_file(file: UploadFile) -> str:
186
  """Extracts text content from uploaded files without writing to disk."""
187
- content = await file.read() # Read file content into memory
188
  file_extension = os.path.splitext(file.filename)[1].lower()
189
  extracted_text = ""
190
 
@@ -212,7 +196,7 @@ async def extract_text_from_file(file: UploadFile) -> str:
212
  doc = docx.Document(doc_stream)
213
  extracted_text = '\n'.join([para.text for para in doc.paragraphs])
214
  except ImportError:
215
- raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library, which is not installed.")
216
 
217
  elif file_extension == '.pdf':
218
  try:
@@ -229,7 +213,7 @@ async def extract_text_from_file(file: UploadFile) -> str:
229
  extracted_text = "\n".join(page_texts)
230
  doc.close()
231
  except ImportError:
232
- raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library, which is not installed.")
233
 
234
  else:
235
  raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
@@ -237,21 +221,15 @@ async def extract_text_from_file(file: UploadFile) -> str:
237
  print(f"Extracted text length: {len(extracted_text)}")
238
  return extracted_text
239
 
240
- except HTTPException as e:
241
- raise e
242
  except Exception as e:
243
  print(f"Error processing file {file.filename}: {e}")
244
  traceback.print_exc()
245
- raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
246
 
247
  # --- API Endpoints ---
248
  @app.get("/", response_class=HTMLResponse)
249
  async def read_root(request: Request):
250
  """Serves the main HTML page."""
251
- if not os.path.exists(TEMPLATE_DIR):
252
- raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
253
- if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
254
- raise HTTPException(status_code=500, detail=f"index.html not found in {TEMPLATE_DIR}")
255
  return templates.TemplateResponse("index.html", {"request": request})
256
 
257
  @app.post("/translate/text")
@@ -264,17 +242,13 @@ async def translate_text_endpoint(
264
  if not text:
265
  raise HTTPException(status_code=400, detail="No text provided for translation.")
266
 
267
- if target_lang != "ar":
268
- raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
269
-
270
  try:
271
  translated_text = translate_text_internal(text, source_lang, target_lang)
272
  return JSONResponse(content={"translated_text": translated_text, "source_lang": source_lang})
273
- except HTTPException as http_exc:
274
- raise http_exc
275
  except Exception as e:
276
- print(f"Unexpected error in /translate/text: {e}")
277
- raise HTTPException(status_code=500, detail=f"An unexpected error occurred during text translation: {e}")
 
278
 
279
  @app.post("/translate/document")
280
  async def translate_document_endpoint(
@@ -282,10 +256,7 @@ async def translate_document_endpoint(
282
  source_lang: str = Form(...),
283
  target_lang: str = Form("ar")
284
  ):
285
- """Translates text extracted from an uploaded document without saving to disk."""
286
- if target_lang != "ar":
287
- raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")
288
-
289
  try:
290
  # Extract text directly from the uploaded file
291
  extracted_text = await extract_text_from_file(file)
@@ -305,11 +276,11 @@ async def translate_document_endpoint(
305
  except HTTPException as http_exc:
306
  raise http_exc
307
  except Exception as e:
308
- raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
 
 
309
 
310
  # --- Run the server (for local development) ---
311
  if __name__ == "__main__":
312
  import uvicorn
313
- print(f"Template Directory: {TEMPLATE_DIR}")
314
- print(f"Static Directory: {STATIC_DIR}")
315
  uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
 
2
  from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
+ from typing import List, Optional
6
  import os
7
  import requests
8
  import json
 
16
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
17
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
18
 
19
+ # --- Initialize FastAPI ---
 
 
 
20
  app = FastAPI()
 
 
21
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
22
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
23
 
 
37
  "it": "Italian"
38
  }
39
 
40
+ # --- Free translation APIs ---
41
+ LIBRE_TRANSLATE_ENDPOINTS = [
42
+ "https://translate.terraprint.co/translate",
43
+ "https://libretranslate.de/translate",
44
+ "https://translate.argosopentech.com/translate"
45
+ ]
 
 
 
 
 
46
 
47
  # --- Translation Function ---
48
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
49
  """
50
+ Translate text using Hugging Face Inference API and LibreTranslate as backup
51
  """
52
  if not text.strip():
53
  return ""
54
 
55
  print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
56
 
57
+ # Get full language name for prompt
 
 
 
 
58
  source_lang_name = LANGUAGE_MAP.get(source_lang, source_lang)
59
 
60
+ # Construct our eloquent Arabic translation prompt
61
  prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
62
  Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
63
  Adapt any cultural references or idioms appropriately rather than translating literally.
 
66
  Text to translate:
67
  {text}"""
68
 
69
+ # Try Hugging Face Inference API with models that are reliably available on the free tier
70
+ hf_models = [
71
+ "facebook/m2m100_418M", # Very reliable multilingual model
72
+ "Helsinki-NLP/opus-mt-tc-big-en-ar" # Good for English to Arabic
 
 
73
  ]
74
 
75
+ for model in hf_models:
76
  try:
77
+ print(f"Attempting translation via Hugging Face Inference API: {model}")
 
 
78
  api_url = f"https://api-inference.huggingface.co/models/{model}"
79
 
80
+ # Different payloads based on model architecture
81
+ if "m2m" in model:
 
 
 
 
 
82
  payload = {
83
  "inputs": text,
84
  "parameters": {
85
+ "src_lang": source_lang.upper() if source_lang != "zh" else "ZH",
86
+ "tgt_lang": "AR"
87
  }
88
  }
89
+ elif "opus-mt" in model:
90
+ payload = {"inputs": text}
91
  else:
 
92
  payload = {"inputs": prompt}
93
 
94
+ # No auth header for public models on free tier
95
+ response = requests.post(api_url, json=payload, timeout=30)
96
 
 
97
  if response.status_code == 200:
98
  result = response.json()
 
 
99
  translated_text = None
100
+
101
+ # Extract text from various response formats
102
  if isinstance(result, list) and len(result) > 0:
103
+ if isinstance(result[0], dict):
104
+ translated_text = result[0].get("translation_text") or result[0].get("generated_text")
 
 
105
  else:
106
  translated_text = str(result[0])
107
+ elif isinstance(result, dict):
108
+ translated_text = result.get("translation_text") or result.get("generated_text")
109
+
110
  if translated_text:
111
  print(f"Translation successful using {model}")
 
112
  return culturally_adapt_arabic(translated_text)
113
+
114
+ print(f"Unexpected response format: {response.text}")
 
115
  else:
116
+ print(f"API error: {response.status_code}")
117
+
118
+ except Exception as e:
119
+ print(f"Error with Hugging Face model {model}: {e}")
120
+
121
+ # If Hugging Face fails, try LibreTranslate
122
+ for endpoint in LIBRE_TRANSLATE_ENDPOINTS:
123
+ try:
124
+ print(f"Attempting translation using LibreTranslate: {endpoint}")
125
+ payload = {
126
+ "q": text,
127
+ "source": source_lang if source_lang != "auto" else "auto",
128
+ "target": target_lang,
129
+ "format": "text"
130
+ }
131
+
132
+ response = requests.post(endpoint, json=payload, timeout=10)
133
+
134
+ if response.status_code == 200:
135
+ result = response.json()
136
+ translated_text = result.get("translatedText")
137
 
138
+ if translated_text:
139
+ print(f"Translation successful using LibreTranslate {endpoint}")
140
+ return culturally_adapt_arabic(translated_text)
141
  except Exception as e:
142
+ print(f"Error with LibreTranslate {endpoint}: {e}")
 
143
 
144
+ # If all else fails, use a simple English-Arabic dictionary for common phrases
145
+ common_phrases = {
146
+ "hello": "مرحبا",
147
+ "thank you": "شكرا لك",
148
+ "goodbye": "مع السلامة",
149
+ "welcome": "أهلا وسهلا",
150
+ "yes": "نعم",
151
+ "no": "لا",
152
+ "please": "من فضلك",
153
+ "sorry": "آسف",
154
+ }
 
 
 
 
 
 
 
 
155
 
156
+ if text.lower().strip() in common_phrases:
157
+ return common_phrases[text.lower().strip()]
158
 
159
+ # Last resort message
160
+ return "عذراً، لم نتمكن من ترجمة النص بسبب خطأ فني. الرجاء المحاولة لاحقاً."
 
 
161
 
162
  def culturally_adapt_arabic(text: str) -> str:
163
  """Apply post-processing rules to enhance Arabic translation with cultural sensitivity."""
 
168
  # --- Helper Functions ---
169
  async def extract_text_from_file(file: UploadFile) -> str:
170
  """Extracts text content from uploaded files without writing to disk."""
171
+ content = await file.read()
172
  file_extension = os.path.splitext(file.filename)[1].lower()
173
  extracted_text = ""
174
 
 
196
  doc = docx.Document(doc_stream)
197
  extracted_text = '\n'.join([para.text for para in doc.paragraphs])
198
  except ImportError:
199
+ raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library")
200
 
201
  elif file_extension == '.pdf':
202
  try:
 
213
  extracted_text = "\n".join(page_texts)
214
  doc.close()
215
  except ImportError:
216
+ raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library")
217
 
218
  else:
219
  raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
 
221
  print(f"Extracted text length: {len(extracted_text)}")
222
  return extracted_text
223
 
 
 
224
  except Exception as e:
225
  print(f"Error processing file {file.filename}: {e}")
226
  traceback.print_exc()
227
+ raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
228
 
229
  # --- API Endpoints ---
230
  @app.get("/", response_class=HTMLResponse)
231
  async def read_root(request: Request):
232
  """Serves the main HTML page."""
 
 
 
 
233
  return templates.TemplateResponse("index.html", {"request": request})
234
 
235
  @app.post("/translate/text")
 
242
  if not text:
243
  raise HTTPException(status_code=400, detail="No text provided for translation.")
244
 
 
 
 
245
  try:
246
  translated_text = translate_text_internal(text, source_lang, target_lang)
247
  return JSONResponse(content={"translated_text": translated_text, "source_lang": source_lang})
 
 
248
  except Exception as e:
249
+ print(f"Translation error: {e}")
250
+ traceback.print_exc()
251
+ raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
252
 
253
  @app.post("/translate/document")
254
  async def translate_document_endpoint(
 
256
  source_lang: str = Form(...),
257
  target_lang: str = Form("ar")
258
  ):
259
+ """Translates text extracted from an uploaded document."""
 
 
 
260
  try:
261
  # Extract text directly from the uploaded file
262
  extracted_text = await extract_text_from_file(file)
 
276
  except HTTPException as http_exc:
277
  raise http_exc
278
  except Exception as e:
279
+ print(f"Document translation error: {e}")
280
+ traceback.print_exc()
281
+ raise HTTPException(status_code=500, detail=f"Document translation error: {str(e)}")
282
 
283
  # --- Run the server (for local development) ---
284
  if __name__ == "__main__":
285
  import uvicorn
 
 
286
  uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)