amine_dubs
committed on
Commit
·
7dfe957
1
Parent(s):
986397d
Use public HF models with custom prompt for eloquent Arabic translations
Browse files
- backend/main.py +85 -114
backend/main.py
CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
|
|
2 |
from fastapi.responses import HTMLResponse, JSONResponse
|
3 |
from fastapi.staticfiles import StaticFiles
|
4 |
from fastapi.templating import Jinja2Templates
|
5 |
-
from typing import List, Optional
|
6 |
import os
|
7 |
import requests
|
8 |
import json
|
@@ -16,13 +16,8 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
16 |
TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
|
17 |
STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
|
18 |
|
19 |
-
#
|
20 |
-
HF_API_URL = "https://api-inference.huggingface.co/models/t5-base"
|
21 |
-
HF_HEADERS = {"Authorization": "Bearer hf_api_key_placeholder"} # Replace with your API key or remove if using a free model
|
22 |
-
|
23 |
app = FastAPI()
|
24 |
-
|
25 |
-
# --- Mount Static Files and Templates ---
|
26 |
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
27 |
templates = Jinja2Templates(directory=TEMPLATE_DIR)
|
28 |
|
@@ -42,36 +37,27 @@ LANGUAGE_MAP = {
|
|
42 |
"it": "Italian"
|
43 |
}
|
44 |
|
45 |
-
# ---
|
46 |
-
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
|
51 |
-
"yes": "نعم",
|
52 |
-
"no": "لا",
|
53 |
-
"please": "من فضلك",
|
54 |
-
"sorry": "آسف",
|
55 |
-
}
|
56 |
|
57 |
# --- Translation Function ---
|
58 |
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
|
59 |
"""
|
60 |
-
Translate text using Hugging Face Inference API
|
61 |
"""
|
62 |
if not text.strip():
|
63 |
return ""
|
64 |
|
65 |
print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
|
66 |
|
67 |
-
#
|
68 |
-
if len(text.strip()) < 20 and text.lower().strip() in FALLBACK_PHRASES:
|
69 |
-
return FALLBACK_PHRASES[text.lower().strip()]
|
70 |
-
|
71 |
-
# Get full language name if available
|
72 |
source_lang_name = LANGUAGE_MAP.get(source_lang, source_lang)
|
73 |
|
74 |
-
# Construct our
|
75 |
prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
|
76 |
Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
|
77 |
Adapt any cultural references or idioms appropriately rather than translating literally.
|
@@ -80,100 +66,98 @@ Ensure the translation reads naturally to a native Arabic speaker.
|
|
80 |
Text to translate:
|
81 |
{text}"""
|
82 |
|
83 |
-
# Try
|
84 |
-
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"t5-base", # general-purpose model that can follow instructions
|
88 |
-
"google/mt5-small" # small multilingual model
|
89 |
]
|
90 |
|
91 |
-
for model in
|
92 |
try:
|
93 |
-
print(f"Attempting translation
|
94 |
-
|
95 |
-
# Update API URL for current model
|
96 |
api_url = f"https://api-inference.huggingface.co/models/{model}"
|
97 |
|
98 |
-
#
|
99 |
-
if "
|
100 |
-
# Helsinki NMT models use direct input
|
101 |
-
payload = {"inputs": text}
|
102 |
-
elif "nllb" in model:
|
103 |
-
# NLLB models need language tags
|
104 |
-
src_lang_code = source_lang if source_lang != "auto" else "eng_Latn"
|
105 |
payload = {
|
106 |
"inputs": text,
|
107 |
"parameters": {
|
108 |
-
"
|
109 |
-
"
|
110 |
}
|
111 |
}
|
|
|
|
|
112 |
else:
|
113 |
-
# T5 and other instruction-following models use our prompt
|
114 |
payload = {"inputs": prompt}
|
115 |
|
116 |
-
#
|
117 |
-
response = requests.post(api_url,
|
118 |
|
119 |
-
# Handle different response formats based on model
|
120 |
if response.status_code == 200:
|
121 |
result = response.json()
|
122 |
-
|
123 |
-
# Extract translated text based on response structure
|
124 |
translated_text = None
|
|
|
|
|
125 |
if isinstance(result, list) and len(result) > 0:
|
126 |
-
if isinstance(result[0], dict)
|
127 |
-
translated_text = result[0]["generated_text"
|
128 |
-
elif isinstance(result[0], dict) and "translation_text" in result[0]:
|
129 |
-
translated_text = result[0]["translation_text"]
|
130 |
else:
|
131 |
translated_text = str(result[0])
|
132 |
-
elif isinstance(result, dict)
|
133 |
-
translated_text = result
|
134 |
-
|
135 |
if translated_text:
|
136 |
print(f"Translation successful using {model}")
|
137 |
-
# Apply post-processing
|
138 |
return culturally_adapt_arabic(translated_text)
|
139 |
-
|
140 |
-
|
141 |
-
continue # Try next model
|
142 |
else:
|
143 |
-
print(f"API error: {response.status_code}
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
|
|
|
|
|
|
146 |
except Exception as e:
|
147 |
-
print(f"Error with
|
148 |
-
continue # Try next model
|
149 |
|
150 |
-
# If all
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
response = requests.post(libre_api, json=payload, timeout=10)
|
162 |
-
if response.status_code == 200:
|
163 |
-
result = response.json()
|
164 |
-
translated_text = result.get("translatedText")
|
165 |
-
if translated_text:
|
166 |
-
return culturally_adapt_arabic(translated_text)
|
167 |
-
except Exception as e:
|
168 |
-
print(f"LibreTranslate backup failed: {e}")
|
169 |
|
170 |
-
|
171 |
-
|
172 |
|
173 |
-
|
174 |
-
|
175 |
-
else:
|
176 |
-
return "عذراً، لم نتمكن من ترجمة النص. خدمة الترجمة غير متاحة حالياً."
|
177 |
|
178 |
def culturally_adapt_arabic(text: str) -> str:
|
179 |
"""Apply post-processing rules to enhance Arabic translation with cultural sensitivity."""
|
@@ -184,7 +168,7 @@ def culturally_adapt_arabic(text: str) -> str:
|
|
184 |
# --- Helper Functions ---
|
185 |
async def extract_text_from_file(file: UploadFile) -> str:
|
186 |
"""Extracts text content from uploaded files without writing to disk."""
|
187 |
-
content = await file.read()
|
188 |
file_extension = os.path.splitext(file.filename)[1].lower()
|
189 |
extracted_text = ""
|
190 |
|
@@ -212,7 +196,7 @@ async def extract_text_from_file(file: UploadFile) -> str:
|
|
212 |
doc = docx.Document(doc_stream)
|
213 |
extracted_text = '\n'.join([para.text for para in doc.paragraphs])
|
214 |
except ImportError:
|
215 |
-
raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library
|
216 |
|
217 |
elif file_extension == '.pdf':
|
218 |
try:
|
@@ -229,7 +213,7 @@ async def extract_text_from_file(file: UploadFile) -> str:
|
|
229 |
extracted_text = "\n".join(page_texts)
|
230 |
doc.close()
|
231 |
except ImportError:
|
232 |
-
raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library
|
233 |
|
234 |
else:
|
235 |
raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
|
@@ -237,21 +221,15 @@ async def extract_text_from_file(file: UploadFile) -> str:
|
|
237 |
print(f"Extracted text length: {len(extracted_text)}")
|
238 |
return extracted_text
|
239 |
|
240 |
-
except HTTPException as e:
|
241 |
-
raise e
|
242 |
except Exception as e:
|
243 |
print(f"Error processing file {file.filename}: {e}")
|
244 |
traceback.print_exc()
|
245 |
-
raise HTTPException(status_code=500, detail=f"
|
246 |
|
247 |
# --- API Endpoints ---
|
248 |
@app.get("/", response_class=HTMLResponse)
|
249 |
async def read_root(request: Request):
|
250 |
"""Serves the main HTML page."""
|
251 |
-
if not os.path.exists(TEMPLATE_DIR):
|
252 |
-
raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
|
253 |
-
if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
|
254 |
-
raise HTTPException(status_code=500, detail=f"index.html not found in {TEMPLATE_DIR}")
|
255 |
return templates.TemplateResponse("index.html", {"request": request})
|
256 |
|
257 |
@app.post("/translate/text")
|
@@ -264,17 +242,13 @@ async def translate_text_endpoint(
|
|
264 |
if not text:
|
265 |
raise HTTPException(status_code=400, detail="No text provided for translation.")
|
266 |
|
267 |
-
if target_lang != "ar":
|
268 |
-
raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
|
269 |
-
|
270 |
try:
|
271 |
translated_text = translate_text_internal(text, source_lang, target_lang)
|
272 |
return JSONResponse(content={"translated_text": translated_text, "source_lang": source_lang})
|
273 |
-
except HTTPException as http_exc:
|
274 |
-
raise http_exc
|
275 |
except Exception as e:
|
276 |
-
print(f"
|
277 |
-
|
|
|
278 |
|
279 |
@app.post("/translate/document")
|
280 |
async def translate_document_endpoint(
|
@@ -282,10 +256,7 @@ async def translate_document_endpoint(
|
|
282 |
source_lang: str = Form(...),
|
283 |
target_lang: str = Form("ar")
|
284 |
):
|
285 |
-
"""Translates text extracted from an uploaded document
|
286 |
-
if target_lang != "ar":
|
287 |
-
raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")
|
288 |
-
|
289 |
try:
|
290 |
# Extract text directly from the uploaded file
|
291 |
extracted_text = await extract_text_from_file(file)
|
@@ -305,11 +276,11 @@ async def translate_document_endpoint(
|
|
305 |
except HTTPException as http_exc:
|
306 |
raise http_exc
|
307 |
except Exception as e:
|
308 |
-
|
|
|
|
|
309 |
|
310 |
# --- Run the server (for local development) ---
|
311 |
if __name__ == "__main__":
|
312 |
import uvicorn
|
313 |
-
print(f"Template Directory: {TEMPLATE_DIR}")
|
314 |
-
print(f"Static Directory: {STATIC_DIR}")
|
315 |
uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|
|
|
2 |
from fastapi.responses import HTMLResponse, JSONResponse
|
3 |
from fastapi.staticfiles import StaticFiles
|
4 |
from fastapi.templating import Jinja2Templates
|
5 |
+
from typing import List, Optional
|
6 |
import os
|
7 |
import requests
|
8 |
import json
|
|
|
16 |
TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
|
17 |
STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
|
18 |
|
19 |
+
# --- Initialize FastAPI ---
|
|
|
|
|
|
|
20 |
app = FastAPI()
|
|
|
|
|
21 |
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
22 |
templates = Jinja2Templates(directory=TEMPLATE_DIR)
|
23 |
|
|
|
37 |
"it": "Italian"
|
38 |
}
|
39 |
|
40 |
+
# --- Free translation APIs ---
|
41 |
+
LIBRE_TRANSLATE_ENDPOINTS = [
|
42 |
+
"https://translate.terraprint.co/translate",
|
43 |
+
"https://libretranslate.de/translate",
|
44 |
+
"https://translate.argosopentech.com/translate"
|
45 |
+
]
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# --- Translation Function ---
|
48 |
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
|
49 |
"""
|
50 |
+
Translate text using Hugging Face Inference API and LibreTranslate as backup
|
51 |
"""
|
52 |
if not text.strip():
|
53 |
return ""
|
54 |
|
55 |
print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
|
56 |
|
57 |
+
# Get full language name for prompt
|
|
|
|
|
|
|
|
|
58 |
source_lang_name = LANGUAGE_MAP.get(source_lang, source_lang)
|
59 |
|
60 |
+
# Construct our eloquent Arabic translation prompt
|
61 |
prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
|
62 |
Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
|
63 |
Adapt any cultural references or idioms appropriately rather than translating literally.
|
|
|
66 |
Text to translate:
|
67 |
{text}"""
|
68 |
|
69 |
+
# Try Hugging Face Inference API with models that are reliably available on the free tier
|
70 |
+
hf_models = [
|
71 |
+
"facebook/m2m100_418M", # Very reliable multilingual model
|
72 |
+
"Helsinki-NLP/opus-mt-tc-big-en-ar" # Good for English to Arabic
|
|
|
|
|
73 |
]
|
74 |
|
75 |
+
for model in hf_models:
|
76 |
try:
|
77 |
+
print(f"Attempting translation via Hugging Face Inference API: {model}")
|
|
|
|
|
78 |
api_url = f"https://api-inference.huggingface.co/models/{model}"
|
79 |
|
80 |
+
# Different payloads based on model architecture
|
81 |
+
if "m2m" in model:
|
|
|
|
|
|
|
|
|
|
|
82 |
payload = {
|
83 |
"inputs": text,
|
84 |
"parameters": {
|
85 |
+
"src_lang": source_lang.upper() if source_lang != "zh" else "ZH",
|
86 |
+
"tgt_lang": "AR"
|
87 |
}
|
88 |
}
|
89 |
+
elif "opus-mt" in model:
|
90 |
+
payload = {"inputs": text}
|
91 |
else:
|
|
|
92 |
payload = {"inputs": prompt}
|
93 |
|
94 |
+
# No auth header for public models on free tier
|
95 |
+
response = requests.post(api_url, json=payload, timeout=30)
|
96 |
|
|
|
97 |
if response.status_code == 200:
|
98 |
result = response.json()
|
|
|
|
|
99 |
translated_text = None
|
100 |
+
|
101 |
+
# Extract text from various response formats
|
102 |
if isinstance(result, list) and len(result) > 0:
|
103 |
+
if isinstance(result[0], dict):
|
104 |
+
translated_text = result[0].get("translation_text") or result[0].get("generated_text")
|
|
|
|
|
105 |
else:
|
106 |
translated_text = str(result[0])
|
107 |
+
elif isinstance(result, dict):
|
108 |
+
translated_text = result.get("translation_text") or result.get("generated_text")
|
109 |
+
|
110 |
if translated_text:
|
111 |
print(f"Translation successful using {model}")
|
|
|
112 |
return culturally_adapt_arabic(translated_text)
|
113 |
+
|
114 |
+
print(f"Unexpected response format: {response.text}")
|
|
|
115 |
else:
|
116 |
+
print(f"API error: {response.status_code}")
|
117 |
+
|
118 |
+
except Exception as e:
|
119 |
+
print(f"Error with Hugging Face model {model}: {e}")
|
120 |
+
|
121 |
+
# If Hugging Face fails, try LibreTranslate
|
122 |
+
for endpoint in LIBRE_TRANSLATE_ENDPOINTS:
|
123 |
+
try:
|
124 |
+
print(f"Attempting translation using LibreTranslate: {endpoint}")
|
125 |
+
payload = {
|
126 |
+
"q": text,
|
127 |
+
"source": source_lang if source_lang != "auto" else "auto",
|
128 |
+
"target": target_lang,
|
129 |
+
"format": "text"
|
130 |
+
}
|
131 |
+
|
132 |
+
response = requests.post(endpoint, json=payload, timeout=10)
|
133 |
+
|
134 |
+
if response.status_code == 200:
|
135 |
+
result = response.json()
|
136 |
+
translated_text = result.get("translatedText")
|
137 |
|
138 |
+
if translated_text:
|
139 |
+
print(f"Translation successful using LibreTranslate {endpoint}")
|
140 |
+
return culturally_adapt_arabic(translated_text)
|
141 |
except Exception as e:
|
142 |
+
print(f"Error with LibreTranslate {endpoint}: {e}")
|
|
|
143 |
|
144 |
+
# If all else fails, use a simple English-Arabic dictionary for common phrases
|
145 |
+
common_phrases = {
|
146 |
+
"hello": "مرحبا",
|
147 |
+
"thank you": "شكرا لك",
|
148 |
+
"goodbye": "مع السلامة",
|
149 |
+
"welcome": "أهلا وسهلا",
|
150 |
+
"yes": "نعم",
|
151 |
+
"no": "لا",
|
152 |
+
"please": "من فضلك",
|
153 |
+
"sorry": "آسف",
|
154 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
+
if text.lower().strip() in common_phrases:
|
157 |
+
return common_phrases[text.lower().strip()]
|
158 |
|
159 |
+
# Last resort message
|
160 |
+
return "عذراً، لم نتمكن من ترجمة النص بسبب خطأ فني. الرجاء المحاولة لاحقاً."
|
|
|
|
|
161 |
|
162 |
def culturally_adapt_arabic(text: str) -> str:
|
163 |
"""Apply post-processing rules to enhance Arabic translation with cultural sensitivity."""
|
|
|
168 |
# --- Helper Functions ---
|
169 |
async def extract_text_from_file(file: UploadFile) -> str:
|
170 |
"""Extracts text content from uploaded files without writing to disk."""
|
171 |
+
content = await file.read()
|
172 |
file_extension = os.path.splitext(file.filename)[1].lower()
|
173 |
extracted_text = ""
|
174 |
|
|
|
196 |
doc = docx.Document(doc_stream)
|
197 |
extracted_text = '\n'.join([para.text for para in doc.paragraphs])
|
198 |
except ImportError:
|
199 |
+
raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library")
|
200 |
|
201 |
elif file_extension == '.pdf':
|
202 |
try:
|
|
|
213 |
extracted_text = "\n".join(page_texts)
|
214 |
doc.close()
|
215 |
except ImportError:
|
216 |
+
raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library")
|
217 |
|
218 |
else:
|
219 |
raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
|
|
|
221 |
print(f"Extracted text length: {len(extracted_text)}")
|
222 |
return extracted_text
|
223 |
|
|
|
|
|
224 |
except Exception as e:
|
225 |
print(f"Error processing file {file.filename}: {e}")
|
226 |
traceback.print_exc()
|
227 |
+
raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
228 |
|
229 |
# --- API Endpoints ---
|
230 |
@app.get("/", response_class=HTMLResponse)
|
231 |
async def read_root(request: Request):
|
232 |
"""Serves the main HTML page."""
|
|
|
|
|
|
|
|
|
233 |
return templates.TemplateResponse("index.html", {"request": request})
|
234 |
|
235 |
@app.post("/translate/text")
|
|
|
242 |
if not text:
|
243 |
raise HTTPException(status_code=400, detail="No text provided for translation.")
|
244 |
|
|
|
|
|
|
|
245 |
try:
|
246 |
translated_text = translate_text_internal(text, source_lang, target_lang)
|
247 |
return JSONResponse(content={"translated_text": translated_text, "source_lang": source_lang})
|
|
|
|
|
248 |
except Exception as e:
|
249 |
+
print(f"Translation error: {e}")
|
250 |
+
traceback.print_exc()
|
251 |
+
raise HTTPException(status_code=500, detail=f"Translation error: {str(e)}")
|
252 |
|
253 |
@app.post("/translate/document")
|
254 |
async def translate_document_endpoint(
|
|
|
256 |
source_lang: str = Form(...),
|
257 |
target_lang: str = Form("ar")
|
258 |
):
|
259 |
+
"""Translates text extracted from an uploaded document."""
|
|
|
|
|
|
|
260 |
try:
|
261 |
# Extract text directly from the uploaded file
|
262 |
extracted_text = await extract_text_from_file(file)
|
|
|
276 |
except HTTPException as http_exc:
|
277 |
raise http_exc
|
278 |
except Exception as e:
|
279 |
+
print(f"Document translation error: {e}")
|
280 |
+
traceback.print_exc()
|
281 |
+
raise HTTPException(status_code=500, detail=f"Document translation error: {str(e)}")
|
282 |
|
283 |
# --- Run the server (for local development) ---
|
284 |
if __name__ == "__main__":
|
285 |
import uvicorn
|
|
|
|
|
286 |
uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|