amine_dubs
committed on
Commit
·
7b65e1e
1
Parent(s):
a95a188
Switch to Hugging Face Inference API with in-memory file processing
Browse files- backend/main.py +161 -200
- backend/requirements.txt +1 -3
backend/main.py
CHANGED
@@ -2,13 +2,12 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
|
|
2 |
from fastapi.responses import HTMLResponse, JSONResponse
|
3 |
from fastapi.staticfiles import StaticFiles
|
4 |
from fastapi.templating import Jinja2Templates
|
5 |
-
from typing import List, Optional
|
6 |
-
import shutil
|
7 |
import os
|
8 |
import requests
|
9 |
import json
|
10 |
import traceback
|
11 |
-
import
|
12 |
|
13 |
# --- Configuration ---
|
14 |
# Determine the base directory of the main.py script
|
@@ -16,14 +15,10 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
16 |
# Adjust paths to go one level up from backend to find templates/static
|
17 |
TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
|
18 |
STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
|
19 |
-
UPLOAD_DIR = "/app/uploads" # Ensure this matches Dockerfile WORKDIR + uploads
|
20 |
|
21 |
-
#
|
22 |
-
|
23 |
-
|
24 |
-
"https://libretranslate.de/translate", # Backup endpoint 1
|
25 |
-
"https://translate.argosopentech.com/translate" # Backup endpoint 2
|
26 |
-
]
|
27 |
|
28 |
app = FastAPI()
|
29 |
|
@@ -31,6 +26,22 @@ app = FastAPI()
|
|
31 |
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
32 |
templates = Jinja2Templates(directory=TEMPLATE_DIR)
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
# --- Fallback dictionary for common phrases ---
|
35 |
FALLBACK_PHRASES = {
|
36 |
"hello": "مرحبا",
|
@@ -46,73 +57,118 @@ FALLBACK_PHRASES = {
|
|
46 |
# --- Translation Function ---
|
47 |
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
|
48 |
"""
|
49 |
-
Translate text using
|
50 |
"""
|
|
|
|
|
|
|
51 |
print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
|
52 |
|
53 |
-
# Map source language codes to full language names
|
54 |
-
language_map = {
|
55 |
-
"en": "English",
|
56 |
-
"fr": "French",
|
57 |
-
"es": "Spanish",
|
58 |
-
"de": "German",
|
59 |
-
"zh": "Chinese",
|
60 |
-
"ru": "Russian",
|
61 |
-
"ja": "Japanese",
|
62 |
-
"hi": "Hindi",
|
63 |
-
"pt": "Portuguese",
|
64 |
-
"tr": "Turkish",
|
65 |
-
"ko": "Korean",
|
66 |
-
"it": "Italian"
|
67 |
-
}
|
68 |
-
|
69 |
# For very short text, check our dictionary first
|
70 |
-
if len(text.strip()) <
|
71 |
return FALLBACK_PHRASES[text.lower().strip()]
|
72 |
|
73 |
-
#
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
try:
|
76 |
-
print(f"Attempting translation using
|
77 |
|
78 |
-
#
|
79 |
-
|
80 |
-
"q": text,
|
81 |
-
"source": source_lang if source_lang != "auto" else "auto",
|
82 |
-
"target": target_lang,
|
83 |
-
"format": "text"
|
84 |
-
}
|
85 |
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
# Make the API call
|
89 |
-
response = requests.post(api_url,
|
90 |
|
|
|
91 |
if response.status_code == 200:
|
92 |
result = response.json()
|
93 |
-
translated_text = result.get("translatedText")
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
-
|
|
|
|
|
|
|
103 |
else:
|
104 |
-
print(f"
|
105 |
-
continue # Try next
|
106 |
else:
|
107 |
-
print(f"
|
108 |
-
continue # Try next
|
109 |
|
110 |
except Exception as e:
|
111 |
-
print(f"Error with
|
112 |
-
continue # Try next
|
113 |
|
114 |
-
# If all
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
if fallback_text:
|
118 |
return fallback_text
|
@@ -127,92 +183,71 @@ def culturally_adapt_arabic(text: str) -> str:
|
|
127 |
|
128 |
# --- Helper Functions ---
|
129 |
async def extract_text_from_file(file: UploadFile) -> str:
|
130 |
-
"""Extracts text content from
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
# Secure filename and define path
|
136 |
-
# Use a temporary filename to avoid collisions and complex sanitization
|
137 |
-
# Make sure the filename is safe for the filesystem
|
138 |
-
safe_filename = os.path.basename(file.filename) # Basic safety
|
139 |
-
temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{safe_filename}")
|
140 |
-
print(f"Attempting to save uploaded file to: {temp_file_path}")
|
141 |
-
extracted_text = "" # Initialize extracted_text
|
142 |
|
143 |
try:
|
144 |
-
# Save the uploaded file temporarily
|
145 |
-
# Use async file writing if possible with a library like aiofiles,
|
146 |
-
# but standard file I/O is often sufficient here.
|
147 |
-
with open(temp_file_path, "wb") as buffer:
|
148 |
-
content = await file.read() # Read content
|
149 |
-
buffer.write(content) # Write to file
|
150 |
-
print(f"File saved successfully to: {temp_file_path}")
|
151 |
-
|
152 |
-
# Determine file type and extract text
|
153 |
-
file_extension = os.path.splitext(safe_filename)[1].lower()
|
154 |
-
|
155 |
if file_extension == '.txt':
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
elif file_extension == '.docx':
|
159 |
try:
|
160 |
import docx
|
161 |
-
|
162 |
-
|
|
|
|
|
|
|
|
|
163 |
except ImportError:
|
164 |
raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library, which is not installed.")
|
165 |
-
|
166 |
-
raise HTTPException(status_code=500, detail=f"Error reading DOCX file: {e}")
|
167 |
elif file_extension == '.pdf':
|
168 |
try:
|
169 |
-
import fitz
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
172 |
for page in doc:
|
173 |
-
|
|
|
174 |
doc.close()
|
175 |
except ImportError:
|
176 |
-
|
177 |
-
|
178 |
-
raise HTTPException(status_code=500, detail=f"Error reading PDF file: {e}")
|
179 |
-
# Add support for other types (pptx, xlsx) similarly if needed
|
180 |
-
# elif file_extension == '.pptx': ...
|
181 |
-
# elif file_extension == '.xlsx': ...
|
182 |
else:
|
183 |
raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
|
184 |
|
185 |
print(f"Extracted text length: {len(extracted_text)}")
|
186 |
-
return extracted_text
|
187 |
|
188 |
-
except IOError as e:
|
189 |
-
print(f"IOError saving/reading file {temp_file_path}: {e}")
|
190 |
-
# Check permissions specifically
|
191 |
-
if e.errno == 13: # Permission denied
|
192 |
-
raise HTTPException(status_code=500, detail=f"Permission denied writing to {temp_file_path}. Check container permissions for {UPLOAD_DIR}.")
|
193 |
-
raise HTTPException(status_code=500, detail=f"Error saving/accessing uploaded file: {e}")
|
194 |
except HTTPException as e:
|
195 |
-
# Re-raise HTTPExceptions directly
|
196 |
raise e
|
197 |
except Exception as e:
|
198 |
print(f"Error processing file {file.filename}: {e}")
|
199 |
traceback.print_exc()
|
200 |
raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
|
201 |
-
finally:
|
202 |
-
# Clean up the temporary file
|
203 |
-
if os.path.exists(temp_file_path):
|
204 |
-
try:
|
205 |
-
os.remove(temp_file_path)
|
206 |
-
print(f"Temporary file removed: {temp_file_path}")
|
207 |
-
except OSError as e:
|
208 |
-
# Log error but don't crash the request if cleanup fails
|
209 |
-
print(f"Error removing temporary file {temp_file_path}: {e}")
|
210 |
|
211 |
# --- API Endpoints ---
|
212 |
@app.get("/", response_class=HTMLResponse)
|
213 |
async def read_root(request: Request):
|
214 |
"""Serves the main HTML page."""
|
215 |
-
# Ensure templates directory exists before trying to render
|
216 |
if not os.path.exists(TEMPLATE_DIR):
|
217 |
raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
|
218 |
if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
|
@@ -222,30 +257,20 @@ async def read_root(request: Request):
|
|
222 |
@app.post("/translate/text")
|
223 |
async def translate_text_endpoint(
|
224 |
text: str = Form(...),
|
225 |
-
source_lang: str = Form(...),
|
226 |
-
target_lang: str = Form("ar")
|
227 |
):
|
228 |
"""Translates direct text input."""
|
229 |
if not text:
|
230 |
raise HTTPException(status_code=400, detail="No text provided for translation.")
|
231 |
-
|
232 |
-
# if target_lang != "ar" and source_lang != "ar":
|
233 |
-
# raise HTTPException(status_code=400, detail="Translation must involve Arabic (either as source or target). Specify 'ar' in source_lang or target_lang.")
|
234 |
-
|
235 |
-
# Simplified: For now, stick to the primary goal: other -> Arabic
|
236 |
if target_lang != "ar":
|
237 |
raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
|
238 |
|
239 |
try:
|
240 |
-
|
241 |
-
|
242 |
-
# if source_lang == 'auto':
|
243 |
-
# actual_source_lang = detect_language(text) # Needs implementation
|
244 |
-
|
245 |
-
translated_text = translate_text_internal(text, actual_source_lang, target_lang)
|
246 |
-
return JSONResponse(content={"translated_text": translated_text, "source_lang": actual_source_lang})
|
247 |
except HTTPException as http_exc:
|
248 |
-
# Re-raise HTTP exceptions from internal functions
|
249 |
raise http_exc
|
250 |
except Exception as e:
|
251 |
print(f"Unexpected error in /translate/text: {e}")
|
@@ -254,101 +279,37 @@ async def translate_text_endpoint(
|
|
254 |
@app.post("/translate/document")
|
255 |
async def translate_document_endpoint(
|
256 |
file: UploadFile = File(...),
|
257 |
-
source_lang: str = Form(...),
|
258 |
-
target_lang: str = Form("ar")
|
259 |
):
|
260 |
-
"""Translates text extracted from an uploaded document."""
|
261 |
-
# Allow translation to Arabic or from Arabic
|
262 |
-
# if target_lang != "ar" and source_lang != "ar":
|
263 |
-
# raise HTTPException(status_code=400, detail="Document translation must involve Arabic (either as source or target). Specify 'ar' in source_lang or target_lang.")
|
264 |
-
|
265 |
-
# Simplified: For now, stick to the primary goal: other -> Arabic
|
266 |
if target_lang != "ar":
|
267 |
raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")
|
268 |
|
269 |
-
# Ensure upload directory exists
|
270 |
-
if not os.path.exists(UPLOAD_DIR):
|
271 |
-
try:
|
272 |
-
os.makedirs(UPLOAD_DIR)
|
273 |
-
except OSError as e:
|
274 |
-
raise HTTPException(status_code=500, detail=f"Could not create upload directory: {e}")
|
275 |
-
|
276 |
-
# Create a safe temporary file path
|
277 |
-
temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{file.filename}")
|
278 |
-
|
279 |
try:
|
280 |
-
#
|
281 |
-
with open(temp_file_path, "wb") as buffer:
|
282 |
-
shutil.copyfileobj(file.file, buffer)
|
283 |
-
|
284 |
-
# Extract text based on content type
|
285 |
extracted_text = await extract_text_from_file(file)
|
286 |
-
|
287 |
-
|
288 |
if not extracted_text:
|
289 |
-
# This case might be less likely if extract_text_from_file handles errors robustly
|
290 |
-
# but keep it as a safeguard.
|
291 |
-
if os.path.exists(temp_file_path):
|
292 |
-
os.remove(temp_file_path)
|
293 |
raise HTTPException(status_code=400, detail="Could not extract any text from the document.")
|
294 |
|
295 |
-
# Determine actual source language if 'auto' (requires model/library support)
|
296 |
-
actual_source_lang = source_lang # Placeholder
|
297 |
-
# if source_lang == 'auto':
|
298 |
-
# actual_source_lang = detect_language(extracted_text) # Needs implementation
|
299 |
-
|
300 |
# Translate the extracted text
|
301 |
-
translated_text = translate_text_internal(extracted_text,
|
302 |
-
|
303 |
-
# Clean up the temporary file *after* successful processing
|
304 |
-
if os.path.exists(temp_file_path):
|
305 |
-
os.remove(temp_file_path)
|
306 |
|
307 |
return JSONResponse(content={
|
308 |
"original_filename": file.filename,
|
309 |
-
"detected_source_lang":
|
310 |
"translated_text": translated_text
|
311 |
})
|
312 |
|
313 |
except HTTPException as http_exc:
|
314 |
-
|
315 |
-
if os.path.exists(temp_file_path):
|
316 |
-
try:
|
317 |
-
os.remove(temp_file_path)
|
318 |
-
except:
|
319 |
-
pass
|
320 |
-
raise http_exc # Re-raise the exception
|
321 |
except Exception as e:
|
322 |
-
# Clean up temp file on unexpected errors
|
323 |
-
if os.path.exists(temp_file_path):
|
324 |
-
try:
|
325 |
-
os.remove(temp_file_path)
|
326 |
-
except:
|
327 |
-
pass
|
328 |
raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
|
329 |
|
330 |
-
# --- Optional: Add endpoint for reverse translation (Arabic to other) ---
|
331 |
-
# @app.post("/translate/reverse")
|
332 |
-
# async def translate_reverse_endpoint(text: str = Form(...), target_lang: str = Form(...)):
|
333 |
-
# # Implement logic similar to translate_text_endpoint but with source="ar"
|
334 |
-
# # You'll need a model capable of ar -> target_lang translation
|
335 |
-
# pass
|
336 |
-
|
337 |
# --- Run the server (for local development) ---
|
338 |
if __name__ == "__main__":
|
339 |
import uvicorn
|
340 |
-
# Make sure to install PyMuPDF, python-docx etc. if testing locally:
|
341 |
-
# pip install -r requirements.txt (from backend directory)
|
342 |
print(f"Template Directory: {TEMPLATE_DIR}")
|
343 |
print(f"Static Directory: {STATIC_DIR}")
|
344 |
-
print(f"Upload Directory: {UPLOAD_DIR}")
|
345 |
-
# Ensure necessary directories exist for local run
|
346 |
-
if not os.path.exists(TEMPLATE_DIR): os.makedirs(TEMPLATE_DIR)
|
347 |
-
if not os.path.exists(STATIC_DIR): os.makedirs(STATIC_DIR)
|
348 |
-
if not os.path.exists(UPLOAD_DIR): os.makedirs(UPLOAD_DIR)
|
349 |
-
# Create dummy index.html if it doesn't exist for local run
|
350 |
-
if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
|
351 |
-
with open(os.path.join(TEMPLATE_DIR, "index.html"), "w") as f:
|
352 |
-
f.write("<html><body><h1>Placeholder Frontend</h1></body></html>")
|
353 |
-
|
354 |
uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|
|
|
2 |
from fastapi.responses import HTMLResponse, JSONResponse
|
3 |
from fastapi.staticfiles import StaticFiles
|
4 |
from fastapi.templating import Jinja2Templates
|
5 |
+
from typing import List, Optional, Dict, Any
|
|
|
6 |
import os
|
7 |
import requests
|
8 |
import json
|
9 |
import traceback
|
10 |
+
import io
|
11 |
|
12 |
# --- Configuration ---
|
13 |
# Determine the base directory of the main.py script
|
|
|
15 |
# Adjust paths to go one level up from backend to find templates/static
|
16 |
TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
|
17 |
STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
|
|
|
18 |
|
19 |
+
# Hugging Face API configurations
# Default model endpoint, kept for backward compatibility; the translation
# function visible in this file builds its own per-model URL.
HF_API_URL = "https://api-inference.huggingface.co/models/t5-base"

# Read the access token from the environment instead of hard-coding it in
# source control. When HF_API_TOKEN is unset or blank, no Authorization header
# is sent at all, so requests go out unauthenticated rather than carrying a
# bogus placeholder token.
_HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "").strip()
HF_HEADERS = {"Authorization": f"Bearer {_HF_API_TOKEN}"} if _HF_API_TOKEN else {}
|
|
|
|
|
|
|
22 |
|
23 |
app = FastAPI()
|
24 |
|
|
|
26 |
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
27 |
templates = Jinja2Templates(directory=TEMPLATE_DIR)
|
28 |
|
29 |
+
# --- Language mapping ---
|
30 |
+
LANGUAGE_MAP = {
|
31 |
+
"en": "English",
|
32 |
+
"fr": "French",
|
33 |
+
"es": "Spanish",
|
34 |
+
"de": "German",
|
35 |
+
"zh": "Chinese",
|
36 |
+
"ru": "Russian",
|
37 |
+
"ja": "Japanese",
|
38 |
+
"hi": "Hindi",
|
39 |
+
"pt": "Portuguese",
|
40 |
+
"tr": "Turkish",
|
41 |
+
"ko": "Korean",
|
42 |
+
"it": "Italian"
|
43 |
+
}
|
44 |
+
|
45 |
# --- Fallback dictionary for common phrases ---
|
46 |
FALLBACK_PHRASES = {
|
47 |
"hello": "مرحبا",
|
|
|
57 |
# --- Translation Function ---
|
58 |
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
|
59 |
"""
|
60 |
+
Translate text using Hugging Face Inference API with prompt engineering.
|
61 |
"""
|
62 |
+
if not text.strip():
|
63 |
+
return ""
|
64 |
+
|
65 |
print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
# For very short text, check our dictionary first
|
68 |
+
if len(text.strip()) < 20 and text.lower().strip() in FALLBACK_PHRASES:
|
69 |
return FALLBACK_PHRASES[text.lower().strip()]
|
70 |
|
71 |
+
# Get full language name if available
|
72 |
+
source_lang_name = LANGUAGE_MAP.get(source_lang, source_lang)
|
73 |
+
|
74 |
+
# Construct our prompt with instructions for eloquent Arabic translation
|
75 |
+
prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
|
76 |
+
Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
|
77 |
+
Adapt any cultural references or idioms appropriately rather than translating literally.
|
78 |
+
Ensure the translation reads naturally to a native Arabic speaker.
|
79 |
+
|
80 |
+
Text to translate:
|
81 |
+
{text}"""
|
82 |
+
|
83 |
+
# Try multiple models in order of preference
|
84 |
+
models_to_try = [
|
85 |
+
"Helsinki-NLP/opus-mt-en-ar", # specialized English-Arabic translator
|
86 |
+
"facebook/nllb-200-distilled-600M", # multilingual model
|
87 |
+
"t5-base", # general-purpose model that can follow instructions
|
88 |
+
"google/mt5-small" # small multilingual model
|
89 |
+
]
|
90 |
+
|
91 |
+
for model in models_to_try:
|
92 |
try:
|
93 |
+
print(f"Attempting translation using Hugging Face model: {model}")
|
94 |
|
95 |
+
# Update API URL for current model
|
96 |
+
api_url = f"https://api-inference.huggingface.co/models/{model}"
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
+
# Prepare request payload based on model type
|
99 |
+
if "opus-mt" in model:
|
100 |
+
# Helsinki NMT models use direct input
|
101 |
+
payload = {"inputs": text}
|
102 |
+
elif "nllb" in model:
|
103 |
+
# NLLB models need language tags
|
104 |
+
src_lang_code = source_lang if source_lang != "auto" else "eng_Latn"
|
105 |
+
payload = {
|
106 |
+
"inputs": text,
|
107 |
+
"parameters": {
|
108 |
+
"source_lang": src_lang_code,
|
109 |
+
"target_lang": "arb_Arab"
|
110 |
+
}
|
111 |
+
}
|
112 |
+
else:
|
113 |
+
# T5 and other instruction-following models use our prompt
|
114 |
+
payload = {"inputs": prompt}
|
115 |
|
116 |
# Make the API call
|
117 |
+
response = requests.post(api_url, headers=HF_HEADERS, json=payload, timeout=30)
|
118 |
|
119 |
+
# Handle different response formats based on model
|
120 |
if response.status_code == 200:
|
121 |
result = response.json()
|
|
|
122 |
|
123 |
+
# Extract translated text based on response structure
|
124 |
+
translated_text = None
|
125 |
+
if isinstance(result, list) and len(result) > 0:
|
126 |
+
if isinstance(result[0], dict) and "generated_text" in result[0]:
|
127 |
+
translated_text = result[0]["generated_text"]
|
128 |
+
elif isinstance(result[0], dict) and "translation_text" in result[0]:
|
129 |
+
translated_text = result[0]["translation_text"]
|
130 |
+
else:
|
131 |
+
translated_text = str(result[0])
|
132 |
+
elif isinstance(result, dict) and "generated_text" in result:
|
133 |
+
translated_text = result["generated_text"]
|
134 |
|
135 |
+
if translated_text:
|
136 |
+
print(f"Translation successful using {model}")
|
137 |
+
# Apply post-processing
|
138 |
+
return culturally_adapt_arabic(translated_text)
|
139 |
else:
|
140 |
+
print(f"Unexpected response format: {response.text}")
|
141 |
+
continue # Try next model
|
142 |
else:
|
143 |
+
print(f"API error: {response.status_code}, {response.text}")
|
144 |
+
continue # Try next model
|
145 |
|
146 |
except Exception as e:
|
147 |
+
print(f"Error with model {model}: {e}")
|
148 |
+
continue # Try next model
|
149 |
|
150 |
+
# If all models failed, try LibreTranslate as a backup
|
151 |
+
try:
|
152 |
+
print("Attempting LibreTranslate API as backup")
|
153 |
+
libre_api = "https://translate.terraprint.co/translate"
|
154 |
+
payload = {
|
155 |
+
"q": text,
|
156 |
+
"source": source_lang if source_lang != "auto" else "auto",
|
157 |
+
"target": target_lang,
|
158 |
+
"format": "text"
|
159 |
+
}
|
160 |
+
|
161 |
+
response = requests.post(libre_api, json=payload, timeout=10)
|
162 |
+
if response.status_code == 200:
|
163 |
+
result = response.json()
|
164 |
+
translated_text = result.get("translatedText")
|
165 |
+
if translated_text:
|
166 |
+
return culturally_adapt_arabic(translated_text)
|
167 |
+
except Exception as e:
|
168 |
+
print(f"LibreTranslate backup failed: {e}")
|
169 |
+
|
170 |
+
# All translation attempts failed, use fallback
|
171 |
+
fallback_text = FALLBACK_PHRASES.get(text.lower().strip()) if len(text.strip()) < 20 else None
|
172 |
|
173 |
if fallback_text:
|
174 |
return fallback_text
|
|
|
183 |
|
184 |
# --- Helper Functions ---
|
185 |
def _decode_text_bytes(content: bytes) -> str:
    """Decode the raw bytes of a plain-text file into a string.

    Encodings are tried from most to least specific. 'latin-1' maps every
    byte value and therefore never fails, so it must come last; placing it
    earlier would make every later candidate unreachable.
    """
    candidates = ["utf-8-sig", "cp1252"]  # utf-8-sig also strips a UTF-8 BOM
    # UTF-16 is only attempted when a byte-order mark is present: without one,
    # almost any even-length byte string "decodes" as UTF-16 into garbage.
    if content.startswith((b"\xff\xfe", b"\xfe\xff")):
        candidates.insert(0, "utf-16")

    for encoding in candidates:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            continue
    return content.decode("latin-1")


async def extract_text_from_file(file: UploadFile) -> str:
    """Extract text from an uploaded .txt, .docx or .pdf file, entirely in memory.

    Args:
        file: The uploaded file. Its extension (case-insensitive) selects the
            parser; a missing filename is treated as having no extension.

    Returns:
        The extracted text. DOCX paragraphs and PDF pages are joined with
        newlines.

    Raises:
        HTTPException: 400 for an unsupported extension, 501 when the optional
            parser library (python-docx / PyMuPDF) is not installed, and 500
            for any other failure while reading or parsing the upload.
    """
    # The filename may be absent on an upload; fall back to "" so the
    # extension check yields a clean 400 instead of a TypeError.
    filename = file.filename or ""
    file_extension = os.path.splitext(filename)[1].lower()
    extracted_text = ""

    try:
        content = await file.read()  # Read file content into memory

        if file_extension == '.txt':
            extracted_text = _decode_text_bytes(content)

        elif file_extension == '.docx':
            # Only the import is guarded, so an ImportError raised while
            # parsing is not misreported as "library not installed".
            try:
                import docx
            except ImportError:
                raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library, which is not installed.")
            from io import BytesIO

            doc = docx.Document(BytesIO(content))
            extracted_text = '\n'.join(para.text for para in doc.paragraphs)

        elif file_extension == '.pdf':
            try:
                import fitz  # PyMuPDF
            except ImportError:
                raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library, which is not installed.")

            doc = fitz.open(stream=content, filetype="pdf")
            try:
                extracted_text = "\n".join(page.get_text() for page in doc)
            finally:
                # Release the document even when a page fails to parse.
                doc.close()

        else:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        print(f"Extracted text length: {len(extracted_text)}")
        return extracted_text

    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except Exception as e:
        print(f"Error processing file {file.filename}: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
|
247 |
# --- API Endpoints ---
|
248 |
@app.get("/", response_class=HTMLResponse)
|
249 |
async def read_root(request: Request):
|
250 |
"""Serves the main HTML page."""
|
|
|
251 |
if not os.path.exists(TEMPLATE_DIR):
|
252 |
raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
|
253 |
if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
|
|
|
257 |
@app.post("/translate/text")
|
258 |
async def translate_text_endpoint(
|
259 |
text: str = Form(...),
|
260 |
+
source_lang: str = Form(...),
|
261 |
+
target_lang: str = Form("ar")
|
262 |
):
|
263 |
"""Translates direct text input."""
|
264 |
if not text:
|
265 |
raise HTTPException(status_code=400, detail="No text provided for translation.")
|
266 |
+
|
|
|
|
|
|
|
|
|
267 |
if target_lang != "ar":
|
268 |
raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
|
269 |
|
270 |
try:
|
271 |
+
translated_text = translate_text_internal(text, source_lang, target_lang)
|
272 |
+
return JSONResponse(content={"translated_text": translated_text, "source_lang": source_lang})
|
|
|
|
|
|
|
|
|
|
|
273 |
except HTTPException as http_exc:
|
|
|
274 |
raise http_exc
|
275 |
except Exception as e:
|
276 |
print(f"Unexpected error in /translate/text: {e}")
|
|
|
279 |
@app.post("/translate/document")
async def translate_document_endpoint(
    file: UploadFile = File(...),
    source_lang: str = Form(...),
    target_lang: str = Form("ar")
):
    """Translates text extracted from an uploaded document without saving to disk.

    Args:
        file: The uploaded document; text extraction is delegated to
            extract_text_from_file.
        source_lang: Language code of the document, as supplied by the caller.
        target_lang: Target language code; only "ar" is accepted.

    Returns:
        JSONResponse with "original_filename", "detected_source_lang" and
        "translated_text". "detected_source_lang" simply echoes the supplied
        source_lang; no language detection is performed here.

    Raises:
        HTTPException: 400 when target_lang is not "ar" or the document yields
            no text, 500 on unexpected failures; HTTP errors raised by the
            helpers are passed through unchanged.
    """
    if target_lang != "ar":
        raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")

    try:
        # Extract text directly from the uploaded file
        extracted_text = await extract_text_from_file(file)

        # Whitespace-only content counts as "no text": the translator returns
        # "" for blank input, which would otherwise surface as an empty
        # translation that looks like a success.
        if not extracted_text or not extracted_text.strip():
            raise HTTPException(status_code=400, detail="Could not extract any text from the document.")

        # Translate the extracted text
        translated_text = translate_text_internal(extracted_text, source_lang, target_lang)

        return JSONResponse(content={
            "original_filename": file.filename,
            "detected_source_lang": source_lang,
            "translated_text": translated_text
        })

    except HTTPException:
        raise
    except Exception as e:
        # Log like the /translate/text endpoint so the failure can be diagnosed.
        print(f"Unexpected error in /translate/document: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}") from e
|
309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
# --- Run the server (for local development) ---
|
311 |
if __name__ == "__main__":
|
312 |
import uvicorn
|
|
|
|
|
313 |
print(f"Template Directory: {TEMPLATE_DIR}")
|
314 |
print(f"Static Directory: {STATIC_DIR}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|
backend/requirements.txt
CHANGED
@@ -2,7 +2,5 @@ fastapi
|
|
2 |
uvicorn
|
3 |
python-docx
|
4 |
PyMuPDF
|
5 |
-
|
6 |
-
sentencepiece
|
7 |
python-multipart
|
8 |
-
requests # Added for LibreTranslate API fallback
|
|
|
2 |
uvicorn
|
3 |
python-docx
|
4 |
PyMuPDF
|
5 |
+
requests
|
|
|
6 |
python-multipart
|
|