amine_dubs committed on
Commit
7b65e1e
·
1 Parent(s): a95a188

Switch to Hugging Face Inference API with in-memory file processing

Browse files
Files changed (2) hide show
  1. backend/main.py +161 -200
  2. backend/requirements.txt +1 -3
backend/main.py CHANGED
@@ -2,13 +2,12 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
2
  from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
- from typing import List, Optional
6
- import shutil
7
  import os
8
  import requests
9
  import json
10
  import traceback
11
- import time
12
 
13
  # --- Configuration ---
14
  # Determine the base directory of the main.py script
@@ -16,14 +15,10 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
  # Adjust paths to go one level up from backend to find templates/static
17
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
18
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
19
- UPLOAD_DIR = "/app/uploads" # Ensure this matches Dockerfile WORKDIR + uploads
20
 
21
- # LibreTranslate API URLs - trying multiple endpoints in case one is down
22
- TRANSLATION_APIS = [
23
- "https://translate.terraprint.co/translate", # Primary endpoint
24
- "https://libretranslate.de/translate", # Backup endpoint 1
25
- "https://translate.argosopentech.com/translate" # Backup endpoint 2
26
- ]
27
 
28
  app = FastAPI()
29
 
@@ -31,6 +26,22 @@ app = FastAPI()
31
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
32
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # --- Fallback dictionary for common phrases ---
35
  FALLBACK_PHRASES = {
36
  "hello": "مرحبا",
@@ -46,73 +57,118 @@ FALLBACK_PHRASES = {
46
  # --- Translation Function ---
47
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
48
  """
49
- Translate text using LibreTranslate API with fallbacks and cultural adaptation.
50
  """
 
 
 
51
  print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
52
 
53
- # Map source language codes to full language names
54
- language_map = {
55
- "en": "English",
56
- "fr": "French",
57
- "es": "Spanish",
58
- "de": "German",
59
- "zh": "Chinese",
60
- "ru": "Russian",
61
- "ja": "Japanese",
62
- "hi": "Hindi",
63
- "pt": "Portuguese",
64
- "tr": "Turkish",
65
- "ko": "Korean",
66
- "it": "Italian"
67
- }
68
-
69
  # For very short text, check our dictionary first
70
- if len(text.strip()) < 30 and text.lower().strip() in FALLBACK_PHRASES:
71
  return FALLBACK_PHRASES[text.lower().strip()]
72
 
73
- # Try each API endpoint until one works
74
- for api_url in TRANSLATION_APIS:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  try:
76
- print(f"Attempting translation using API: {api_url}")
77
 
78
- # Basic payload for standard translation
79
- payload = {
80
- "q": text,
81
- "source": source_lang if source_lang != "auto" else "auto",
82
- "target": target_lang,
83
- "format": "text"
84
- }
85
 
86
- headers = {"Content-Type": "application/json"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  # Make the API call
89
- response = requests.post(api_url, json=payload, headers=headers, timeout=10)
90
 
 
91
  if response.status_code == 200:
92
  result = response.json()
93
- translated_text = result.get("translatedText")
94
 
95
- if translated_text:
96
- print(f"Translation successful using {api_url}")
97
-
98
- # For Arabic translations, apply post-processing
99
- if target_lang == "ar":
100
- translated_text = culturally_adapt_arabic(translated_text)
 
 
 
 
 
101
 
102
- return translated_text
 
 
 
103
  else:
104
- print(f"Translation API returned empty result: {response.text}")
105
- continue # Try next API
106
  else:
107
- print(f"Translation API returned error: {response.status_code}")
108
- continue # Try next API
109
 
110
  except Exception as e:
111
- print(f"Error with translation API {api_url}: {e}")
112
- continue # Try next API
113
 
114
- # If all APIs failed, use a polite message
115
- fallback_text = FALLBACK_PHRASES.get(text.lower().strip()) if len(text.strip()) < 30 else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  if fallback_text:
118
  return fallback_text
@@ -127,92 +183,71 @@ def culturally_adapt_arabic(text: str) -> str:
127
 
128
  # --- Helper Functions ---
129
  async def extract_text_from_file(file: UploadFile) -> str:
130
- """Extracts text content from various file types."""
131
- # Ensure upload directory exists (though Dockerfile should create it)
132
- # Use os.makedirs for robustness
133
- os.makedirs(UPLOAD_DIR, exist_ok=True) # Ensure directory exists
134
-
135
- # Secure filename and define path
136
- # Use a temporary filename to avoid collisions and complex sanitization
137
- # Make sure the filename is safe for the filesystem
138
- safe_filename = os.path.basename(file.filename) # Basic safety
139
- temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{safe_filename}")
140
- print(f"Attempting to save uploaded file to: {temp_file_path}")
141
- extracted_text = "" # Initialize extracted_text
142
 
143
  try:
144
- # Save the uploaded file temporarily
145
- # Use async file writing if possible with a library like aiofiles,
146
- # but standard file I/O is often sufficient here.
147
- with open(temp_file_path, "wb") as buffer:
148
- content = await file.read() # Read content
149
- buffer.write(content) # Write to file
150
- print(f"File saved successfully to: {temp_file_path}")
151
-
152
- # Determine file type and extract text
153
- file_extension = os.path.splitext(safe_filename)[1].lower()
154
-
155
  if file_extension == '.txt':
156
- with open(temp_file_path, 'r', encoding='utf-8') as f:
157
- extracted_text = f.read()
 
 
 
 
 
 
 
 
 
 
158
  elif file_extension == '.docx':
159
  try:
160
  import docx
161
- doc = docx.Document(temp_file_path)
162
- extracted_text = '\\n'.join([para.text for para in doc.paragraphs])
 
 
 
 
163
  except ImportError:
164
  raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library, which is not installed.")
165
- except Exception as e:
166
- raise HTTPException(status_code=500, detail=f"Error reading DOCX file: {e}")
167
  elif file_extension == '.pdf':
168
  try:
169
- import fitz # PyMuPDF
170
- doc = fitz.open(temp_file_path)
171
- extracted_text = ""
 
 
 
 
 
172
  for page in doc:
173
- extracted_text += page.get_text()
 
174
  doc.close()
175
  except ImportError:
176
- raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library, which is not installed.")
177
- except Exception as e:
178
- raise HTTPException(status_code=500, detail=f"Error reading PDF file: {e}")
179
- # Add support for other types (pptx, xlsx) similarly if needed
180
- # elif file_extension == '.pptx': ...
181
- # elif file_extension == '.xlsx': ...
182
  else:
183
  raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
184
 
185
  print(f"Extracted text length: {len(extracted_text)}")
186
- return extracted_text # Return the extracted text
187
 
188
- except IOError as e:
189
- print(f"IOError saving/reading file {temp_file_path}: {e}")
190
- # Check permissions specifically
191
- if e.errno == 13: # Permission denied
192
- raise HTTPException(status_code=500, detail=f"Permission denied writing to {temp_file_path}. Check container permissions for {UPLOAD_DIR}.")
193
- raise HTTPException(status_code=500, detail=f"Error saving/accessing uploaded file: {e}")
194
  except HTTPException as e:
195
- # Re-raise HTTPExceptions directly
196
  raise e
197
  except Exception as e:
198
  print(f"Error processing file {file.filename}: {e}")
199
  traceback.print_exc()
200
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
201
- finally:
202
- # Clean up the temporary file
203
- if os.path.exists(temp_file_path):
204
- try:
205
- os.remove(temp_file_path)
206
- print(f"Temporary file removed: {temp_file_path}")
207
- except OSError as e:
208
- # Log error but don't crash the request if cleanup fails
209
- print(f"Error removing temporary file {temp_file_path}: {e}")
210
 
211
  # --- API Endpoints ---
212
  @app.get("/", response_class=HTMLResponse)
213
  async def read_root(request: Request):
214
  """Serves the main HTML page."""
215
- # Ensure templates directory exists before trying to render
216
  if not os.path.exists(TEMPLATE_DIR):
217
  raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
218
  if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
@@ -222,30 +257,20 @@ async def read_root(request: Request):
222
  @app.post("/translate/text")
223
  async def translate_text_endpoint(
224
  text: str = Form(...),
225
- source_lang: str = Form(...), # e.g., 'en', 'fr', 'auto'
226
- target_lang: str = Form("ar") # Default to Arabic
227
  ):
228
  """Translates direct text input."""
229
  if not text:
230
  raise HTTPException(status_code=400, detail="No text provided for translation.")
231
- # Allow translation to Arabic or from Arabic
232
- # if target_lang != "ar" and source_lang != "ar":
233
- # raise HTTPException(status_code=400, detail="Translation must involve Arabic (either as source or target). Specify 'ar' in source_lang or target_lang.")
234
-
235
- # Simplified: For now, stick to the primary goal: other -> Arabic
236
  if target_lang != "ar":
237
  raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
238
 
239
  try:
240
- # Determine actual source language if 'auto' is selected (requires model/library support)
241
- actual_source_lang = source_lang # Placeholder
242
- # if source_lang == 'auto':
243
- # actual_source_lang = detect_language(text) # Needs implementation
244
-
245
- translated_text = translate_text_internal(text, actual_source_lang, target_lang)
246
- return JSONResponse(content={"translated_text": translated_text, "source_lang": actual_source_lang})
247
  except HTTPException as http_exc:
248
- # Re-raise HTTP exceptions from internal functions
249
  raise http_exc
250
  except Exception as e:
251
  print(f"Unexpected error in /translate/text: {e}")
@@ -254,101 +279,37 @@ async def translate_text_endpoint(
254
  @app.post("/translate/document")
255
  async def translate_document_endpoint(
256
  file: UploadFile = File(...),
257
- source_lang: str = Form(...), # e.g., 'en', 'fr', 'auto'
258
- target_lang: str = Form("ar") # Default to Arabic
259
  ):
260
- """Translates text extracted from an uploaded document."""
261
- # Allow translation to Arabic or from Arabic
262
- # if target_lang != "ar" and source_lang != "ar":
263
- # raise HTTPException(status_code=400, detail="Document translation must involve Arabic (either as source or target). Specify 'ar' in source_lang or target_lang.")
264
-
265
- # Simplified: For now, stick to the primary goal: other -> Arabic
266
  if target_lang != "ar":
267
  raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")
268
 
269
- # Ensure upload directory exists
270
- if not os.path.exists(UPLOAD_DIR):
271
- try:
272
- os.makedirs(UPLOAD_DIR)
273
- except OSError as e:
274
- raise HTTPException(status_code=500, detail=f"Could not create upload directory: {e}")
275
-
276
- # Create a safe temporary file path
277
- temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{file.filename}")
278
-
279
  try:
280
- # Save the uploaded file temporarily
281
- with open(temp_file_path, "wb") as buffer:
282
- shutil.copyfileobj(file.file, buffer)
283
-
284
- # Extract text based on content type
285
  extracted_text = await extract_text_from_file(file)
286
- # Note: extract_text_from_file now raises HTTPException on errors or unsupported types
287
-
288
  if not extracted_text:
289
- # This case might be less likely if extract_text_from_file handles errors robustly
290
- # but keep it as a safeguard.
291
- if os.path.exists(temp_file_path):
292
- os.remove(temp_file_path)
293
  raise HTTPException(status_code=400, detail="Could not extract any text from the document.")
294
 
295
- # Determine actual source language if 'auto' (requires model/library support)
296
- actual_source_lang = source_lang # Placeholder
297
- # if source_lang == 'auto':
298
- # actual_source_lang = detect_language(extracted_text) # Needs implementation
299
-
300
  # Translate the extracted text
301
- translated_text = translate_text_internal(extracted_text, actual_source_lang, target_lang)
302
-
303
- # Clean up the temporary file *after* successful processing
304
- if os.path.exists(temp_file_path):
305
- os.remove(temp_file_path)
306
 
307
  return JSONResponse(content={
308
  "original_filename": file.filename,
309
- "detected_source_lang": actual_source_lang,
310
  "translated_text": translated_text
311
  })
312
 
313
  except HTTPException as http_exc:
314
- # Clean up temp file if it exists on known errors
315
- if os.path.exists(temp_file_path):
316
- try:
317
- os.remove(temp_file_path)
318
- except:
319
- pass
320
- raise http_exc # Re-raise the exception
321
  except Exception as e:
322
- # Clean up temp file on unexpected errors
323
- if os.path.exists(temp_file_path):
324
- try:
325
- os.remove(temp_file_path)
326
- except:
327
- pass
328
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
329
 
330
- # --- Optional: Add endpoint for reverse translation (Arabic to other) ---
331
- # @app.post("/translate/reverse")
332
- # async def translate_reverse_endpoint(text: str = Form(...), target_lang: str = Form(...)):
333
- # # Implement logic similar to translate_text_endpoint but with source="ar"
334
- # # You'll need a model capable of ar -> target_lang translation
335
- # pass
336
-
337
  # --- Run the server (for local development) ---
338
  if __name__ == "__main__":
339
  import uvicorn
340
- # Make sure to install PyMuPDF, python-docx etc. if testing locally:
341
- # pip install -r requirements.txt (from backend directory)
342
  print(f"Template Directory: {TEMPLATE_DIR}")
343
  print(f"Static Directory: {STATIC_DIR}")
344
- print(f"Upload Directory: {UPLOAD_DIR}")
345
- # Ensure necessary directories exist for local run
346
- if not os.path.exists(TEMPLATE_DIR): os.makedirs(TEMPLATE_DIR)
347
- if not os.path.exists(STATIC_DIR): os.makedirs(STATIC_DIR)
348
- if not os.path.exists(UPLOAD_DIR): os.makedirs(UPLOAD_DIR)
349
- # Create dummy index.html if it doesn't exist for local run
350
- if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
351
- with open(os.path.join(TEMPLATE_DIR, "index.html"), "w") as f:
352
- f.write("<html><body><h1>Placeholder Frontend</h1></body></html>")
353
-
354
  uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
 
2
  from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
+ from typing import List, Optional, Dict, Any
 
6
  import os
7
  import requests
8
  import json
9
  import traceback
10
+ import io
11
 
12
  # --- Configuration ---
13
  # Determine the base directory of the main.py script
 
15
  # Adjust paths to go one level up from backend to find templates/static
16
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
17
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
 
18
 
19
# Hugging Face API configurations
HF_API_URL = "https://api-inference.huggingface.co/models/t5-base"
# Read the access token from the environment instead of hard-coding a
# placeholder secret in source control. A bogus "Bearer hf_api_key_placeholder"
# token makes every Inference API call fail with 401; sending no Authorization
# header at all allows anonymous access to public models.
_HF_TOKEN = os.environ.get("HF_API_KEY")
HF_HEADERS = {"Authorization": f"Bearer {_HF_TOKEN}"} if _HF_TOKEN else {}
 
 
 
22
 
23
  app = FastAPI()
24
 
 
26
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
27
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
28
 
29
# --- Language mapping ---
# ISO 639-1 code -> human-readable name, used when building translation prompts.
LANGUAGE_MAP = dict(
    en="English",
    fr="French",
    es="Spanish",
    de="German",
    zh="Chinese",
    ru="Russian",
    ja="Japanese",
    hi="Hindi",
    pt="Portuguese",
    tr="Turkish",
    ko="Korean",
    it="Italian",
)
44
+
45
  # --- Fallback dictionary for common phrases ---
46
  FALLBACK_PHRASES = {
47
  "hello": "مرحبا",
 
57
  # --- Translation Function ---
58
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
59
  """
60
+ Translate text using Hugging Face Inference API with prompt engineering.
61
  """
62
+ if not text.strip():
63
+ return ""
64
+
65
  print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # For very short text, check our dictionary first
68
+ if len(text.strip()) < 20 and text.lower().strip() in FALLBACK_PHRASES:
69
  return FALLBACK_PHRASES[text.lower().strip()]
70
 
71
+ # Get full language name if available
72
+ source_lang_name = LANGUAGE_MAP.get(source_lang, source_lang)
73
+
74
+ # Construct our prompt with instructions for eloquent Arabic translation
75
+ prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
76
+ Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
77
+ Adapt any cultural references or idioms appropriately rather than translating literally.
78
+ Ensure the translation reads naturally to a native Arabic speaker.
79
+
80
+ Text to translate:
81
+ {text}"""
82
+
83
+ # Try multiple models in order of preference
84
+ models_to_try = [
85
+ "Helsinki-NLP/opus-mt-en-ar", # specialized English-Arabic translator
86
+ "facebook/nllb-200-distilled-600M", # multilingual model
87
+ "t5-base", # general-purpose model that can follow instructions
88
+ "google/mt5-small" # small multilingual model
89
+ ]
90
+
91
+ for model in models_to_try:
92
  try:
93
+ print(f"Attempting translation using Hugging Face model: {model}")
94
 
95
+ # Update API URL for current model
96
+ api_url = f"https://api-inference.huggingface.co/models/{model}"
 
 
 
 
 
97
 
98
+ # Prepare request payload based on model type
99
+ if "opus-mt" in model:
100
+ # Helsinki NMT models use direct input
101
+ payload = {"inputs": text}
102
+ elif "nllb" in model:
103
+ # NLLB models need language tags
104
+ src_lang_code = source_lang if source_lang != "auto" else "eng_Latn"
105
+ payload = {
106
+ "inputs": text,
107
+ "parameters": {
108
+ "source_lang": src_lang_code,
109
+ "target_lang": "arb_Arab"
110
+ }
111
+ }
112
+ else:
113
+ # T5 and other instruction-following models use our prompt
114
+ payload = {"inputs": prompt}
115
 
116
  # Make the API call
117
+ response = requests.post(api_url, headers=HF_HEADERS, json=payload, timeout=30)
118
 
119
+ # Handle different response formats based on model
120
  if response.status_code == 200:
121
  result = response.json()
 
122
 
123
+ # Extract translated text based on response structure
124
+ translated_text = None
125
+ if isinstance(result, list) and len(result) > 0:
126
+ if isinstance(result[0], dict) and "generated_text" in result[0]:
127
+ translated_text = result[0]["generated_text"]
128
+ elif isinstance(result[0], dict) and "translation_text" in result[0]:
129
+ translated_text = result[0]["translation_text"]
130
+ else:
131
+ translated_text = str(result[0])
132
+ elif isinstance(result, dict) and "generated_text" in result:
133
+ translated_text = result["generated_text"]
134
 
135
+ if translated_text:
136
+ print(f"Translation successful using {model}")
137
+ # Apply post-processing
138
+ return culturally_adapt_arabic(translated_text)
139
  else:
140
+ print(f"Unexpected response format: {response.text}")
141
+ continue # Try next model
142
  else:
143
+ print(f"API error: {response.status_code}, {response.text}")
144
+ continue # Try next model
145
 
146
  except Exception as e:
147
+ print(f"Error with model {model}: {e}")
148
+ continue # Try next model
149
 
150
+ # If all models failed, try LibreTranslate as a backup
151
+ try:
152
+ print("Attempting LibreTranslate API as backup")
153
+ libre_api = "https://translate.terraprint.co/translate"
154
+ payload = {
155
+ "q": text,
156
+ "source": source_lang if source_lang != "auto" else "auto",
157
+ "target": target_lang,
158
+ "format": "text"
159
+ }
160
+
161
+ response = requests.post(libre_api, json=payload, timeout=10)
162
+ if response.status_code == 200:
163
+ result = response.json()
164
+ translated_text = result.get("translatedText")
165
+ if translated_text:
166
+ return culturally_adapt_arabic(translated_text)
167
+ except Exception as e:
168
+ print(f"LibreTranslate backup failed: {e}")
169
+
170
+ # All translation attempts failed, use fallback
171
+ fallback_text = FALLBACK_PHRASES.get(text.lower().strip()) if len(text.strip()) < 20 else None
172
 
173
  if fallback_text:
174
  return fallback_text
 
183
 
184
  # --- Helper Functions ---
185
async def extract_text_from_file(file: UploadFile) -> str:
    """Extract text content from an uploaded file entirely in memory.

    Supports .txt (UTF-8 with common-encoding fallbacks), .docx (python-docx)
    and .pdf (PyMuPDF); no temporary files are written to disk.

    Raises:
        HTTPException: 400 for unsupported file types, 501 when an optional
            parsing library is missing, 500 for read/parse failures.
    """
    content = await file.read()  # whole file into memory; nothing hits the disk
    # file.filename can be None for some clients — guard before splitext.
    file_extension = os.path.splitext(file.filename or "")[1].lower()
    extracted_text = ""

    try:
        if file_extension == '.txt':
            # Decode bytes directly; try common fallbacks if UTF-8 fails.
            try:
                extracted_text = content.decode('utf-8')
            except UnicodeDecodeError:
                for encoding in ['latin-1', 'cp1252', 'utf-16']:
                    try:
                        extracted_text = content.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue

        elif file_extension == '.docx':
            try:
                import docx
                from io import BytesIO

                # Load the DOCX from an in-memory stream.
                doc = docx.Document(BytesIO(content))
                extracted_text = '\n'.join(para.text for para in doc.paragraphs)
            except ImportError:
                raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library, which is not installed.")
            except Exception as e:
                # Corrupt/unreadable DOCX gets a specific message rather than
                # falling through to the generic handler below.
                raise HTTPException(status_code=500, detail=f"Error reading DOCX file: {e}")

        elif file_extension == '.pdf':
            try:
                import fitz  # PyMuPDF
                from io import BytesIO

                # Open the PDF from an in-memory stream.
                doc = fitz.open(stream=BytesIO(content), filetype="pdf")
                try:
                    extracted_text = "\n".join(page.get_text() for page in doc)
                finally:
                    doc.close()  # always release PyMuPDF resources, even on error
            except ImportError:
                raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library, which is not installed.")
            except Exception as e:
                raise HTTPException(status_code=500, detail=f"Error reading PDF file: {e}")

        else:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        print(f"Extracted text length: {len(extracted_text)}")
        return extracted_text

    except HTTPException:
        raise
    except Exception as e:
        print(f"Error processing file {file.filename}: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
 
 
 
 
 
 
 
 
 
246
 
247
  # --- API Endpoints ---
248
  @app.get("/", response_class=HTMLResponse)
249
  async def read_root(request: Request):
250
  """Serves the main HTML page."""
 
251
  if not os.path.exists(TEMPLATE_DIR):
252
  raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
253
  if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
 
257
  @app.post("/translate/text")
258
  async def translate_text_endpoint(
259
  text: str = Form(...),
260
+ source_lang: str = Form(...),
261
+ target_lang: str = Form("ar")
262
  ):
263
  """Translates direct text input."""
264
  if not text:
265
  raise HTTPException(status_code=400, detail="No text provided for translation.")
266
+
 
 
 
 
267
  if target_lang != "ar":
268
  raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
269
 
270
  try:
271
+ translated_text = translate_text_internal(text, source_lang, target_lang)
272
+ return JSONResponse(content={"translated_text": translated_text, "source_lang": source_lang})
 
 
 
 
 
273
  except HTTPException as http_exc:
 
274
  raise http_exc
275
  except Exception as e:
276
  print(f"Unexpected error in /translate/text: {e}")
 
279
@app.post("/translate/document")
async def translate_document_endpoint(
    file: UploadFile = File(...),
    source_lang: str = Form(...),
    target_lang: str = Form("ar")
):
    """Translates text extracted from an uploaded document without saving to disk.

    The file is processed entirely in memory; only translation into Arabic
    ("ar") is currently supported.
    """
    if target_lang != "ar":
        raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")

    try:
        # Extract text directly from the uploaded file (no temp files)
        extracted_text = await extract_text_from_file(file)

        if not extracted_text:
            raise HTTPException(status_code=400, detail="Could not extract any text from the document.")

        # Translate the extracted text
        translated_text = translate_text_internal(extracted_text, source_lang, target_lang)

        return JSONResponse(content={
            "original_filename": file.filename,
            "detected_source_lang": source_lang,
            "translated_text": translated_text
        })

    except HTTPException as http_exc:
        raise http_exc
    except Exception as e:
        # Log the full traceback before converting to a generic 500, matching
        # the error handling used by the /translate/text endpoint.
        print(f"Unexpected error in /translate/document: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
309
 
 
 
 
 
 
 
 
310
  # --- Run the server (for local development) ---
311
  if __name__ == "__main__":
312
  import uvicorn
 
 
313
  print(f"Template Directory: {TEMPLATE_DIR}")
314
  print(f"Static Directory: {STATIC_DIR}")
 
 
 
 
 
 
 
 
 
 
315
  uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
backend/requirements.txt CHANGED
@@ -2,7 +2,5 @@ fastapi
2
  uvicorn
3
  python-docx
4
  PyMuPDF
5
- transformers[torch]
6
- sentencepiece
7
  python-multipart
8
- requests # Added for LibreTranslate API fallback
 
2
  uvicorn
3
  python-docx
4
  PyMuPDF
5
+ requests
 
6
  python-multipart