amine_dubs committed on
Commit
d6d82c2
·
1 Parent(s): a5b30bf

Restore backend files and ensure Dockerfile is in root

Browse files
Files changed (3) hide show
  1. backend +0 -1
  2. backend/main.py +266 -0
  3. backend/requirements.txt +11 -0
backend DELETED
@@ -1 +0,0 @@
1
- Subproject commit da27106172ae5acc2deda738eee913963fdaac6f
 
 
backend/main.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
2
+ from fastapi.responses import HTMLResponse, JSONResponse
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.templating import Jinja2Templates
5
+ import os
6
+ from typing import List, Optional
7
+ import shutil
8
+
9
+ # Placeholder for translation logic
10
+ # from transformers import pipeline # Uncomment when implementing translation
11
+
12
# --- Configuration ---
# Absolute directory that contains this script (the ``backend`` folder).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Templates, static assets and uploads live one level above ``backend``.
PROJECT_ROOT = os.path.dirname(BASE_DIR)
TEMPLATE_DIR = os.path.join(PROJECT_ROOT, "templates")
STATIC_DIR = os.path.join(PROJECT_ROOT, "static")
UPLOAD_DIR = os.path.join(PROJECT_ROOT, "uploads")  # Place uploads outside backend

app = FastAPI()

# --- Mount Static Files and Templates ---
# StaticFiles checks that its directory exists when it is constructed, and this
# mount runs at import time -- before the ``__main__`` guard at the bottom of
# the file gets a chance to create the folder. Create it here so importing the
# module on a fresh checkout does not fail.
os.makedirs(STATIC_DIR, exist_ok=True)
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

# The template directory is not created here; the "/" handler checks for it
# (and for index.html) on each request and reports a 500 if it is missing.
templates = Jinja2Templates(directory=TEMPLATE_DIR)
30
+
31
+ # --- Placeholder for Model Loading ---
32
+ # Initialize the translation pipeline (load the model)
33
+ # Consider loading the model on startup to avoid delays during requests
34
+ # translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar") # Example model
35
+
36
+ # --- Helper Functions ---
37
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
    """Return a simulated Arabic translation of ``text``.

    This is a placeholder for the real model call: it assembles the
    translation prompt, prints it, and returns a bracketed marker string
    instead of an actual translation.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text``; interpolated into the prompt
            and into the returned marker.
        target_lang: Accepted for interface compatibility. The placeholder
            does not read it -- both the prompt and the result always name
            Arabic.

    Returns:
        A string of the form ``[Simulated Translation of '<text>' from
        <source_lang> to MSA Arabic, ...]``.
    """
    # Instruction header of the prompt; the text to translate follows after a
    # blank line.
    instruction = (
        f"Translate the following text from {source_lang} to Arabic "
        "(Modern Standard Arabic - Fusha) precisely. Do not provide a literal "
        "translation; focus on conveying the meaning accurately while respecting "
        "Arabic eloquence (balagha) by rephrasing if necessary:"
    )
    prompt = "\n\n".join((instruction, text))

    # TODO: replace the simulation below with a real model invocation (e.g. a
    # Hugging Face pipeline). A stock translation pipeline may not accept a
    # free-form prompt like this one, so custom generation logic may be needed.
    print(f"Using Prompt: {prompt}")

    simulated = (
        f"[Simulated Translation of '{text}' from {source_lang} "
        "to MSA Arabic, focusing on meaning and eloquence]"
    )
    return simulated
66
+
67
+
68
def _remove_quietly(path: str) -> None:
    """Delete ``path`` on a best-effort basis; never raises."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass  # Already gone -- nothing to clean up.
    except OSError as exc:
        # A failed cleanup must not mask the error that triggered it.
        print(f"Could not remove temporary file {path}: {exc}")


def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract plain text from a document on disk, dispatching on MIME type.

    Supported types are PDF, DOCX, XLSX, PPTX and any ``text/*`` type. The
    parser for each binary format is imported lazily so a missing optional
    library only affects that format.

    Args:
        file_path: Path of the file to read.
        file_type: MIME content type reported for the file. A missing/empty
            value is treated as an unsupported type.

    Returns:
        The extracted text (possibly empty). On success the file is left in
        place; the caller is responsible for removing it.

    Raises:
        HTTPException: 400 if the type is unsupported, 501 if the parsing
            library for the type is not installed, 500 for any other
            extraction failure. In all three cases the file at ``file_path``
            is removed before raising.
    """
    # Guard against a missing content type so the ``startswith`` check below
    # cannot fail on ``None``.
    mime = file_type or ""
    chunks = []  # Collected pieces; joined once at the end.
    try:
        if mime == "application/pdf":
            import fitz  # PyMuPDF
            with fitz.open(file_path) as doc:
                chunks.extend(page.get_text() for page in doc)
        elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            from docx import Document
            chunks.extend(para.text + "\n" for para in Document(file_path).paragraphs)
        elif mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            import openpyxl
            workbook = openpyxl.load_workbook(file_path)
            for sheet_name in workbook.sheetnames:
                for row in workbook[sheet_name].iter_rows():
                    # ``is not None`` (rather than truthiness) keeps cells
                    # whose value is 0 or False.
                    chunks.extend(
                        str(cell.value) + " " for cell in row if cell.value is not None
                    )
                    chunks.append("\n")  # Newline after each row
        elif mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            from pptx import Presentation
            for slide in Presentation(file_path).slides:
                chunks.extend(
                    shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
                )
        elif mime.startswith("text/"):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                chunks.append(f.read())
        else:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_type}. Cannot extract text.")

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must reach the client
        # unchanged instead of being rewrapped as a 500 by the generic handler.
        _remove_quietly(file_path)
        raise
    except ImportError as ie:
        print(f"Import error for {file_type}: {ie}. Make sure the required library is installed.")
        _remove_quietly(file_path)
        raise HTTPException(status_code=501, detail=f"Text extraction for {file_type} requires an additional library: {ie.name}. Please install it (check requirements.txt). The file was not processed.") from ie
    except Exception as e:
        print(f"Error extracting text from {file_path} ({file_type}): {e}")
        _remove_quietly(file_path)
        raise HTTPException(status_code=500, detail=f"Failed to extract text from file: {e}") from e

    # Do not remove the file here; the caller cleans up after translation.
    return "".join(chunks)
121
+
122
+ # --- API Endpoints ---
123
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Render ``index.html`` as the application's landing page.

    Responds with a 500 error when either the template directory or the
    ``index.html`` file inside it is missing, so a misconfigured deployment
    produces an explicit message.
    """
    index_path = os.path.join(TEMPLATE_DIR, "index.html")

    # Check the directory first so the error names the most specific problem.
    if not os.path.exists(TEMPLATE_DIR):
        raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
    if not os.path.exists(index_path):
        raise HTTPException(status_code=500, detail=f"index.html not found in {TEMPLATE_DIR}")

    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
132
+
133
@app.post("/translate/text")
async def translate_text_endpoint(
    text: str = Form(...),
    source_lang: str = Form(...),  # e.g., 'en', 'fr', 'auto'
    target_lang: str = Form("ar"),  # Default to Arabic
):
    """Translate text submitted directly as form data.

    Args:
        text: The text to translate; must be non-empty.
        source_lang: Language code of ``text``.
        target_lang: Requested output language; only ``"ar"`` is accepted.

    Returns:
        A JSON response with ``translated_text`` and ``source_lang``.

    Raises:
        HTTPException: 400 for empty text or a non-Arabic target, 500 for any
            unexpected failure during translation.
    """
    if not text:
        raise HTTPException(status_code=400, detail="No text provided for translation.")

    # Only the "other language -> Arabic" direction is supported for now.
    if target_lang != "ar":
        raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")

    # Language detection for 'auto' is not implemented yet, so the submitted
    # code is passed through unchanged.
    resolved_source = source_lang

    try:
        payload = {
            "translated_text": translate_text_internal(text, resolved_source, target_lang),
            "source_lang": resolved_source,
        }
        return JSONResponse(content=payload)
    except HTTPException:
        # HTTP errors raised by the internal helper go to the client as-is.
        raise
    except Exception as e:
        print(f"Unexpected error in /translate/text: {e}")
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during text translation: {e}")
164
+
165
+
166
@app.post("/translate/document")
async def translate_document_endpoint(
    file: UploadFile = File(...),
    source_lang: str = Form(...),  # e.g., 'en', 'fr', 'auto'
    target_lang: str = Form("ar")  # Default to Arabic
):
    """Translate the text extracted from an uploaded document.

    The upload is spooled to a uniquely named temporary file inside
    ``UPLOAD_DIR``, its text is extracted according to the reported content
    type, and the text is translated. The temporary file is always removed
    before the handler returns, whether it succeeds or fails.

    Args:
        file: The uploaded document.
        source_lang: Language code of the document's text.
        target_lang: Requested output language; only ``"ar"`` is accepted.

    Returns:
        A JSON response with ``original_filename``, ``detected_source_lang``
        and ``translated_text``.

    Raises:
        HTTPException: 400 for a non-Arabic target or when no text could be
            extracted; 500 if the upload directory cannot be created or an
            unexpected error occurs; any HTTPException raised by
            ``extract_text_from_file`` is propagated unchanged.
    """
    import tempfile  # Local import: only this handler needs it.

    # Only the "other language -> Arabic" direction is supported for now.
    if target_lang != "ar":
        raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")

    # exist_ok avoids the check-then-create race between concurrent requests.
    try:
        os.makedirs(UPLOAD_DIR, exist_ok=True)
    except OSError as e:
        raise HTTPException(status_code=500, detail=f"Could not create upload directory: {e}")

    temp_file_path = None
    try:
        # The client-supplied filename is untrusted (it may contain path
        # separators, be missing, or clash with a concurrent upload), so it is
        # never used to build the on-disk path. mkstemp picks a unique name.
        fd, temp_file_path = tempfile.mkstemp(prefix="temp_", dir=UPLOAD_DIR)
        with os.fdopen(fd, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Raises HTTPException itself on unsupported types or parse errors.
        extracted_text = extract_text_from_file(temp_file_path, file.content_type)

        if not extracted_text:
            raise HTTPException(status_code=400, detail="Could not extract any text from the document.")

        # Language detection for 'auto' is not implemented yet, so the
        # submitted code is passed through unchanged.
        actual_source_lang = source_lang

        translated_text = translate_text_internal(extracted_text, actual_source_lang, target_lang)

        return JSONResponse(content={
            "original_filename": file.filename,
            "detected_source_lang": actual_source_lang,
            "translated_text": translated_text
        })

    except HTTPException:
        # Known errors go to the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}") from e
    finally:
        # Single cleanup point for every exit path.
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except FileNotFoundError:
                pass  # extract_text_from_file already removed it on failure.
            except OSError as cleanup_err:
                # Best-effort: a failed cleanup must not replace the response.
                print(f"Could not remove temporary file {temp_file_path}: {cleanup_err}")
241
+
242
+ # --- Optional: Add endpoint for reverse translation (Arabic to other) ---
243
+ # @app.post("/translate/reverse")
244
+ # async def translate_reverse_endpoint(text: str = Form(...), target_lang: str = Form(...)):
245
+ # # Implement logic similar to translate_text_endpoint but with source="ar"
246
+ # # You'll need a model capable of ar -> target_lang translation
247
+ # pass
248
+
249
# --- Run the server (for local development) ---
if __name__ == "__main__":
    import uvicorn

    # Parsing libraries (PyMuPDF, python-docx, ...) must be installed for a
    # local run: pip install -r requirements.txt (from the backend directory).
    local_dirs = (
        ("Template", TEMPLATE_DIR),
        ("Static", STATIC_DIR),
        ("Upload", UPLOAD_DIR),
    )
    for label, directory in local_dirs:
        print(f"{label} Directory: {directory}")

    # Create any directory that is missing so a fresh checkout can start.
    for _, directory in local_dirs:
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Write a stub landing page when none exists yet.
    index_path = os.path.join(TEMPLATE_DIR, "index.html")
    if not os.path.exists(index_path):
        with open(index_path, "w") as f:
            f.write("<html><body><h1>Placeholder Frontend</h1></body></html>")

    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
backend/requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ python-multipart
4
+ transformers[torch]
5
+ # Document parsing libraries imported by extract_text_from_file in main.py:
6
+ python-docx
7
+ openpyxl
8
+ python-pptx
9
+ PyMuPDF
10
+ sentencepiece # Often needed for tokenizers
11
+ sacremoses # Often needed for translation models