amine_dubs committed on
Commit
91a3ee9
·
1 Parent(s): 5b77ddb

Fix translation output and file upload permissions

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -0
  2. backend/main.py +108 -80
Dockerfile CHANGED
@@ -30,6 +30,9 @@ COPY static/ /app/static
30
  # Create the necessary directories within the container that the app expects
31
  RUN mkdir -p /app/templates /app/static /app/uploads
32
 
 
 
 
33
  # Make port 8000 available
34
  EXPOSE 8000
35
 
 
30
  # Create the necessary directories within the container that the app expects
31
  RUN mkdir -p /app/templates /app/static /app/uploads
32
 
33
+ # Grant write permissions to the uploads directory
34
+ RUN chmod -R 777 /app/uploads
35
+
36
  # Make port 8000 available
37
  EXPOSE 8000
38
 
backend/main.py CHANGED
@@ -2,12 +2,13 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
2
  from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
- import os
6
  from typing import List, Optional
7
  import shutil
 
 
8
 
9
  # Placeholder for translation logic
10
- # from transformers import pipeline # Uncomment when implementing translation
11
 
12
  # --- Configuration ---
13
  # Determine the base directory of the main.py script
@@ -16,7 +17,7 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
  # Adjust paths to go one level up from backend to find templates/static
17
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
18
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
19
- UPLOAD_DIR = os.path.join(os.path.dirname(BASE_DIR), "uploads") # Place uploads outside backend
20
 
21
  app = FastAPI()
22
 
@@ -31,93 +32,120 @@ templates = Jinja2Templates(directory=TEMPLATE_DIR)
31
  # --- Placeholder for Model Loading ---
32
  # Initialize the translation pipeline (load the model)
33
  # Consider loading the model on startup to avoid delays during requests
34
- # translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar") # Example model
35
 
36
  # --- Helper Functions ---
37
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
38
  """Internal function to handle text translation using the loaded model."""
39
- # Refined Prompt based on user request
40
- prompt = f"""Translate the following text from {source_lang} to Arabic (Modern Standard Arabic - Fusha) precisely. Do not provide a literal translation; focus on conveying the meaning accurately while respecting Arabic eloquence (balagha) by rephrasing if necessary:
 
41
 
42
- {text}"""
 
 
43
 
44
  # --- Actual Translation Logic (using Hugging Face pipeline) ---
45
- # This part needs to be implemented based on the chosen model's API
46
- # Example using a generic pipeline (replace with actual model call):
47
- # try:
48
- # # Note: Standard pipelines might not directly support complex prompts like this.
49
- # # You might need custom model loading and generation logic.
50
- # # result = translator(prompt, src_lang=source_lang, tgt_lang=target_lang) # Adjust based on model
51
- # # translated_text = result[0]['translation_text']
52
- # # --- Placeholder ---
53
- # print(f"Simulating translation for prompt: {prompt}") # Log the prompt being used
54
- # translated_text = f"Translated: {text} (from {source_lang} to {target_lang})" # Replace with actual translation
55
- # return translated_text
56
- # except Exception as e:
57
- # print(f"Error during translation: {e}")
58
- # raise HTTPException(status_code=500, detail=f"Translation failed: {e}")
59
- # --- End Placeholder ---
60
-
61
- # --- Simplified Placeholder ---
62
- print(f"Using Prompt: {prompt}")
63
- # Simulate translation for now
64
- return f"[Simulated Translation of '{text}' from {source_lang} to MSA Arabic, focusing on meaning and eloquence]"
65
- # --- End Simplified Placeholder ---
66
-
67
-
68
- def extract_text_from_file(file_path: str, file_type: str) -> str:
69
- """Extracts text from various document types."""
70
- text = ""
71
  try:
72
- if file_type == "application/pdf":
73
- import fitz # PyMuPDF
74
- with fitz.open(file_path) as doc:
75
- for page in doc:
76
- text += page.get_text()
77
- elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
78
- from docx import Document
79
- doc = Document(file_path)
80
- for para in doc.paragraphs:
81
- text += para.text + "\n"
82
- elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
83
- import openpyxl
84
- workbook = openpyxl.load_workbook(file_path)
85
- for sheet_name in workbook.sheetnames:
86
- sheet = workbook[sheet_name]
87
- for row in sheet.iter_rows():
88
- for cell in row:
89
- if cell.value:
90
- text += str(cell.value) + " "
91
- text += "\n" # Newline after each row
92
- elif file_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
93
- from pptx import Presentation
94
- prs = Presentation(file_path)
95
- for slide in prs.slides:
96
- for shape in slide.shapes:
97
- if hasattr(shape, "text"):
98
- text += shape.text + "\n"
99
- # Add handling for plain text files
100
- elif file_type.startswith("text/"):
101
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
102
- text = f.read()
103
  else:
104
- raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_type}. Cannot extract text.")
105
-
106
- except ImportError as ie:
107
- print(f"Import error for {file_type}: {ie}. Make sure the required library is installed.")
108
- # Ensure temp file is cleaned up even if import fails
109
- if os.path.exists(file_path):
110
- os.remove(file_path)
111
- raise HTTPException(status_code=501, detail=f"Text extraction for {file_type} requires an additional library: {ie.name}. Please install it (check requirements.txt). The file was not processed.")
112
  except Exception as e:
113
- print(f"Error extracting text from {file_path} ({file_type}): {e}")
114
- # Ensure temp file is cleaned up on extraction error
115
- if os.path.exists(file_path):
116
- os.remove(file_path)
117
- raise HTTPException(status_code=500, detail=f"Failed to extract text from file: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- # Do not remove the file here; let the calling function handle cleanup after translation
120
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  # --- API Endpoints ---
123
  @app.get("/", response_class=HTMLResponse)
@@ -194,7 +222,7 @@ async def translate_document_endpoint(
194
  shutil.copyfileobj(file.file, buffer)
195
 
196
  # Extract text based on content type
197
- extracted_text = extract_text_from_file(temp_file_path, file.content_type)
198
  # Note: extract_text_from_file now raises HTTPException on errors or unsupported types
199
 
200
  if not extracted_text:
 
2
  from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
 
5
  from typing import List, Optional
6
  import shutil
7
+ import os
8
+ import traceback
9
 
10
  # Placeholder for translation logic
11
+ from transformers import pipeline # Uncomment when implementing translation
12
 
13
  # --- Configuration ---
14
  # Determine the base directory of the main.py script
 
17
  # Adjust paths to go one level up from backend to find templates/static
18
  TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
19
  STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
20
+ UPLOAD_DIR = "/app/uploads" # Ensure this matches Dockerfile WORKDIR + uploads
21
 
22
  app = FastAPI()
23
 
 
32
  # --- Placeholder for Model Loading ---
33
  # Initialize the translation pipeline (load the model)
34
  # Consider loading the model on startup to avoid delays during requests
35
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar") # Example model
36
 
37
  # --- Helper Functions ---
38
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
39
  """Internal function to handle text translation using the loaded model."""
40
+ if translator is None:
41
+ # If the model failed to load, raise an error instead of returning a placeholder
42
+ raise HTTPException(status_code=503, detail="Translation service is unavailable (model not loaded).")
43
 
44
+ # Log the request details
45
+ print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
46
+ print(f"Input Text: {text}")
47
 
48
  # --- Actual Translation Logic (using Hugging Face pipeline) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  try:
50
+ # The Helsinki model expects the text directly
51
+ result = translator(text)
52
+
53
+ if result and isinstance(result, list) and 'translation_text' in result[0]:
54
+ translated_text = result[0]['translation_text']
55
+ print(f"Raw Translation Output: {translated_text}")
56
+ # Return the actual translated text
57
+ return translated_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  else:
59
+ print(f"Unexpected translation result format: {result}")
60
+ raise HTTPException(status_code=500, detail="Translation failed: Unexpected model output format.")
61
+
 
 
 
 
 
62
  except Exception as e:
63
+ print(f"Error during translation pipeline: {e}")
64
+ traceback.print_exc()
65
+ raise HTTPException(status_code=500, detail=f"Translation failed: {e}")
66
+
67
+ # --- Function to extract text ---
68
async def extract_text_from_file(file: UploadFile) -> str:
    """Save an uploaded file to ``UPLOAD_DIR`` temporarily and return its text.

    Supported extensions are ``.txt`` (read as UTF-8), ``.docx`` (paragraph
    texts joined with newlines) and ``.pdf`` (page texts concatenated). The
    temporary copy is always removed before returning or raising.

    Args:
        file: The uploaded file. Its stream is rewound before reading, so it
            may already have been consumed by the caller.

    Returns:
        The extracted text (possibly an empty string).

    Raises:
        HTTPException: 400 for an unsupported or missing file extension;
            501 if the library needed for the file type is not installed;
            500 for I/O, permission, or parsing failures.
    """
    import errno
    import uuid

    # ``filename`` may be missing on malformed uploads; treat it as "no extension".
    safe_filename = os.path.basename(file.filename or "")
    file_extension = os.path.splitext(safe_filename)[1].lower()

    # Reject unsupported types before anything is written to disk.
    if file_extension not in ('.txt', '.docx', '.pdf'):
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

    # A random name avoids collisions between concurrent uploads that share a
    # filename, and keeps client-controlled text out of the path entirely.
    temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{uuid.uuid4().hex}{file_extension}")
    print(f"Attempting to save uploaded file to: {temp_file_path}")

    try:
        # Ensure upload directory exists (though Dockerfile should create it)
        os.makedirs(UPLOAD_DIR, exist_ok=True)

        # The caller may already have read the stream; without rewinding,
        # read() would return b"" and the saved file would be empty.
        await file.seek(0)
        content = await file.read()
        with open(temp_file_path, "wb") as buffer:
            buffer.write(content)
        print(f"File saved successfully to: {temp_file_path}")

        if file_extension == '.txt':
            with open(temp_file_path, 'r', encoding='utf-8') as f:
                extracted_text = f.read()
        elif file_extension == '.docx':
            try:
                import docx
                doc = docx.Document(temp_file_path)
                # Join with a real newline (was the two-character string '\\n').
                extracted_text = '\n'.join(para.text for para in doc.paragraphs)
            except ImportError as e:
                raise HTTPException(status_code=501, detail="DOCX processing requires 'python-docx' library, which is not installed.") from e
            except Exception as e:
                raise HTTPException(status_code=500, detail=f"Error reading DOCX file: {e}") from e
        else:  # '.pdf'
            try:
                import fitz  # PyMuPDF
                # Context manager closes the document even if get_text() raises.
                with fitz.open(temp_file_path) as doc:
                    extracted_text = "".join(page.get_text() for page in doc)
            except ImportError as e:
                raise HTTPException(status_code=501, detail="PDF processing requires 'PyMuPDF' library, which is not installed.") from e
            except Exception as e:
                raise HTTPException(status_code=500, detail=f"Error reading PDF file: {e}") from e

        print(f"Extracted text length: {len(extracted_text)}")
        return extracted_text

    except HTTPException:
        # Re-raise HTTPExceptions directly
        raise
    except OSError as e:
        print(f"IOError saving/reading file {temp_file_path}: {e}")
        # Check permissions specifically
        if e.errno == errno.EACCES:
            raise HTTPException(status_code=500, detail=f"Permission denied writing to {temp_file_path}. Check container permissions for {UPLOAD_DIR}.") from e
        raise HTTPException(status_code=500, detail=f"Error saving/accessing uploaded file: {e}") from e
    except Exception as e:
        print(f"Error processing file {file.filename}: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}") from e
    finally:
        # Clean up the temporary file; a failed cleanup is logged, not fatal.
        try:
            os.remove(temp_file_path)
            print(f"Temporary file removed: {temp_file_path}")
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Error removing temporary file {temp_file_path}: {e}")
149
 
150
  # --- API Endpoints ---
151
  @app.get("/", response_class=HTMLResponse)
 
222
  shutil.copyfileobj(file.file, buffer)
223
 
224
  # Extract text based on content type
225
+ extracted_text = await extract_text_from_file(file)
226
  # Note: extract_text_from_file now raises HTTPException on errors or unsupported types
227
 
228
  if not extracted_text: