Spaces:

AmineDubs
/

Scripts_translation_to_arabic

Running

App Files Files Community

amine_dubs committed on Apr 27

Commit

d6d82c2

1 Parent(s): a5b30bf

Restore backend files and ensure Dockerfile is in root

Browse files

Files changed (3) hide show

backend +0 -1
backend/main.py +266 -0
backend/requirements.txt +11 -0

backend DELETED Viewed

	@@ -1 +0,0 @@
1	- Subproject commit da27106172ae5acc2deda738eee913963fdaac6f

backend/main.py ADDED Viewed

	@@ -0,0 +1,266 @@

+from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
+from fastapi.responses import HTMLResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+import os
+from typing import List, Optional
+import shutil
+# Placeholder for translation logic
+# from transformers import pipeline # Uncomment when implementing translation
+# --- Configuration ---
+# Determine the base directory of the main.py script
+# This helps in locating templates and static files correctly, especially in Docker
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Adjust paths to go one level up from backend to find templates/static
+TEMPLATE_DIR = os.path.join(os.path.dirname(BASE_DIR), "templates")
+STATIC_DIR = os.path.join(os.path.dirname(BASE_DIR), "static")
+UPLOAD_DIR = os.path.join(os.path.dirname(BASE_DIR), "uploads") # Place uploads outside backend
+app = FastAPI()
+# --- Mount Static Files and Templates ---
+# Ensure the static directory exists (FastAPI doesn't create it)
+# We'll create it manually or via Docker later
+app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+# Ensure the templates directory exists (FastAPI doesn't create it)
+templates = Jinja2Templates(directory=TEMPLATE_DIR)
+# --- Placeholder for Model Loading ---
+# Initialize the translation pipeline (load the model)
+# Consider loading the model on startup to avoid delays during requests
+# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar") # Example model
+# --- Helper Functions ---
+def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
+    """Internal function to handle text translation using the loaded model."""
+    # Refined Prompt based on user request
+    prompt = f"""Translate the following text from {source_lang} to Arabic (Modern Standard Arabic - Fusha) precisely. Do not provide a literal translation; focus on conveying the meaning accurately while respecting Arabic eloquence (balagha) by rephrasing if necessary:
+{text}"""
+    # --- Actual Translation Logic (using Hugging Face pipeline) ---
+    # This part needs to be implemented based on the chosen model's API
+    # Example using a generic pipeline (replace with actual model call):
+    # try:
+    #     # Note: Standard pipelines might not directly support complex prompts like this.
+    #     # You might need custom model loading and generation logic.
+    #     # result = translator(prompt, src_lang=source_lang, tgt_lang=target_lang) # Adjust based on model
+    #     # translated_text = result[0]['translation_text']
+    #     # --- Placeholder ---
+    #     print(f"Simulating translation for prompt: {prompt}") # Log the prompt being used
+    #     translated_text = f"Translated: {text} (from {source_lang} to {target_lang})" # Replace with actual translation
+    #     return translated_text
+    # except Exception as e:
+    #     print(f"Error during translation: {e}")
+    #     raise HTTPException(status_code=500, detail=f"Translation failed: {e}")
+    # --- End Placeholder ---
+    # --- Simplified Placeholder ---
+    print(f"Using Prompt: {prompt}")
+    # Simulate translation for now
+    return f"[Simulated Translation of '{text}' from {source_lang} to MSA Arabic, focusing on meaning and eloquence]"
+    # --- End Simplified Placeholder ---
+def extract_text_from_file(file_path: str, file_type: str) -> str:
+    """Extracts text from various document types."""
+    text = ""
+    try:
+        if file_type == "application/pdf":
+            import fitz  # PyMuPDF
+            with fitz.open(file_path) as doc:
+                for page in doc:
+                    text += page.get_text()
+        elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            from docx import Document
+            doc = Document(file_path)
+            for para in doc.paragraphs:
+                text += para.text + "\n"
+        elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+            import openpyxl
+            workbook = openpyxl.load_workbook(file_path)
+            for sheet_name in workbook.sheetnames:
+                sheet = workbook[sheet_name]
+                for row in sheet.iter_rows():
+                    for cell in row:
+                        if cell.value:
+                            text += str(cell.value) + " "
+                    text += "\n" # Newline after each row
+        elif file_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
+            from pptx import Presentation
+            prs = Presentation(file_path)
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    if hasattr(shape, "text"):
+                        text += shape.text + "\n"
+        # Add handling for plain text files
+        elif file_type.startswith("text/"):
+             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                 text = f.read()
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_type}. Cannot extract text.")
+    except ImportError as ie:
+         print(f"Import error for {file_type}: {ie}. Make sure the required library is installed.")
+         # Ensure temp file is cleaned up even if import fails
+         if os.path.exists(file_path):
+             os.remove(file_path)
+         raise HTTPException(status_code=501, detail=f"Text extraction for {file_type} requires an additional library: {ie.name}. Please install it (check requirements.txt). The file was not processed.")
+    except Exception as e:
+        print(f"Error extracting text from {file_path} ({file_type}): {e}")
+        # Ensure temp file is cleaned up on extraction error
+        if os.path.exists(file_path):
+            os.remove(file_path)
+        raise HTTPException(status_code=500, detail=f"Failed to extract text from file: {e}")
+    # Do not remove the file here; let the calling function handle cleanup after translation
+    return text
+# --- API Endpoints ---
+@app.get("/", response_class=HTMLResponse)
+async def read_root(request: Request):
+    """Serves the main HTML page."""
+    # Ensure templates directory exists before trying to render
+    if not os.path.exists(TEMPLATE_DIR):
+         raise HTTPException(status_code=500, detail=f"Template directory not found at {TEMPLATE_DIR}")
+    if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
+         raise HTTPException(status_code=500, detail=f"index.html not found in {TEMPLATE_DIR}")
+    return templates.TemplateResponse("index.html", {"request": request})
+@app.post("/translate/text")
+async def translate_text_endpoint(
+    text: str = Form(...),
+    source_lang: str = Form(...), # e.g., 'en', 'fr', 'auto'
+    target_lang: str = Form("ar") # Default to Arabic
+):
+    """Translates direct text input."""
+    if not text:
+        raise HTTPException(status_code=400, detail="No text provided for translation.")
+    # Allow translation to Arabic or from Arabic
+    # if target_lang != "ar" and source_lang != "ar":
+    #      raise HTTPException(status_code=400, detail="Translation must involve Arabic (either as source or target). Specify 'ar' in source_lang or target_lang.")
+    # Simplified: For now, stick to the primary goal: other -> Arabic
+    if target_lang != "ar":
+         raise HTTPException(status_code=400, detail="Currently, only translation to Arabic (ar) is supported via this endpoint.")
+    try:
+        # Determine actual source language if 'auto' is selected (requires model/library support)
+        actual_source_lang = source_lang # Placeholder
+        # if source_lang == 'auto':
+            # actual_source_lang = detect_language(text) # Needs implementation
+        translated_text = translate_text_internal(text, actual_source_lang, target_lang)
+        return JSONResponse(content={"translated_text": translated_text, "source_lang": actual_source_lang})
+    except HTTPException as http_exc:
+        # Re-raise HTTP exceptions from internal functions
+        raise http_exc
+    except Exception as e:
+        print(f"Unexpected error in /translate/text: {e}")
+        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during text translation: {e}")
+@app.post("/translate/document")
+async def translate_document_endpoint(
+    file: UploadFile = File(...),
+    source_lang: str = Form(...), # e.g., 'en', 'fr', 'auto'
+    target_lang: str = Form("ar") # Default to Arabic
+):
+    """Translates text extracted from an uploaded document."""
+    # Allow translation to Arabic or from Arabic
+    # if target_lang != "ar" and source_lang != "ar":
+    #      raise HTTPException(status_code=400, detail="Document translation must involve Arabic (either as source or target). Specify 'ar' in source_lang or target_lang.")
+    # Simplified: For now, stick to the primary goal: other -> Arabic
+    if target_lang != "ar":
+         raise HTTPException(status_code=400, detail="Currently, only document translation to Arabic (ar) is supported.")
+    # Ensure upload directory exists
+    if not os.path.exists(UPLOAD_DIR):
+        try:
+            os.makedirs(UPLOAD_DIR)
+        except OSError as e:
+             raise HTTPException(status_code=500, detail=f"Could not create upload directory: {e}")
+    # Create a safe temporary file path
+    temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{file.filename}")
+    try:
+        # Save the uploaded file temporarily
+        with open(temp_file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        # Extract text based on content type
+        extracted_text = extract_text_from_file(temp_file_path, file.content_type)
+        # Note: extract_text_from_file now raises HTTPException on errors or unsupported types
+        if not extracted_text:
+            # This case might be less likely if extract_text_from_file handles errors robustly
+            # but keep it as a safeguard.
+            if os.path.exists(temp_file_path):
+                 os.remove(temp_file_path)
+            raise HTTPException(status_code=400, detail="Could not extract any text from the document.")
+        # Determine actual source language if 'auto' (requires model/library support)
+        actual_source_lang = source_lang # Placeholder
+        # if source_lang == 'auto':
+            # actual_source_lang = detect_language(extracted_text) # Needs implementation
+        # Translate the extracted text
+        translated_text = translate_text_internal(extracted_text, actual_source_lang, target_lang)
+        # Clean up the temporary file *after* successful processing
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+        return JSONResponse(content={
+            "original_filename": file.filename,
+            "detected_source_lang": actual_source_lang,
+            "translated_text": translated_text
+        })
+    except HTTPException as http_exc:
+        # Clean up temp file if it exists on known errors
+        if os.path.exists(temp_file_path):
+             try:
+                 os.remove(temp_file_path)
+             except:
+                 pass
+        raise http_exc # Re-raise the exception
+    except Exception as e:
+        # Clean up temp file on unexpected errors
+        if os.path.exists(temp_file_path):
+             try:
+                 os.remove(temp_file_path)
+             except:
+                 pass
+        raise HTTPException(status_code=500, detail=f"An unexpected error occurred processing the document: {e}")
+# --- Optional: Add endpoint for reverse translation (Arabic to other) ---
+# @app.post("/translate/reverse")
+# async def translate_reverse_endpoint(text: str = Form(...), target_lang: str = Form(...)):
+#     # Implement logic similar to translate_text_endpoint but with source="ar"
+#     # You'll need a model capable of ar -> target_lang translation
+#     pass
+# --- Run the server (for local development) ---
+if __name__ == "__main__":
+    import uvicorn
+    # Make sure to install PyMuPDF, python-docx etc. if testing locally:
+    # pip install -r requirements.txt (from backend directory)
+    print(f"Template Directory: {TEMPLATE_DIR}")
+    print(f"Static Directory: {STATIC_DIR}")
+    print(f"Upload Directory: {UPLOAD_DIR}")
+    # Ensure necessary directories exist for local run
+    if not os.path.exists(TEMPLATE_DIR): os.makedirs(TEMPLATE_DIR)
+    if not os.path.exists(STATIC_DIR): os.makedirs(STATIC_DIR)
+    if not os.path.exists(UPLOAD_DIR): os.makedirs(UPLOAD_DIR)
+    # Create dummy index.html if it doesn't exist for local run
+    if not os.path.exists(os.path.join(TEMPLATE_DIR, "index.html")):
+        with open(os.path.join(TEMPLATE_DIR, "index.html"), "w") as f:
+            f.write("<html><body><h1>Placeholder Frontend</h1></body></html>")
+    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)

backend/requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+fastapi
+uvicorn[standard]
+python-multipart
+transformers[torch]
+# Document parsing libraries imported by backend/main.py (extract_text_from_file):
+python-docx
+openpyxl
+python-pptx
+PyMuPDF
+sentencepiece # Often needed for tokenizers
+sacremoses # Often needed for translation models