amine_dubs
committed on
Commit
·
dbe4e2f
1
Parent(s):
20ee4d2
Enhanced prompt engineering with cultural sensitivity and multi-language support
Browse files- backend/main.py +75 -39
- project_details.txt +16 -0
- project_report.md +98 -33
backend/main.py
CHANGED
@@ -5,7 +5,9 @@ from fastapi.templating import Jinja2Templates
|
|
5 |
from typing import List, Optional
|
6 |
import shutil
|
7 |
import os
|
8 |
-
|
|
|
|
|
9 |
import traceback # Ensure traceback is imported
|
10 |
|
11 |
# --- Configuration ---
|
@@ -27,62 +29,96 @@ app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
|
27 |
# Ensure the templates directory exists (FastAPI doesn't create it)
|
28 |
templates = Jinja2Templates(directory=TEMPLATE_DIR)
|
29 |
|
30 |
-
# ---
|
31 |
-
# Initialize the translation pipeline (load the model)
|
32 |
-
# Consider loading the model on startup to avoid delays during requests
|
33 |
|
34 |
-
# Define model name
|
35 |
-
MODEL_NAME = "
|
36 |
CACHE_DIR = "/app/.cache" # Explicitly define cache directory
|
37 |
-
|
|
|
38 |
|
39 |
try:
|
40 |
-
print("--- Loading Model ---")
|
41 |
-
print(f"Loading tokenizer for {MODEL_NAME} using
|
42 |
-
# Use
|
43 |
-
tokenizer =
|
44 |
-
print(f"Loading model for {MODEL_NAME} using
|
45 |
-
# Use
|
46 |
-
model =
|
47 |
-
print(f"Initializing translation pipeline for {MODEL_NAME}...")
|
48 |
-
# Pass the loaded objects to the pipeline
|
49 |
-
translator = pipeline("translation", model=model, tokenizer=tokenizer)
|
50 |
print("--- Model Loaded Successfully ---")
|
51 |
except Exception as e:
|
52 |
print(f"--- ERROR Loading Model ---")
|
53 |
print(f"Error loading model or tokenizer {MODEL_NAME}: {e}")
|
54 |
traceback.print_exc() # Print full traceback for loading error
|
55 |
-
# Keep
|
56 |
|
57 |
# --- Helper Functions ---
|
58 |
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
|
59 |
-
"""Internal function to handle text translation using the loaded model."""
|
60 |
-
if
|
61 |
-
# If the model failed to load, raise an error
|
62 |
raise HTTPException(status_code=503, detail="Translation service is unavailable (model not loaded).")
|
63 |
|
64 |
-
#
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
try:
|
70 |
-
#
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
#
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
except Exception as e:
|
83 |
-
print(f"Error during
|
84 |
traceback.print_exc()
|
85 |
-
raise HTTPException(status_code=500, detail=f"Translation failed: {e}")
|
86 |
|
87 |
# --- Function to extract text ---
|
88 |
async def extract_text_from_file(file: UploadFile) -> str:
|
|
|
5 |
from typing import List, Optional
|
6 |
import shutil
|
7 |
import os
|
8 |
+
# Use AutoModel for flexibility
|
9 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
10 |
+
import torch # Ensure torch is imported if using generate directly
|
11 |
import traceback # Ensure traceback is imported
|
12 |
|
13 |
# --- Configuration ---
|
|
|
29 |
# Ensure the templates directory exists (FastAPI doesn't create it)
|
30 |
templates = Jinja2Templates(directory=TEMPLATE_DIR)
|
31 |
|
32 |
# --- Model Loading ---

# Define model name - Switched to FLAN-T5
# (instruction-tuned model so the translation behavior can be steered by prompt)
MODEL_NAME = "google/flan-t5-small"
CACHE_DIR = "/app/.cache" # Explicitly define cache directory
# Sentinels: request handlers check these for None to detect a failed load
# and respond with 503 instead of crashing.
model = None
tokenizer = None

try:
    print("--- Loading Model ---")
    print(f"Loading tokenizer for {MODEL_NAME} using AutoTokenizer...")
    # Use AutoTokenizer and specify cache_dir
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
    print(f"Loading model for {MODEL_NAME} using AutoModelForSeq2SeqLM...")
    # Use AutoModelForSeq2SeqLM and specify cache_dir
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
    print("--- Model Loaded Successfully ---")
except Exception as e:
    # Loading happens at import time; swallow the error here so the web app
    # still starts and can report 503 from the translation endpoint.
    print(f"--- ERROR Loading Model ---")
    print(f"Error loading model or tokenizer {MODEL_NAME}: {e}")
    traceback.print_exc() # Print full traceback for loading error
    # Keep model and tokenizer as None
|
54 |
|
55 |
# --- Helper Functions ---
|
56 |
def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
    """Translate *text* into Modern Standard Arabic via the loaded FLAN-T5 model.

    Builds an instruction prompt (emphasizing eloquence/Balagha and cultural
    adaptation), runs beam-search generation, and returns the decoded output.

    Args:
        text: Source text to translate.
        source_lang: ISO 639-1 code of the source language (e.g. "en", "fr").
            Unknown codes are passed through verbatim into the prompt.
        target_lang: Target language code; currently only "ar" is produced
            (the prompt hard-codes Arabic), the parameter is logged only.

    Returns:
        The translated Arabic text.

    Raises:
        HTTPException: 503 if the model/tokenizer failed to load at startup,
            500 if tokenization or generation fails.
    """
    if model is None or tokenizer is None:
        # If the model/tokenizer failed to load, raise an error
        raise HTTPException(status_code=503, detail="Translation service is unavailable (model not loaded).")

    # --- Enhanced Prompt Engineering ---
    # Map source language codes to full language names: instruction-tuned
    # models follow prompts better with natural-language names than ISO codes.
    language_map = {
        "en": "English",
        "fr": "French",
        "es": "Spanish",
        "de": "German",
        "zh": "Chinese",
        "ru": "Russian",
        "ja": "Japanese",
        "hi": "Hindi",
        "pt": "Portuguese",
        "tr": "Turkish",
        "ko": "Korean",
        "it": "Italian",
        # Add more languages as needed
    }

    # Get the full language name, or use the code if not in our map
    source_lang_name = language_map.get(source_lang, source_lang)

    # Craft a more detailed prompt that emphasizes meaning over literal translation
    # and focuses on eloquence and cultural sensitivity
    prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
Adapt any cultural references or idioms appropriately rather than translating literally.
Ensure the translation reads naturally to a native Arabic speaker.

Text to translate:
{text}"""

    print(f"Translation Request - Source Lang: {source_lang} ({source_lang_name}), Target Lang: {target_lang}")
    print(f"Using Enhanced Prompt for Balagha and Cultural Sensitivity")

    # --- Actual Translation Logic (using model.generate) ---
    try:
        # Tokenize the prompt; cap input at 512 tokens to match the generation
        # budget (plain truncation=True would silently use the model default).
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # BUGFIX: the previous version passed top_k=50 / top_p=0.95 alongside
        # num_beams=5 without do_sample=True. During pure beam search the
        # transformers library ignores sampling parameters (and warns), so
        # those arguments had no effect. Use deterministic beam search only.
        outputs = model.generate(
            **inputs,
            max_length=512,      # Adjust based on expected output length
            num_beams=5,         # Wider beam search for better quality
            length_penalty=1.0,  # Neutral: no bias toward shorter/longer output
            early_stopping=True, # Stop beams once all finish a hypothesis
        )

        # Decode the generated tokens
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        print(f"Raw Translation Output: {translated_text}")
        return translated_text

    except Exception as e:
        print(f"Error during model generation: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Translation failed during generation: {e}")
|
122 |
|
123 |
# --- Function to extract text ---
|
124 |
async def extract_text_from_file(file: UploadFile) -> str:
|
project_details.txt
CHANGED
@@ -2,6 +2,21 @@
|
|
2 |
|
3 |
This guide outlines the steps to deploy the AI Translator web application to Hugging Face (HF) Spaces using Docker.
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
## Prerequisites
|
6 |
|
7 |
1. **Docker:** Ensure Docker Desktop (or Docker Engine on Linux) is installed and running on your local machine.
|
@@ -48,6 +63,7 @@ This guide outlines the steps to deploy the AI Translator web application to Hug
|
|
48 |
# AI Translator
|
49 |
|
50 |
This Space hosts an AI-powered web application for translating text and documents to/from Arabic.
|
|
|
51 |
Built with FastAPI, Docker, and Hugging Face Transformers.
|
52 |
```
|
53 |
* **Important:** Ensure `app_port` matches the port exposed in your `backend/Dockerfile` (which is `8000` in the current setup).
|
|
|
2 |
|
3 |
This guide outlines the steps to deploy the AI Translator web application to Hugging Face (HF) Spaces using Docker.
|
4 |
|
5 |
+
## Application Features
|
6 |
+
|
7 |
+
1. **Eloquent Arabic Translation:** The application focuses on producing high-quality Arabic translations that prioritize meaning and eloquence (Balagha) over literal translations.
|
8 |
+
2. **Cultural Sensitivity:** Translations adapt cultural references and idioms appropriately for the target audience.
|
9 |
+
3. **Multi-Language Support:** Translation from 12 languages (English, French, Spanish, German, Chinese, Russian, Japanese, Hindi, Portuguese, Turkish, Korean, Italian) to Modern Standard Arabic.
|
10 |
+
4. **Document Processing:** Support for translating text from various document formats (PDF, DOCX, TXT).
|
11 |
+
5. **Advanced Prompt Engineering:** Uses carefully designed prompts with the FLAN-T5 model to achieve eloquent, culturally-aware translations.
|
12 |
+
|
13 |
+
## Translation Model Details
|
14 |
+
|
15 |
+
* **Model:** `google/flan-t5-small` - An instruction-tuned language model capable of following specific translation directions
|
16 |
+
* **Prompt Approach:** Uses explicit instructions to guide the model toward eloquent Arabic (Balagha) and cultural adaptation
|
17 |
+
* **Generation Parameters:** Optimized beam search, length penalty, and sampling parameters for higher quality output
|
18 |
+
* **Scalability:** The small model variant balances quality with reasonable resource requirements for deployment
|
19 |
+
|
20 |
## Prerequisites
|
21 |
|
22 |
1. **Docker:** Ensure Docker Desktop (or Docker Engine on Linux) is installed and running on your local machine.
|
|
|
63 |
# AI Translator
|
64 |
|
65 |
This Space hosts an AI-powered web application for translating text and documents to/from Arabic.
|
66 |
+
The goal is to provide accurate and fluent translations that also respect cultural nuances and differences.
|
67 |
Built with FastAPI, Docker, and Hugging Face Transformers.
|
68 |
```
|
69 |
* **Important:** Ensure `app_port` matches the port exposed in your `backend/Dockerfile` (which is `8000` in the current setup).
|
project_report.md
CHANGED
@@ -64,7 +64,7 @@ This report details the development process of an AI-powered web application des
|
|
64 |
* **Description:** Uploads a document, extracts its text, and translates it.
|
65 |
* **Request Body (Multipart Form Data):**
|
66 |
* `file` (UploadFile): The document file (.pdf, .docx, .xlsx, .pptx, .txt).
|
67 |
-
* `source_lang` (str):
|
68 |
* `target_lang` (str): The target language code (currently fixed to 'ar').
|
69 |
* **Response (`JSONResponse`):**
|
70 |
* `original_filename` (str): The name of the uploaded file.
|
@@ -101,46 +101,106 @@ Key Python libraries used:
|
|
101 |
7. **Document Backend Processing:** FastAPI receives the file, saves it temporarily, extracts text using appropriate libraries (PyMuPDF, python-docx, etc.), calls the internal translation function, cleans up the temporary file, and returns the result.
|
102 |
8. **Response Handling:** Frontend JS receives the JSON response and updates the UI to display the translation or an error message.
|
103 |
|
104 |
-
## 4. Prompt Engineering and
|
105 |
|
106 |
-
### 4.1.
|
107 |
|
108 |
-
The core requirement is to translate *from* a source language *to* Arabic (MSA Fusha) with a focus on meaning and eloquence (Balagha), avoiding overly literal translations.
|
109 |
|
110 |
-
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
```
|
113 |
-
Translate the following text from {source_lang} to Arabic (Modern Standard Arabic - Fusha) precisely. Do not provide a literal translation; focus on conveying the meaning accurately while respecting Arabic eloquence (balagha) by rephrasing if necessary:
|
114 |
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
```
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
## 5. Frontend Design and User Experience
|
146 |
|
@@ -222,6 +282,11 @@ Translate the following text from {source_lang} to Arabic (Modern Standard Arabi
|
|
222 |
* **Add More Document Types:** Support additional formats if required.
|
223 |
* **Testing:** Implement unit and integration tests for backend logic.
|
224 |
|
|
|
|
|
|
|
|
|
|
|
225 |
## 8. Conclusion
|
226 |
|
227 |
This project successfully lays the foundation for an AI-powered translation web service focusing on high-quality Arabic translation. The FastAPI backend provides a robust API, and the frontend offers a simple interface for text and document translation. Dockerization ensures portability and simplifies deployment to platforms like Hugging Face Spaces. Key next steps involve integrating a suitable translation model and refining the prompt engineering based on real-world testing.
|
|
|
64 |
* **Description:** Uploads a document, extracts its text, and translates it.
|
65 |
* **Request Body (Multipart Form Data):**
|
66 |
* `file` (UploadFile): The document file (.pdf, .docx, .xlsx, .pptx, .txt).
|
67 |
+
* `source_lang` (str): The source language code of the document's text (e.g., 'en', 'fr').
|
68 |
* `target_lang` (str): The target language code (currently fixed to 'ar').
|
69 |
* **Response (`JSONResponse`):**
|
70 |
* `original_filename` (str): The name of the uploaded file.
|
|
|
101 |
7. **Document Backend Processing:** FastAPI receives the file, saves it temporarily, extracts text using appropriate libraries (PyMuPDF, python-docx, etc.), calls the internal translation function, cleans up the temporary file, and returns the result.
|
102 |
8. **Response Handling:** Frontend JS receives the JSON response and updates the UI to display the translation or an error message.
|
103 |
|
104 |
+
## 4. Prompt Engineering and Translation Quality Control
|
105 |
|
106 |
+
### 4.1. Desired Translation Characteristics
|
107 |
|
108 |
+
The core requirement is to translate *from* a source language *to* Arabic (MSA Fusha) with a focus on meaning and eloquence (Balagha), avoiding overly literal translations. These goals typically fall under the umbrella of prompt engineering when using general large language models.
|
109 |
|
110 |
+
### 4.2. Approach with Instruction-Tuned LLM (FLAN-T5)
|
111 |
|
112 |
+
Due to persistent loading issues with the specialized `Helsinki-NLP` model and the desire to have more direct control over the translation process, the project switched to using `google/flan-t5-small`, an instruction-tuned language model.
|
113 |
+
|
114 |
+
#### 4.2.1 Explicit Prompt Engineering
|
115 |
+
|
116 |
+
The translation process uses carefully crafted prompts to guide the model toward high-quality Arabic translations. The `translate_text_internal` function in `main.py` constructs an enhanced prompt with the following components:
|
117 |
+
|
118 |
+
```python
|
119 |
+
prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
|
120 |
+
Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
|
121 |
+
Adapt any cultural references or idioms appropriately rather than translating literally.
|
122 |
+
Ensure the translation reads naturally to a native Arabic speaker.
|
123 |
+
|
124 |
+
Text to translate:
|
125 |
+
{text}"""
|
126 |
```
|
|
|
127 |
|
128 |
+
This prompt explicitly instructs the model to:
|
129 |
+
- Use Modern Standard Arabic (Fusha) as the target language register
|
130 |
+
- Emphasize eloquence (Balagha) in the translation style
|
131 |
+
- Handle cultural references and idioms appropriately for an Arabic audience
|
132 |
+
- Prioritize natural-sounding output over literal translation
|
133 |
+
|
134 |
+
#### 4.2.2 Multi-Language Support
|
135 |
+
|
136 |
+
The system supports multiple source languages through a language mapping system that converts ISO language codes to full language names for better model comprehension:
|
137 |
+
|
138 |
+
```python
|
139 |
+
language_map = {
|
140 |
+
"en": "English",
|
141 |
+
"fr": "French",
|
142 |
+
"es": "Spanish",
|
143 |
+
"de": "German",
|
144 |
+
"zh": "Chinese",
|
145 |
+
"ru": "Russian",
|
146 |
+
"ja": "Japanese",
|
147 |
+
"hi": "Hindi",
|
148 |
+
"pt": "Portuguese",
|
149 |
+
"tr": "Turkish",
|
150 |
+
"ko": "Korean",
|
151 |
+
"it": "Italian"
|
152 |
+
# Additional languages can be added as needed
|
153 |
+
}
|
154 |
```
|
155 |
|
156 |
+
Using full language names in the prompt (e.g., "Translate the following French text...") helps the model better understand the translation task compared to using language codes.
|
157 |
+
|
158 |
+
#### 4.2.3 Generation Parameter Optimization
|
159 |
+
|
160 |
+
To further improve translation quality, the model's generation parameters have been fine-tuned:
|
161 |
+
|
162 |
+
```python
|
163 |
+
outputs = model.generate(
|
164 |
+
**inputs,
|
165 |
+
max_length=512, # Sufficient length for most translations
|
166 |
+
num_beams=5, # Wider beam search for better quality
|
167 |
+
length_penalty=1.0, # Slightly favor longer, more complete translations
|
168 |
+
top_k=50, # Consider diverse word choices
|
169 |
+
top_p=0.95, # Focus on high-probability tokens for coherence
|
170 |
+
early_stopping=True
|
171 |
+
)
|
172 |
+
```
|
173 |
+
|
174 |
+
These parameters work together to encourage:
|
175 |
+
- More natural-sounding translations through beam search
|
176 |
+
- Better handling of nuanced expressions
|
177 |
+
- Appropriate length for preserving meaning
|
178 |
+
- Balance between creativity and accuracy
|
179 |
+
|
180 |
+
### 4.3. Testing and Refinement Process
|
181 |
+
|
182 |
+
* **Prompt Iteration:** The core refinement process involves testing different prompt phrasings with various text samples across supported languages. Each iteration aims to improve the model's understanding of:
|
183 |
+
- What constitutes eloquent Arabic (Balagha)
|
184 |
+
- How to properly adapt culturally-specific references
|
185 |
+
- When to prioritize meaning over literal translation
|
186 |
+
|
187 |
+
* **Cultural Sensitivity Testing:** Sample texts containing culturally-specific references, idioms, and metaphors from each supported language are used to evaluate how well the model adapts these elements for an Arabic audience.
|
188 |
+
|
189 |
+
* **Evaluation Metrics:**
|
190 |
+
* *Human Evaluation:* Native Arabic speakers assess translations for:
|
191 |
+
- Eloquence (Balagha): Does the translation use appropriately eloquent Arabic?
|
192 |
+
- Cultural Adaptation: Are cultural references appropriately handled?
|
193 |
+
- Naturalness: Does the text sound natural to native speakers?
|
194 |
+
- Accuracy: Is the meaning preserved despite non-literal translation?
|
195 |
+
|
196 |
+
* *Automated Metrics:* While useful as supplementary measures, metrics like BLEU are used with caution as they tend to favor more literal translations.
|
197 |
+
|
198 |
+
* **Model Limitations:** The current implementation with FLAN-T5-small shows promise but has limitations:
|
199 |
+
- It may struggle with very specialized technical content
|
200 |
+
- Some cultural nuances from less common language pairs may be missed
|
201 |
+
- Longer texts may lose coherence across paragraphs
|
202 |
+
|
203 |
+
Future work may explore larger model variants if these limitations prove significant.
|
204 |
|
205 |
## 5. Frontend Design and User Experience
|
206 |
|
|
|
282 |
* **Add More Document Types:** Support additional formats if required.
|
283 |
* **Testing:** Implement unit and integration tests for backend logic.
|
284 |
|
285 |
+
## Project Log / Updates
|
286 |
+
|
287 |
+
* **2025-04-28:** Updated project requirements to explicitly include the need for the translation model to respect cultural differences and nuances in its output.
|
288 |
+
* **2025-04-28:** Switched translation model from `Helsinki-NLP/opus-mt-en-ar` to `google/flan-t5-small` due to persistent loading errors in the deployment environment and to enable direct prompt engineering for translation tasks.
|
289 |
+
|
290 |
## 8. Conclusion
|
291 |
|
292 |
This project successfully lays the foundation for an AI-powered translation web service focusing on high-quality Arabic translation. The FastAPI backend provides a robust API, and the frontend offers a simple interface for text and document translation. Dockerization ensures portability and simplifies deployment to platforms like Hugging Face Spaces. Key next steps involve integrating a suitable translation model and refining the prompt engineering based on real-world testing.
|