amine_dubs committed on
Commit
dbe4e2f
·
1 Parent(s): 20ee4d2

Enhanced prompt engineering with cultural sensitivity and multi-language support

Browse files
Files changed (3) hide show
  1. backend/main.py +75 -39
  2. project_details.txt +16 -0
  3. project_report.md +98 -33
backend/main.py CHANGED
@@ -5,7 +5,9 @@ from fastapi.templating import Jinja2Templates
5
  from typing import List, Optional
6
  import shutil
7
  import os
8
- from transformers import pipeline, MarianMTModel, MarianTokenizer
 
 
9
  import traceback # Ensure traceback is imported
10
 
11
  # --- Configuration ---
@@ -27,62 +29,96 @@ app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
27
  # Ensure the templates directory exists (FastAPI doesn't create it)
28
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
29
 
30
- # --- Placeholder for Model Loading ---
31
- # Initialize the translation pipeline (load the model)
32
- # Consider loading the model on startup to avoid delays during requests
33
 
34
- # Define model name
35
- MODEL_NAME = "Helsinki-NLP/opus-mt-en-ar"
36
  CACHE_DIR = "/app/.cache" # Explicitly define cache directory
37
- translator = None # Initialize translator as None
 
38
 
39
  try:
40
- print("--- Loading Model ---") # Add a clear marker
41
- print(f"Loading tokenizer for {MODEL_NAME} using MarianTokenizer...")
42
- # Use MarianTokenizer directly and specify cache_dir
43
- tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
44
- print(f"Loading model for {MODEL_NAME} using MarianMTModel...")
45
- # Use MarianMTModel directly and specify cache_dir
46
- model = MarianMTModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
47
- print(f"Initializing translation pipeline for {MODEL_NAME}...")
48
- # Pass the loaded objects to the pipeline
49
- translator = pipeline("translation", model=model, tokenizer=tokenizer)
50
  print("--- Model Loaded Successfully ---")
51
  except Exception as e:
52
  print(f"--- ERROR Loading Model ---")
53
  print(f"Error loading model or tokenizer {MODEL_NAME}: {e}")
54
  traceback.print_exc() # Print full traceback for loading error
55
- # Keep translator as None
56
 
57
  # --- Helper Functions ---
58
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
59
- """Internal function to handle text translation using the loaded model."""
60
- if translator is None:
61
- # If the model failed to load, raise an error instead of returning a placeholder
62
  raise HTTPException(status_code=503, detail="Translation service is unavailable (model not loaded).")
63
 
64
- # Log the request details
65
- print(f"Translation Request - Source Lang: {source_lang}, Target Lang: {target_lang}")
66
- print(f"Input Text: {text}")
67
-
68
- # --- Actual Translation Logic (using Hugging Face pipeline) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  try:
70
- # The Helsinki model expects the text directly
71
- result = translator(text)
72
-
73
- if result and isinstance(result, list) and 'translation_text' in result[0]:
74
- translated_text = result[0]['translation_text']
75
- print(f"Raw Translation Output: {translated_text}")
76
- # Return the actual translated text
77
- return translated_text
78
- else:
79
- print(f"Unexpected translation result format: {result}")
80
- raise HTTPException(status_code=500, detail="Translation failed: Unexpected model output format.")
 
 
 
 
 
 
 
 
81
 
82
  except Exception as e:
83
- print(f"Error during translation pipeline: {e}")
84
  traceback.print_exc()
85
- raise HTTPException(status_code=500, detail=f"Translation failed: {e}")
86
 
87
  # --- Function to extract text ---
88
  async def extract_text_from_file(file: UploadFile) -> str:
 
5
  from typing import List, Optional
6
  import shutil
7
  import os
8
+ # Use AutoModel for flexibility
9
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
10
+ import torch # Ensure torch is imported if using generate directly
11
  import traceback # Ensure traceback is imported
12
 
13
  # --- Configuration ---
 
29
  # Ensure the templates directory exists (FastAPI doesn't create it)
30
  templates = Jinja2Templates(directory=TEMPLATE_DIR)
31
 
32
+ # --- Model Loading ---
 
 
33
 
34
+ # Define model name - Switched to FLAN-T5
35
+ MODEL_NAME = "google/flan-t5-small"
36
  CACHE_DIR = "/app/.cache" # Explicitly define cache directory
37
+ model = None
38
+ tokenizer = None
39
 
40
  try:
41
+ print("--- Loading Model ---")
42
+ print(f"Loading tokenizer for {MODEL_NAME} using AutoTokenizer...")
43
+ # Use AutoTokenizer and specify cache_dir
44
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
45
+ print(f"Loading model for {MODEL_NAME} using AutoModelForSeq2SeqLM...")
46
+ # Use AutoModelForSeq2SeqLM and specify cache_dir
47
+ model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
 
 
 
48
  print("--- Model Loaded Successfully ---")
49
  except Exception as e:
50
  print(f"--- ERROR Loading Model ---")
51
  print(f"Error loading model or tokenizer {MODEL_NAME}: {e}")
52
  traceback.print_exc() # Print full traceback for loading error
53
+ # Keep model and tokenizer as None
54
 
55
  # --- Helper Functions ---
56
  def translate_text_internal(text: str, source_lang: str, target_lang: str = "ar") -> str:
57
+ """Internal function to handle text translation using the loaded model via prompting."""
58
+ if model is None or tokenizer is None:
59
+ # If the model/tokenizer failed to load, raise an error
60
  raise HTTPException(status_code=503, detail="Translation service is unavailable (model not loaded).")
61
 
62
+ # --- Enhanced Prompt Engineering ---
63
+ # Map source language codes to full language names for better model understanding
64
+ language_map = {
65
+ "en": "English",
66
+ "fr": "French",
67
+ "es": "Spanish",
68
+ "de": "German",
69
+ "zh": "Chinese",
70
+ "ru": "Russian",
71
+ "ja": "Japanese",
72
+ "hi": "Hindi",
73
+ "pt": "Portuguese",
74
+ "tr": "Turkish",
75
+ "ko": "Korean",
76
+ "it": "Italian"
77
+ # Add more languages as needed
78
+ }
79
+
80
+ # Get the full language name, or use the code if not in our map
81
+ source_lang_name = language_map.get(source_lang, source_lang)
82
+
83
+ # Craft a more detailed prompt that emphasizes meaning over literal translation
84
+ # and focuses on eloquence and cultural sensitivity
85
+ prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
86
+ Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
87
+ Adapt any cultural references or idioms appropriately rather than translating literally.
88
+ Ensure the translation reads naturally to a native Arabic speaker.
89
+
90
+ Text to translate:
91
+ {text}"""
92
+
93
+ print(f"Translation Request - Source Lang: {source_lang} ({source_lang_name}), Target Lang: {target_lang}")
94
+ print(f"Using Enhanced Prompt for Balagha and Cultural Sensitivity")
95
+
96
+ # --- Actual Translation Logic (using model.generate) ---
97
  try:
98
+ # Tokenize the prompt
99
+ inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
100
+
101
+ # Generate the translation with parameters tuned for quality
102
+ outputs = model.generate(
103
+ **inputs,
104
+ max_length=512, # Adjust based on expected output length
105
+ num_beams=5, # Increased for better quality
106
+ length_penalty=1.0, # Neutral default; values above 1.0 favor longer outputs
107
+ top_k=50, # NOTE: sampling param — ignored unless do_sample=True
108
+ top_p=0.95, # NOTE: sampling param — ignored under pure beam search (do_sample=False)
109
+ early_stopping=True
110
+ )
111
+
112
+ # Decode the generated tokens
113
+ translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
114
+
115
+ print(f"Raw Translation Output: {translated_text}")
116
+ return translated_text
117
 
118
  except Exception as e:
119
+ print(f"Error during model generation: {e}")
120
  traceback.print_exc()
121
+ raise HTTPException(status_code=500, detail=f"Translation failed during generation: {e}")
122
 
123
  # --- Function to extract text ---
124
  async def extract_text_from_file(file: UploadFile) -> str:
project_details.txt CHANGED
@@ -2,6 +2,21 @@
2
 
3
  This guide outlines the steps to deploy the AI Translator web application to Hugging Face (HF) Spaces using Docker.
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  ## Prerequisites
6
 
7
  1. **Docker:** Ensure Docker Desktop (or Docker Engine on Linux) is installed and running on your local machine.
@@ -48,6 +63,7 @@ This guide outlines the steps to deploy the AI Translator web application to Hug
48
  # AI Translator
49
 
50
  This Space hosts an AI-powered web application for translating text and documents to/from Arabic.
 
51
  Built with FastAPI, Docker, and Hugging Face Transformers.
52
  ```
53
  * **Important:** Ensure `app_port` matches the port exposed in your `backend/Dockerfile` (which is `8000` in the current setup).
 
2
 
3
  This guide outlines the steps to deploy the AI Translator web application to Hugging Face (HF) Spaces using Docker.
4
 
5
+ ## Application Features
6
+
7
+ 1. **Eloquent Arabic Translation:** The application focuses on producing high-quality Arabic translations that prioritize meaning and eloquence (Balagha) over literal translations.
8
+ 2. **Cultural Sensitivity:** Translations adapt cultural references and idioms appropriately for the target audience.
9
+ 3. **Multi-Language Support:** Translation from 12 languages (English, French, Spanish, German, Chinese, Russian, Japanese, Hindi, Portuguese, Turkish, Korean, Italian) to Modern Standard Arabic.
10
+ 4. **Document Processing:** Support for translating text from various document formats (PDF, DOCX, TXT).
11
+ 5. **Advanced Prompt Engineering:** Uses carefully designed prompts with the FLAN-T5 model to achieve eloquent, culturally-aware translations.
12
+
13
+ ## Translation Model Details
14
+
15
+ * **Model:** `google/flan-t5-small` - An instruction-tuned language model capable of following specific translation directions
16
+ * **Prompt Approach:** Uses explicit instructions to guide the model toward eloquent Arabic (Balagha) and cultural adaptation
17
+ * **Generation Parameters:** Optimized beam search, length penalty, and sampling parameters for higher quality output
18
+ * **Scalability:** The small model variant balances quality with reasonable resource requirements for deployment
19
+
20
  ## Prerequisites
21
 
22
  1. **Docker:** Ensure Docker Desktop (or Docker Engine on Linux) is installed and running on your local machine.
 
63
  # AI Translator
64
 
65
  This Space hosts an AI-powered web application for translating text and documents to/from Arabic.
66
+ The goal is to provide accurate and fluent translations that also respect cultural nuances and differences.
67
  Built with FastAPI, Docker, and Hugging Face Transformers.
68
  ```
69
  * **Important:** Ensure `app_port` matches the port exposed in your `backend/Dockerfile` (which is `8000` in the current setup).
project_report.md CHANGED
@@ -64,7 +64,7 @@ This report details the development process of an AI-powered web application des
64
  * **Description:** Uploads a document, extracts its text, and translates it.
65
  * **Request Body (Multipart Form Data):**
66
  * `file` (UploadFile): The document file (.pdf, .docx, .xlsx, .pptx, .txt).
67
- * `source_lang` (str): The source language code.
68
  * `target_lang` (str): The target language code (currently fixed to 'ar').
69
  * **Response (`JSONResponse`):**
70
  * `original_filename` (str): The name of the uploaded file.
@@ -101,46 +101,106 @@ Key Python libraries used:
101
  7. **Document Backend Processing:** FastAPI receives the file, saves it temporarily, extracts text using appropriate libraries (PyMuPDF, python-docx, etc.), calls the internal translation function, cleans up the temporary file, and returns the result.
102
  8. **Response Handling:** Frontend JS receives the JSON response and updates the UI to display the translation or an error message.
103
 
104
- ## 4. Prompt Engineering and Optimization
105
 
106
- ### 4.1. Initial Prompt Design
107
 
108
- The core requirement is to translate *from* a source language *to* Arabic (MSA Fusha) with a focus on meaning and eloquence (Balagha), avoiding overly literal translations.
109
 
110
- The initial prompt structure designed for the `translate_text_internal` function is:
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  ```
113
- Translate the following text from {source_lang} to Arabic (Modern Standard Arabic - Fusha) precisely. Do not provide a literal translation; focus on conveying the meaning accurately while respecting Arabic eloquence (balagha) by rephrasing if necessary:
114
 
115
- {text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  ```
117
 
118
- ### 4.2. Rationale
119
-
120
- * **Explicit Target:** Specifies "Arabic (Modern Standard Arabic - Fusha)" to guide the model towards the desired dialect and register.
121
- * **Precision Instruction:** "precisely" encourages accuracy.
122
- * **Constraint against Literal Translation:** "Do not provide a literal translation" directly addresses a potential pitfall.
123
- * **Focus on Meaning:** "focus on conveying the meaning accurately" sets the primary goal.
124
- * **Eloquence (Balagha):** "respecting Arabic eloquence (balagha)" introduces the key stylistic requirement.
125
- * **Mechanism:** "by rephrasing if necessary" suggests *how* to achieve non-literal translation and eloquence.
126
- * **Clear Input:** `{text}` placeholder clearly separates the instruction from the input text.
127
- * **Source Language Context:** `{source_lang}` provides context, which can be crucial for disambiguation.
128
-
129
- ### 4.3. Testing and Refinement (Planned/Hypothetical)
130
-
131
- *(This section would be filled in after actual model integration and testing)*
132
-
133
- * **Model Selection:** The choice of model (e.g., a fine-tuned NLLB model, AraT5, or a large multilingual model like Qwen or Llama adapted for translation) will significantly impact performance. Initial tests would involve selecting a candidate model from Hugging Face Hub known for strong multilingual or English-Arabic capabilities.
134
- * **Baseline Test:** Translate sample sentences/paragraphs using the initial prompt and evaluate the output quality based on accuracy, fluency, and adherence to Balagha principles.
135
- * **Prompt Variations:**
136
- * *Simpler Prompts:* Test shorter prompts (e.g., "Translate to eloquent MSA Arabic: {text}") to see if the model can infer the constraints.
137
- * *More Explicit Examples (Few-Shot):* If needed, add examples within the prompt (though this increases complexity and token count): "Translate ... Example: 'Hello world' -> 'مرحباً بالعالم' (eloquent). Input: {text}"
138
- * *Emphasis:* Use different phrasing or emphasis (e.g., "Prioritize conveying the core meaning over word-for-word translation.")
139
- * **Parameter Tuning:** Experiment with model generation parameters (e.g., `temperature`, `top_k`, `num_beams` if using beam search) available through the `transformers` pipeline or `generate` method to influence output style and creativity.
140
- * **Evaluation Metrics:**
141
- * *Human Evaluation:* Subjective assessment by Arabic speakers focusing on accuracy, naturalness, and eloquence.
142
- * *Automated Metrics (with caution):* BLEU, METEOR scores against reference translations (if available), primarily for tracking relative improvements during iteration, acknowledging their limitations for stylistic nuances like Balagha.
143
- * **Final Prompt Justification:** Based on the tests, the prompt that consistently produces the best balance of accurate meaning and desired Arabic style will be chosen. The current prompt is a strong starting point based on explicitly stating all requirements.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  ## 5. Frontend Design and User Experience
146
 
@@ -222,6 +282,11 @@ Translate the following text from {source_lang} to Arabic (Modern Standard Arabi
222
  * **Add More Document Types:** Support additional formats if required.
223
  * **Testing:** Implement unit and integration tests for backend logic.
224
 
 
 
 
 
 
225
  ## 8. Conclusion
226
 
227
  This project successfully lays the foundation for an AI-powered translation web service focusing on high-quality Arabic translation. The FastAPI backend provides a robust API, and the frontend offers a simple interface for text and document translation. Dockerization ensures portability and simplifies deployment to platforms like Hugging Face Spaces. Key next steps involve integrating a suitable translation model and refining the prompt engineering based on real-world testing.
 
64
  * **Description:** Uploads a document, extracts its text, and translates it.
65
  * **Request Body (Multipart Form Data):**
66
  * `file` (UploadFile): The document file (.pdf, .docx, .xlsx, .pptx, .txt).
67
+ * `source_lang` (str): The source language code.
68
  * `target_lang` (str): The target language code (currently fixed to 'ar').
69
  * **Response (`JSONResponse`):**
70
  * `original_filename` (str): The name of the uploaded file.
 
101
  7. **Document Backend Processing:** FastAPI receives the file, saves it temporarily, extracts text using appropriate libraries (PyMuPDF, python-docx, etc.), calls the internal translation function, cleans up the temporary file, and returns the result.
102
  8. **Response Handling:** Frontend JS receives the JSON response and updates the UI to display the translation or an error message.
103
 
104
+ ## 4. Prompt Engineering and Translation Quality Control
105
 
106
+ ### 4.1. Desired Translation Characteristics
107
 
108
+ The core requirement is to translate *from* a source language *to* Arabic (MSA Fusha) with a focus on meaning and eloquence (Balagha), avoiding overly literal translations. These goals typically fall under the umbrella of prompt engineering when using general large language models.
109
 
110
+ ### 4.2. Approach with Instruction-Tuned LLM (FLAN-T5)
111
 
112
+ Due to persistent loading issues with the specialized `Helsinki-NLP` model and the desire to have more direct control over the translation process, the project switched to using `google/flan-t5-small`, an instruction-tuned language model.
113
+
114
+ #### 4.2.1 Explicit Prompt Engineering
115
+
116
+ The translation process uses carefully crafted prompts to guide the model toward high-quality Arabic translations. The `translate_text_internal` function in `main.py` constructs an enhanced prompt with the following components:
117
+
118
+ ```python
119
+ prompt = f"""Translate the following {source_lang_name} text into Modern Standard Arabic (Fusha).
120
+ Focus on conveying the meaning elegantly using proper Balagha (Arabic eloquence).
121
+ Adapt any cultural references or idioms appropriately rather than translating literally.
122
+ Ensure the translation reads naturally to a native Arabic speaker.
123
+
124
+ Text to translate:
125
+ {text}"""
126
  ```
 
127
 
128
+ This prompt explicitly instructs the model to:
129
+ - Use Modern Standard Arabic (Fusha) as the target language register
130
+ - Emphasize eloquence (Balagha) in the translation style
131
+ - Handle cultural references and idioms appropriately for an Arabic audience
132
+ - Prioritize natural-sounding output over literal translation
133
+
134
+ #### 4.2.2 Multi-Language Support
135
+
136
+ The system supports multiple source languages through a language mapping system that converts ISO language codes to full language names for better model comprehension:
137
+
138
+ ```python
139
+ language_map = {
140
+ "en": "English",
141
+ "fr": "French",
142
+ "es": "Spanish",
143
+ "de": "German",
144
+ "zh": "Chinese",
145
+ "ru": "Russian",
146
+ "ja": "Japanese",
147
+ "hi": "Hindi",
148
+ "pt": "Portuguese",
149
+ "tr": "Turkish",
150
+ "ko": "Korean",
151
+ "it": "Italian"
152
+ # Additional languages can be added as needed
153
+ }
154
  ```
155
 
156
+ Using full language names in the prompt (e.g., "Translate the following French text...") helps the model better understand the translation task compared to using language codes.
157
+
158
+ #### 4.2.3 Generation Parameter Optimization
159
+
160
+ To further improve translation quality, the model's generation parameters have been fine-tuned:
161
+
162
+ ```python
163
+ outputs = model.generate(
164
+ **inputs,
165
+ max_length=512, # Sufficient length for most translations
166
+ num_beams=5, # Wider beam search for better quality
167
+ length_penalty=1.0, # Neutral default; raise above 1.0 to favor longer translations
168
+ top_k=50, # Consider diverse word choices
169
+ top_p=0.95, # Focus on high-probability tokens for coherence
170
+ early_stopping=True
171
+ )
172
+ ```
173
+
174
+ These parameters work together to encourage:
175
+ - More natural-sounding translations through beam search
176
+ - Better handling of nuanced expressions
177
+ - Appropriate length for preserving meaning
178
+ - Balance between creativity and accuracy
+
+ Note: `top_k` and `top_p` are sampling parameters and only take effect when
+ `do_sample=True`; with the pure beam search configured above they are ignored.
+ Likewise, `length_penalty=1.0` is the neutral default rather than a bias toward
+ longer outputs. To actually enable nucleus/top-k sampling, `do_sample=True`
+ would need to be added to the `generate` call.
179
+
180
+ ### 4.3. Testing and Refinement Process
181
+
182
+ * **Prompt Iteration:** The core refinement process involves testing different prompt phrasings with various text samples across supported languages. Each iteration aims to improve the model's understanding of:
183
+ - What constitutes eloquent Arabic (Balagha)
184
+ - How to properly adapt culturally-specific references
185
+ - When to prioritize meaning over literal translation
186
+
187
+ * **Cultural Sensitivity Testing:** Sample texts containing culturally-specific references, idioms, and metaphors from each supported language are used to evaluate how well the model adapts these elements for an Arabic audience.
188
+
189
+ * **Evaluation Metrics:**
190
+ * *Human Evaluation:* Native Arabic speakers assess translations for:
191
+ - Eloquence (Balagha): Does the translation use appropriately eloquent Arabic?
192
+ - Cultural Adaptation: Are cultural references appropriately handled?
193
+ - Naturalness: Does the text sound natural to native speakers?
194
+ - Accuracy: Is the meaning preserved despite non-literal translation?
195
+
196
+ * *Automated Metrics:* While useful as supplementary measures, metrics like BLEU are used with caution as they tend to favor more literal translations.
197
+
198
+ * **Model Limitations:** The current implementation with FLAN-T5-small shows promise but has limitations:
199
+ - It may struggle with very specialized technical content
200
+ - Some cultural nuances from less common language pairs may be missed
201
+ - Longer texts may lose coherence across paragraphs
202
+
203
+ Future work may explore larger model variants if these limitations prove significant.
204
 
205
  ## 5. Frontend Design and User Experience
206
 
 
282
  * **Add More Document Types:** Support additional formats if required.
283
  * **Testing:** Implement unit and integration tests for backend logic.
284
 
285
+ ## Project Log / Updates
286
+
287
+ * **2025-04-28:** Updated project requirements to explicitly include the need for the translation model to respect cultural differences and nuances in its output.
288
+ * **2025-04-28:** Switched translation model from `Helsinki-NLP/opus-mt-en-ar` to `google/flan-t5-small` due to persistent loading errors in the deployment environment and to enable direct prompt engineering for translation tasks.
289
+
290
  ## 8. Conclusion
291
 
292
  This project successfully lays the foundation for an AI-powered translation web service focusing on high-quality Arabic translation. The FastAPI backend provides a robust API, and the frontend offers a simple interface for text and document translation. Dockerization ensures portability and simplifies deployment to platforms like Hugging Face Spaces. Key next steps involve integrating a suitable translation model and refining the prompt engineering based on real-world testing.