import json


def get_translate_prompt(article_title, artice_summary, content_format, original_content, target_lang, preference_prompt=None):
    """Build the LLM prompt for translating Wikipedia content.

    The prompt is assembled from a fixed template (task description, article
    context, instructions, optional Arabic-Extended guidelines, optional user
    preferences, output format) and then filled in with ``str.format``.

    Args:
        article_title: Title of the Wikipedia article.
        artice_summary: Summary of the Wikipedia article. The parameter name
            is misspelled but kept as-is so keyword callers keep working.
        content_format: Format of the content (e.g., XML, Text).
        original_content: The content to be translated.
        target_lang: The target language for translation. The values
            "ar-x-extended" and "Arabic-Extended" add extra alphabet
            guidelines to the prompt.
        preference_prompt: Optional additional user preferences for the
            translation. Ignored when None, empty, or whitespace-only.
            Inserted verbatim; braces in it are not treated as placeholders.

    Returns:
        The fully formatted prompt string.
    """
    # Define the prompt template
    translate_prompt = (
        "# Task\n"
        "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
        "Your task is to translate the provided {content_format} content into {target_lang} while preserving the "
        "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"
        "# Article Original Title\n"
        "{article_title}\n\n"
        "# Article Summary\n"
        "{article_summary}\n\n"
        "# Article Original Content ({content_format} format)\n"
        "{original_content}\n\n"
        "# Target Language\n"
        "{target_lang}\n\n"
        "# Instructions\n"
        "1. If provided input is XML code, Preserve all XML tags, attributes, and structure exactly as they appear\n"
        "2. If provided input is XML code, Translate only the text content between XML tags\n"
        "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
        "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
        "5. Adapt cultural references or idioms to be understandable in the target language\n"
        "6. Use terminology consistent with the {target_lang} Wikipedia for similar topics\n"
        "7. Maintain the same paragraph structure and information hierarchy\n"
    )

    # Add special instructions for Arabic-Extended
    if target_lang in ("ar-x-extended", "Arabic-Extended"):
        translate_prompt += (
            "\n# Arabic-Extended Alphabet Guidelines\n"
            "When translating to Arabic-Extended, use the extended Arabic alphabet ONLY for entity names "
            "(people, places, brands, foreign terms) that contain sounds not in standard Arabic. Use these special characters:\n\n"
            "- ڤ (V): Use for 'v' sound in foreign names instead of ف\n"
            "- پ (P): Use for 'p' sound in foreign names instead of ب\n"
            "- چ (Ch): Use for 'ch' sound in foreign names instead of تش\n"
            "- گ (G): Use for 'g' sound in foreign names instead of ج/غ/ك\n"
            "- ژ (Zh): Use for 'zh/j' sound in foreign names instead of ز/ج\n"
            "- ڠ (ng): Use for 'ng' sound in foreign names instead of نج/نغ\n\n"
            "Examples:\n"
            "- 'Vancouver' → 'ڤانكوڤر' (using ڤ for V)\n"
            "- 'Pakistan' → 'پاكستان' (using پ for P)\n"
            "- 'Chicago' → 'چيكاغو' (using چ for Ch)\n"
            "- 'Google' → 'گوگل' (using گ for G)\n\n"
            "Important: Use these extended characters ONLY for entity names. Use standard Arabic for all other content.\n"
        )

    # Add user preference prompt if provided
    if preference_prompt and preference_prompt.strip():
        # The template is passed through str.format() below, so any braces in
        # the user's text must be doubled; otherwise "{x}" would raise
        # KeyError/ValueError or be substituted as a template field.
        escaped_preferences = preference_prompt.replace("{", "{{").replace("}", "}}")
        translate_prompt += (
            "\n# Additional Translation Preferences\n"
            + escaped_preferences
            + "\n"
        )

    # Add the output format section (doubled braces render as literal JSON braces)
    translate_prompt += (
        "\n# Output Format\n"
        "Return a single JSON object with the following structure:\n"
        "```json\n"
        "{{\n"
        " \"output_content\": \"The complete translated {content_format} content with all tags preserved\"\n"
        "}}\n"
        "```\n\n"
        "# Translation Quality Guidelines\n"
        "- Accuracy: Ensure factual information is preserved exactly\n"
        "- Completeness: Translate all content, don't summarize or omit information\n"
        "- Consistency: Use consistent terminology throughout the article\n"
        "- Fluency: Produce natural-sounding text in the target language\n"
        "- Formatting: Preserve all formatting elements, including lists, tables, and emphasis\n"
        "# Output json\n"
        "```json\n"
    )

    # Format the prompt with the provided values. Substituted values are
    # inserted as-is, so braces inside e.g. original_content are safe.
    formatted_prompt = translate_prompt.format(
        article_title=article_title,
        article_summary=artice_summary,
        original_content=original_content,
        target_lang=target_lang,
        content_format=content_format,
    )

    return formatted_prompt