File size: 4,858 Bytes
c25ce6b
 
186544d
c25ce6b
94260c3
c25ce6b
186544d
 
 
 
 
 
 
 
c25ce6b
 
 
 
 
94260c3
c25ce6b
 
 
 
 
 
 
 
94260c3
c25ce6b
 
 
 
 
 
94260c3
 
c25ce6b
 
 
 
186544d
 
 
06c3d9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186544d
 
 
 
 
 
 
 
 
 
c25ce6b
c065ba1
 
94260c3
c065ba1
 
c25ce6b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94260c3
 
c25ce6b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json

def get_translate_prompt(article_title, artice_summary, content_format, original_content, target_lang, preference_prompt=None):
    """
    Build the LLM prompt used to translate a piece of Wikipedia article content.

    The prompt is assembled from a fixed template (task description, article
    title/summary/content, target language, instructions), an optional
    Arabic-Extended alphabet section, an optional user-preference section,
    and a trailing output-format section that ends with an opened
    ```json fence for the model to complete.

    Args:
        article_title: Title of the Wikipedia article.
        artice_summary: Summary of the Wikipedia article. (The misspelled
            parameter name is kept so existing keyword callers keep working.)
        content_format: Format of the content (e.g., XML, Text).
        original_content: The content to be translated.
        target_lang: The target language for translation. The values
            "ar-x-extended" and "Arabic-Extended" add the extended-alphabet
            guidelines to the prompt.
        preference_prompt: Optional additional user preferences for
            translation. Ignored when None, empty, or whitespace-only;
            otherwise inserted verbatim (not stripped).

    Returns:
        The fully formatted prompt string.
    """
    # Base template. Single-brace names are placeholders filled in by the
    # final str.format() call; literal braces must be doubled.
    translate_prompt = (
        "# Task\n"
        "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
        "Your task is to translate the provided {content_format} content into {target_lang} while preserving the "
        "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"

        "# Article Original Title\n"
        "{article_title}\n\n"

        "# Article Summary\n"
        "{article_summary}\n\n"

        "# Article Original Content ({content_format} format)\n"
        "{original_content}\n\n"

        "# Target Language\n"
        "{target_lang}\n\n"

        "# Instructions\n"
        "1. If provided input is XML code, Preserve all XML tags, attributes, and structure exactly as they appear\n"
        "2. If provided input is XML code, Translate only the text content between XML tags\n"
        "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
        "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
        "5. Adapt cultural references or idioms to be understandable in the target language\n"
        "6. Use terminology consistent with the {target_lang} Wikipedia for similar topics\n"
        "7. Maintain the same paragraph structure and information hierarchy\n"
    )

    # Add special instructions for Arabic-Extended
    if target_lang in ("ar-x-extended", "Arabic-Extended"):
        translate_prompt += (
            "\n# Arabic-Extended Alphabet Guidelines\n"
            "When translating to Arabic-Extended, use the extended Arabic alphabet ONLY for entity names "
            "(people, places, brands, foreign terms) that contain sounds not in standard Arabic. Use these special characters:\n\n"
            "- ڤ (V): Use for 'v' sound in foreign names instead of ف\n"
            "- پ (P): Use for 'p' sound in foreign names instead of ب\n"
            "- چ (Ch): Use for 'ch' sound in foreign names instead of تش\n"
            "- گ (G): Use for 'g' sound in foreign names instead of ج/غ/ك\n"
            "- ژ (Zh): Use for 'zh/j' sound in foreign names instead of ز/ج\n"
            "- ڠ (ng): Use for 'ng' sound in foreign names instead of نج/نغ\n\n"
            "Examples:\n"
            "- 'Vancouver' → 'ڤانكوڤر' (using ڤ for V)\n"
            "- 'Pakistan' → 'پاكستان' (using پ for P)\n"
            "- 'Chicago' → 'چيكاغو' (using چ for Ch)\n"
            "- 'Google' → 'گوگل' (using گ for G)\n\n"
            "Important: Use these extended characters ONLY for entity names. Use standard Arabic for all other content.\n"
        )

    # Add user preference prompt if provided.
    # The text is inserted through its own placeholder rather than being
    # spliced into the template: splicing it in would make str.format()
    # interpret any '{' / '}' in the user's text as format fields and raise
    # (or silently substitute, e.g. for "{target_lang}").
    if preference_prompt and preference_prompt.strip():
        translate_prompt += (
            "\n# Additional Translation Preferences\n"
            "{preference_prompt}\n"
        )

    # Add the output format section. The prompt deliberately ends with an
    # opened ```json fence.
    translate_prompt += (
        "\n# Output Format\n"
        "Return a single JSON object with the following structure:\n"
        "```json\n"
        "{{\n"
        "    \"output_content\": \"The complete translated {content_format} content with all tags preserved\"\n"
        "}}\n"
        "```\n\n"

        "# Translation Quality Guidelines\n"
        "- Accuracy: Ensure factual information is preserved exactly\n"
        "- Completeness: Translate all content, don't summarize or omit information\n"
        "- Consistency: Use consistent terminology throughout the article\n"
        "- Fluency: Produce natural-sounding text in the target language\n"
        "- Formatting: Preserve all formatting elements, including lists, tables, and emphasis\n"

        "# Output json\n"
        "```json\n"
    )

    # Format the prompt with the provided values. str.format() does not
    # re-scan substituted values, so braces inside any argument are emitted
    # verbatim. An unused keyword (preference_prompt when the section was
    # not added) is simply ignored.
    return translate_prompt.format(
        article_title=article_title,
        article_summary=artice_summary,
        original_content=original_content,
        target_lang=target_lang,
        content_format=content_format,
        preference_prompt=preference_prompt,
    )