Spaces:

bakrianoo
/

wikipedia-translator

Running

App Files Files Community

bakrianoo committed on May 16

Commit

c25ce6b

1 Parent(s): e424603

setup llm parser

Browse files

Files changed (7) hide show

app.py +24 -1
utils/__init__.py +2 -0
utils/__pycache__/__init__.cpython-310.pyc +0 -0
utils/__pycache__/llm_parser.cpython-310.pyc +0 -0
utils/__pycache__/llm_prompts.cpython-310.pyc +0 -0
utils/llm_parser.py +15 -0
utils/llm_prompts.py +64 -0

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import gradio as gr
-from utils import extract_wiki_id, get_wiki_details, split_content_into_sections
 import json
 # Define language options for translation
@@ -39,6 +41,27 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
         content_sections
     )
 def update_ui_with_sections(sections_dict):
     """
     Creates a list of components to display in the sections area

 import gradio as gr
+from utils import (extract_wiki_id, get_wiki_details,
+                   init_llm_client, split_content_into_sections,
+                   get_translate_prompt)
 import json
 # Define language options for translation
         content_sections
     )
def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
    """
    Translate a piece of article content into the target language using the LLM.

    Args:
        content: Original content to translate; passed to the prompt builder
            as ``original_content``.
        article_title: Title of the source article, supplied to the model as context.
        artice_summary: Summary of the source article, supplied to the model as
            context. The spelling matches the keyword that
            ``get_translate_prompt`` expects.
        target_lang: Language to translate into.
        api_key: API key forwarded to ``init_llm_client``.
        model_id: Model identifier used for the completion request.
        base_url: Optional endpoint URL forwarded to ``init_llm_client``.

    Returns:
        The raw text of the first completion choice returned by the model.
        Any parsing of that text is left to the caller.
    """
    llm_client = init_llm_client(api_key, model_id, base_url)
    translation_prompt = get_translate_prompt(
        article_title=article_title,
        artice_summary=artice_summary,
        original_content=content,
        target_lang=target_lang,
    )

    # ``messages`` and ``max_tokens`` are Chat Completions parameters, so the
    # request has to go through ``chat.completions`` rather than ``responses``.
    response = llm_client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "user", "content": translation_prompt},
        ],
        max_tokens=2000,
        temperature=0.5,
    )

    # Hand the generated text back; previously the response was discarded.
    return response.choices[0].message.content
 def update_ui_with_sections(sections_dict):
     """
     Creates a list of components to display in the sections area

utils/__init__.py CHANGED Viewed

	@@ -1 +1,3 @@
1	from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)

 from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)
+from .llm_parser import init_llm_client
+from .llm_prompts import get_translate_prompt

utils/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/utils/__pycache__/__init__.cpython-310.pyc and b/utils/__pycache__/__init__.cpython-310.pyc differ

utils/__pycache__/llm_parser.cpython-310.pyc ADDED Viewed

Binary file (560 Bytes). View file

utils/__pycache__/llm_prompts.cpython-310.pyc ADDED Viewed

Binary file (2.24 kB). View file

utils/llm_parser.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import os
+from openai import OpenAI
def init_llm_client(api_key, model_id, base_url=None):
    """
    Initialize an OpenAI client for the given credentials and endpoint.

    Args:
        api_key: API key used to authenticate requests.
        model_id: Model identifier. It is exported to the environment only;
            the client itself is not bound to a model, so callers pass the
            model with each request.
        base_url: Optional endpoint URL. ``None`` or an empty string leaves
            the endpoint selection to the client.

    Returns:
        A configured ``OpenAI`` client instance.
    """
    # Treat an empty string (e.g. a blank form field) as "no custom endpoint"
    # instead of handing the client an unusable URL.
    endpoint = base_url or None

    # NOTE(review): these writes are kept for backward compatibility, but they
    # make the most recent caller's credentials visible process-wide; confirm
    # whether anything still reads them before removing.
    if endpoint:
        os.environ["OPENAI_API_BASE"] = endpoint
    os.environ["OPENAI_API_KEY"] = api_key
    os.environ["OPENAI_MODEL_ID"] = model_id

    # ``OpenAI()`` has no ``model_id`` parameter; passing it raises TypeError.
    return OpenAI(api_key=api_key, base_url=endpoint)

utils/llm_prompts.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import json
def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
    """
    Build the LLM prompt for translating Wikipedia XML content.

    Args:
        article_title: Original title of the article.
        artice_summary: Summary of the article, inserted under
            "# Article Summary".
        original_content: XML content to translate; inserted verbatim.
        target_lang: Language the content should be translated into.

    Returns:
        The fully rendered prompt string. It ends with an opening
        ```` ```json ```` fence.
    """
    # Example of the expected answer shape. It is rendered separately and
    # spliced in as a value, so its literal braces are never interpreted as
    # format placeholders (running ``str.format`` over them raised KeyError).
    output_example = json.dumps(
        {
            "translated_content": "The complete translated XML content with all tags preserved",
        },
        indent=4,
        ensure_ascii=False,
    )

    return (
        "# Task\n"
        "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
        f"Your task is to translate the provided XML content into {target_lang} while preserving the "
        "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"
        "# Article Original Title\n"
        f"{article_title}\n\n"
        "# Article Summary\n"
        f"{artice_summary}\n\n"
        "# Article Original Content (XML format)\n"
        f"{original_content}\n\n"
        "# Target Language\n"
        f"{target_lang}\n\n"
        "# Instructions\n"
        "1. Preserve all XML tags, attributes, and structure exactly as they appear\n"
        "2. Translate only the text content between XML tags\n"
        "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
        "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
        "5. Adapt cultural references or idioms to be understandable in the target language\n"
        f"6. Use terminology consistent with the {target_lang} Wikipedia for similar topics\n"
        "7. Maintain the same paragraph structure and information hierarchy\n\n"
        "# Output Format\n"
        "Return a single JSON object with the following structure:\n"
        "```json\n"
        f"{output_example}"
        "\n```\n\n"
        "# Translation Quality Guidelines\n"
        "- Accuracy: Ensure factual information is preserved exactly\n"
        "- Completeness: Translate all content, don't summarize or omit information\n"
        "- Consistency: Use consistent terminology throughout the article\n"
        "- Fluency: Produce natural-sounding text in the target language\n"
        "- Formatting: Preserve all formatting elements, including lists, tables, and emphasis\n"
        "# Output json\n"
        "```json\n"
    )