specify the translated format
Changed files:
- app.py (+20 -6)
- utils.py (+91 -0)
- utils/__pycache__/llm_parser.cpython-310.pyc (+0 -0)
- utils/__pycache__/wikipedia_extractor.cpython-310.pyc (+0 -0)
- utils/llm_prompts.py (+9 -8)
- utils/wikipedia_extractor.py (+2 -1)
app.py
CHANGED
@@ -21,7 +21,7 @@ LANGUAGES = {
     "Korean": "ko"
 }
 
-def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang):
+def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang, content_format):
     """
     Function to extract content from Wikipedia URL (placeholder for now)
     """
@@ -31,7 +31,10 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
 
     # Get the details of the Wikipedia article
    wiki_details = get_wiki_details(wiki_id)
-    content_sections = split_content_into_sections(wiki_details['content'])
+    if content_format == "XML":
+        content_sections = split_content_into_sections(wiki_details['wiki_xml'], content_format)
+    else:
+        content_sections = split_content_into_sections(wiki_details['content'], content_format)
 
     return (
         "Extraction complete! Sections: " + str(len(content_sections)),
@@ -42,7 +45,8 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
         content_sections
     )
 
-def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
+def translate_content(content, article_title, artice_summary, content_format,
+                      target_lang, api_key, model_id, base_url):
 
     llm_client = init_llm_client(api_key, base_url=base_url)
 
@@ -50,7 +54,8 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
         article_title=article_title,
         artice_summary=artice_summary,
         original_content=content,
-        target_lang=target_lang
+        target_lang=target_lang,
+        content_format=content_format
     )
 
     # Call the LLM to get the translation - updating params to match OpenAI's requirements
@@ -69,7 +74,7 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
 
     return "Error: Translation output not found in the response."
 
-def translate_section(section_content, article_title, article_summary, target_lang, api_key, model_id, base_url):
+def translate_section(section_content, article_title, article_summary, content_format, target_lang, api_key, model_id, base_url):
     """
     Translates a single section of the Wikipedia article
     """
@@ -80,6 +85,7 @@ def translate_section(section_content, article_title, article_summary, target_la
         content=section_content,
         article_title=article_title,
         artice_summary=article_summary,
+        content_format=content_format,
         target_lang=target_lang,
         api_key=api_key,
         model_id=model_id,
@@ -172,6 +178,13 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
                 value="Arabic",
                 label="Target Language",
             )
+
+            content_format = gr.Radio(
+                choices=["Text", "XML"],
+                value="XML",
+                label="Content Format",
+                info="Choose how to display article content"
+            )
 
             gr.Markdown("### About")
             gr.Markdown("""
@@ -240,6 +253,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
             section_textbox,
             article_title,
             aticle_summary,
+            content_format,
             target_language,
             api_key,
             model_id,
@@ -251,7 +265,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
     # Connect the extract button to the function
     extract_button.click(
         fn=extract_wikipedia_content,
-        inputs=[wiki_url, api_key, model_id, base_url, target_language],
+        inputs=[wiki_url, api_key, model_id, base_url, target_language, content_format],
         outputs=[
             output,
             article_pageid,
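Note on the wiring above: a gr.Radio value reaches a click handler as the selected choice string, so extract_wikipedia_content receives plain "XML" or "Text". A minimal, self-contained sketch of that pattern (the handler and labels here are illustrative stand-ins, not code from this Space):

import gradio as gr

def extract(url, content_format):
    # content_format arrives as the selected Radio choice, e.g. "XML" or "Text"
    return f"Would extract {url} as {content_format}"

with gr.Blocks() as demo:
    url = gr.Textbox(label="Wikipedia URL")
    fmt = gr.Radio(choices=["Text", "XML"], value="XML", label="Content Format")
    status = gr.Textbox(label="Status")
    gr.Button("Extract").click(fn=extract, inputs=[url, fmt], outputs=status)

demo.launch()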
utils.py
ADDED
@@ -0,0 +1,91 @@
+def extract_wiki_id(wiki_url):
+    """
+    Extracts the Wikipedia ID from the given URL.
+    """
+    import re
+
+    match = re.search(r'wiki/([^#?]+)', wiki_url)
+    return match.group(1) if match else None
+
+def get_wiki_details(wiki_id):
+    """
+    Placeholder function to get Wikipedia details using the wiki ID.
+    """
+    # This should interact with the Wikipedia API or your backend service
+    # For now, returning dummy data
+    return {
+        "pageid": 123456,
+        "title": "Artificial Intelligence",
+        "summary": "AI is the simulation of human intelligence in machines.",
+        "wiki_xml": "<xml>...</xml>",
+        "sections": {
+            "Introduction": "AI Introduction content...",
+            "History": "AI History content...",
+            "Applications": "AI Applications content...",
+        }
+    }
+
+def init_llm_client(api_key, base_url="https://api.openai.com/v1"):
+    """
+    Initializes the LLM client with the given API key and base URL.
+    """
+    import openai
+
+    openai.api_key = api_key
+    openai.api_base = base_url
+
+    return openai
+
+def split_content_into_sections(wiki_xml, content_format="Plain Text"):
+    """
+    Split the Wikipedia content into logical sections.
+
+    Args:
+        wiki_xml (str): The XML content of the Wikipedia article
+        content_format (str): The format to return the content in ("Plain Text" or "XML")
+
+    Returns:
+        dict: A dictionary mapping section names to their content
+    """
+    from xml.etree import ElementTree as ET
+
+    # Parse the XML content
+    root = ET.fromstring(wiki_xml)
+
+    sections = {}
+    for child in root:
+        # Assuming each child of the root is a section
+        section_name = child.tag
+        section_content = ET.tostring(child, encoding='unicode')
+
+        # Add to sections dictionary
+        if content_format == "XML":
+            sections[section_name] = section_content
+        else:  # Plain Text
+            try:
+                # Try to extract text content only
+                text_content = child.text if child.text else ""
+                for elem in child.iter():
+                    if elem.text and elem != child:
+                        text_content += " " + elem.text
+                    if elem.tail:
+                        text_content += " " + elem.tail
+                sections[section_name] = text_content.strip()
+            except Exception as e:
+                # Fallback in case of parsing issues
+                sections[section_name] = f"Error extracting text: {str(e)}"
+
+    return sections
+
+def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
+    """
+    Constructs the translation prompt for the LLM.
+    """
+    return f"""
+    You are a professional translator. Translate the following content to {target_lang}.
+
+    Title: {article_title}
+    Summary: {artice_summary}
+
+    Content: {original_content}
+    """
utils/__pycache__/llm_parser.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/llm_parser.cpython-310.pyc and b/utils/__pycache__/llm_parser.cpython-310.pyc differ
utils/__pycache__/wikipedia_extractor.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/wikipedia_extractor.cpython-310.pyc and b/utils/__pycache__/wikipedia_extractor.cpython-310.pyc differ
utils/llm_prompts.py
CHANGED
@@ -1,15 +1,15 @@
 import json
 
-def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
+def get_translate_prompt(article_title, artice_summary, content_format, original_content, target_lang):
     """
-    Function to get the translation prompt for the LLM to translate Wikipedia
+    Function to get the translation prompt for the LLM to translate Wikipedia {content_format} content
     with high quality and fidelity to the original.
     """
     # Define the prompt template
     translate_prompt = (
         "# Task\n"
         "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
-        "Your task is to translate the provided XML content into {target_lang} while preserving the "
+        "Your task is to translate the provided {content_format} content into {target_lang} while preserving the "
         "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"
 
         "# Article Original Title\n"
@@ -18,15 +18,15 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
         "# Article Summary\n"
         "{article_summary}\n\n"
 
-        "# Article Original Content (XML format)\n"
+        "# Article Original Content ({content_format} format)\n"
         "{original_content}\n\n"
 
         "# Target Language\n"
         "{target_lang}\n\n"
 
         "# Instructions\n"
-        "1. Preserve all XML tags, attributes, and structure exactly as they appear\n"
-        "2. Translate only the text content between XML tags\n"
+        "1. If provided input is XML code, Preserve all XML tags, attributes, and structure exactly as they appear\n"
+        "2. If provided input is XML code, Translate only the text content between XML tags\n"
         "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
         "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
         "5. Adapt cultural references or idioms to be understandable in the target language\n"
@@ -37,7 +37,7 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
         "Return a single JSON object with the following structure:\n"
         "```json\n"
         "{{\n"
-        "  \"output_content\": \"The complete translated XML content with all tags preserved\"\n"
+        "  \"output_content\": \"The complete translated {content_format} content with all tags preserved\"\n"
         "}}\n"
         "```\n\n"
 
@@ -58,7 +58,8 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
         article_title=article_title,
         article_summary=artice_summary,
         original_content=original_content,
-        target_lang=target_lang
+        target_lang=target_lang,
+        content_format=content_format
     )
 
     return formatted_prompt
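A subtlety this template depends on: the prompt is later passed through str.format(), so single braces like {content_format} are substitution fields while doubled braces survive as literal braces in the JSON example. A minimal illustration:

template = (
    "# Article Original Content ({content_format} format)\n"
    '{{"output_content": "..."}}'
)
print(template.format(content_format="XML"))
# # Article Original Content (XML format)
# {"output_content": "..."}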
utils/wikipedia_extractor.py
CHANGED
@@ -111,13 +111,14 @@ def get_wiki_xml(page_title):
         return None, {"error": f"An error occurred: {str(e)}"}
 
 # function to split content into sections using === [SECTION NAME] === regex pattern
-def split_content_into_sections(content: str) -> List[str]:
+def split_content_into_sections(content: str, content_format: str=None) -> List[str]:
 
     """
     Splits the content into sections using the === [SECTION NAME] === regex pattern.
 
     Args:
         content (str): The content to split.
+        content_format (str): The format to return the content in ("Plain Text" or "XML").
 
     Returns:
         dict: The sections dictionary.
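The function body falls outside this hunk, so as a sketch only (the sample text and regex are assumptions, not this repo's actual implementation), splitting on === Section Name === headings can be done with re.split and a capture group:

import re

text = "Intro text.\n=== History ===\nEarly work.\n=== Applications ===\nUses today."

# With a capture group, re.split alternates: [preamble, name1, body1, name2, body2, ...]
parts = re.split(r"^===\s*(.+?)\s*===\s*$", text, flags=re.MULTILINE)
sections = {"Introduction": parts[0].strip()}
sections.update((name, body.strip()) for name, body in zip(parts[1::2], parts[2::2]))
print(sections)  # {'Introduction': 'Intro text.', 'History': 'Early work.', 'Applications': 'Uses today.'}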