extract wikipedia details
- app.py +92 -12
- utils/__init__.py +1 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/wikipedia_extractor.cpython-310.pyc +0 -0
- utils/wikipedia_extractor.py +138 -0
app.py
CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-import
-import
-import os
+from utils import extract_wiki_id, get_wiki_details, split_content_into_sections
+import json
 
 # Define language options for translation
 LANGUAGES = {
@@ -15,7 +14,6 @@ LANGUAGES = {
     "Russian": "ru",
     "Japanese": "ja",
     "Chinese": "zh",
-    "Arabic": "ar",
     "Hindi": "hi",
     "Korean": "ko"
 }
@@ -24,13 +22,54 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_language):
     """
     Function to extract content from Wikipedia URL (placeholder for now)
     """
-
-
+    wiki_id = extract_wiki_id(wiki_url)
+    if not wiki_id:
+        return "Invalid Wikipedia URL. Please check the URL and try again.", None, None, None, None, {}
+
+    # Get the details of the Wikipedia article
+    wiki_details = get_wiki_details(wiki_id)
+    content_sections = split_content_into_sections(wiki_details['wiki_xml'])
+
+    return (
+        "Extraction complete! Sections: " + str(len(content_sections)),
+        wiki_details['pageid'],
+        wiki_details['title'],
+        wiki_details['summary'],
+        wiki_details['wiki_xml'],
+        content_sections
+    )
+
+def update_ui_with_sections(sections_dict):
+    """
+    Creates a list of components to display in the sections area
+    """
+    components = []
+
+    if not sections_dict:
+        return [gr.update(visible=False) for _ in range(100)]  # Match the 100 pre-created section textboxes
+
+    # Create visible components for available sections
+    for section_name, section_content in sections_dict.items():
+        components.append(gr.update(
+            value=section_content,
+            label=f"Section: {section_name}",
+            visible=True
+        ))
+
+    # Hide any unused components
+    remaining = 100 - len(components)  # Assuming max 100 sections
+    for _ in range(remaining):
+        components.append(gr.update(visible=False))
+
+    return components
 
 # Create Gradio app
-with gr.Blocks(theme=gr.themes.Monochrome()) as app:
+with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
     gr.Markdown("# Wikipedia Translator")
 
+    # State variable to store sections
+    sections_state = gr.State({})
+
     with gr.Row():
         # Sidebar for configuration
        with gr.Column(scale=1):
@@ -81,19 +120,60 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as app:
 
         extract_button = gr.Button("Extract and Prepare for Translation", variant="primary")
 
-        output = gr.
+        output = gr.Markdown(label="Status")
 
         # Results area (will expand in the future)
-
-
+        article_pageid = gr.Textbox(
+            label="Article Page ID",
+            placeholder="Page ID will appear here after extraction",
+            interactive=False
+        )
+
+        article_title = gr.Textbox(
+            label="Article Title",
+            placeholder="Title will appear here after extraction",
+            interactive=False
+        )
+
+        article_summary = gr.Textbox(
+            label="Article Summary",
+            placeholder="Summary will appear here after extraction",
+            interactive=False
+        )
+
+        article_xml = gr.Textbox(
+            label="Article XML",
+            placeholder="XML will appear here after extraction",
+            interactive=False,
+            visible=False  # Hidden by default as it's usually large
+        )
+
+        # Pre-define section textboxes (limit to 100 for simplicity)
+        gr.Markdown("### Article Sections")
+        with gr.Column() as sections_container:
+            section_textboxes = [
+                gr.Textbox(visible=False, lines=4)
+                for _ in range(100)  # Support up to 100 sections
+            ]
 
     # Connect the extract button to the function
     extract_button.click(
         fn=extract_wikipedia_content,
         inputs=[wiki_url, api_key, model_id, base_url, target_language],
-        outputs=[
+        outputs=[
+            output,
+            article_pageid,
+            article_title,
+            article_summary,
+            article_xml,
+            sections_state,
+        ]
+    ).then(
+        fn=update_ui_with_sections,
+        inputs=[sections_state],
+        outputs=section_textboxes
     )
 
 # Launch the app
 if __name__ == "__main__":
-
+    demo.launch()
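The event wiring above uses the standard Gradio workaround for a variable number of outputs: pre-create a fixed pool of hidden components, stash the dynamic data in gr.State, and chain .click().then() to toggle visibility and values. A minimal self-contained sketch of the same pattern (the names fetch and render are made up for illustration, and the pool is shrunk from 100 boxes to 3):

import gradio as gr

MAX_BOXES = 3  # stand-in for the 100 pre-created section textboxes in app.py

def fetch(n):
    # hypothetical producer: returns a status string plus the dict kept in gr.State
    data = {f"part {i}": f"content {i}" for i in range(int(n))}
    return f"Got {len(data)} parts", data

def render(data):
    # one gr.update per pre-created box, visible only where content exists
    updates = [gr.update(value=v, label=k, visible=True) for k, v in data.items()]
    updates += [gr.update(visible=False)] * (MAX_BOXES - len(updates))
    return updates

with gr.Blocks() as demo:
    count = gr.Number(value=2, label="How many parts")
    go = gr.Button("Fetch")
    status = gr.Markdown()
    state = gr.State({})
    boxes = [gr.Textbox(visible=False) for _ in range(MAX_BOXES)]
    go.click(fetch, inputs=[count], outputs=[status, state]).then(
        render, inputs=[state], outputs=boxes
    )

if __name__ == "__main__":
    demo.launch()

Because the textboxes exist from the start, the .then() step only flips their visibility and contents, which is why update_ui_with_sections must always return exactly as many updates as there are boxes.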
utils/__init__.py
ADDED
@@ -0,0 +1 @@
+from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)
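This re-export lets app.py pull the helpers from the package root (from utils import ...). A quick offline check of the URL parser, assuming the wikipedia and requests dependencies are installed, since the package import pulls them in:

from utils import extract_wiki_id

# urlparse drops the query string and fragment before the path is split
print(extract_wiki_id("https://en.wikipedia.org/wiki/Gradio?foo=1#History"))
# -> Gradio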
utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (292 Bytes).
utils/__pycache__/wikipedia_extractor.cpython-310.pyc
ADDED
Binary file (1.43 kB).
utils/wikipedia_extractor.py
ADDED
@@ -0,0 +1,138 @@
+import wikipedia
+from typing import Dict, Any
+import urllib.parse
+import requests
+import xml.etree.ElementTree as ET
+import re
+
+
+# Function to extract wiki id from a given url
+def extract_wiki_id(url: str) -> str:
+    """
+    Extracts the wiki id from a given url.
+
+    Args:
+        url (str): The url to extract the wiki id from.
+
+    Returns:
+        str: The extracted wiki id.
+    """
+
+    # validate the url is from wikipedia
+    if "wikipedia.org" not in url:
+        raise ValueError("URL is not from Wikipedia")
+
+    # Parse the URL
+    parsed_url = urllib.parse.urlparse(url)
+
+    # Extract the path from the parsed URL
+    path = parsed_url.path
+
+    # Split the path into parts
+    path_parts = path.split('/')
+
+    # The wiki id is the last part of the path
+    wiki_id = path_parts[-1]
+
+    # Remove any query parameters
+    if '?' in wiki_id:
+        wiki_id = wiki_id.split('?')[0]
+
+    # Remove any fragment identifiers
+    if '#' in wiki_id:
+        wiki_id = wiki_id.split('#')[0]
+
+    return wiki_id
+
+# Function to get all details dictionary from a given wiki id
+def get_wiki_details(wiki_id: str) -> Dict[str, Any]:
+    """
+    Gets all details dictionary from a given wiki id.
+
+    Args:
+        wiki_id (str): The wiki id to get the details from.
+
+    Returns:
+        dict: The details dictionary.
+    """
+
+    # Get the page object
+    page = wikipedia.page(wiki_id)
+
+    wiki_xml, has_error = get_wiki_xml(wiki_id)
+    if has_error or not wiki_xml:
+        print(f"Error fetching XML data: {has_error}")
+        return None
+
+    # Get the details dictionary
+    details = {
+        "title": page.title,
+        "wiki_xml": wiki_xml,
+        "pageid": page.pageid,
+        "url": page.url,
+        "content": page.content,
+        "summary": page.summary,
+        "images": page.images,
+        "links": page.links,
+        "categories": page.categories,
+        "references": page.references,
+        "sections": page.sections
+    }
+
+    return details
+
+# function to get xml data from a given wiki id
+def get_wiki_xml(page_title):
+    try:
+
+        # MediaWiki API endpoint
+        url = "https://en.wikipedia.org/w/api.php"
+
+        # Parameters for XML format
+        params = {
+            "action": "query",
+            "titles": page_title,
+            "prop": "revisions",
+            "rvprop": "content",
+            "format": "xml"
+        }
+
+        # Make the request
+        response = requests.get(url, params=params)
+        xml_content = response.text
+
+        return xml_content, None
+
+    except wikipedia.exceptions.PageError:
+        return None, {"error": f"Page '{page_title}' does not exist"}
+    except wikipedia.exceptions.DisambiguationError as e:
+        return None, {"error": f"Disambiguation error: {e}"}
+    except Exception as e:
+        return None, {"error": f"An error occurred: {str(e)}"}
+
+# function to split content into sections using === [SECTION NAME] === regex pattern
+def split_content_into_sections(content: str) -> Dict[str, str]:
+
+    """
+    Splits the content into sections using the === [SECTION NAME] === regex pattern.
+
+    Args:
+        content (str): The content to split.
+
+    Returns:
+        dict: The sections dictionary.
+    """
+
+    sections_dict = {}
+
+    # Split the content into sections using regex
+    sections = re.split(r'={2,}([^=]+)={2,}', content)
+
+    # Iterate over the sections and add them to the dictionary
+    for i in range(1, len(sections), 2):
+        section_name = sections[i].strip()
+        section_content = sections[i + 1].strip()
+        sections_dict[section_name] = section_content
+
+    return sections_dict
+
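Taken together, the three helpers form a small pipeline: URL -> page title -> details dict -> sections dict. The section splitter is pure string processing, so it can be sanity-checked without network access; a short sketch using the same regex on a made-up wikitext snippet:

import re

sample = """Intro paragraph.
== History ==
Early days.
=== Details ===
Fine print.
== Usage ==
How it is used."""

parts = re.split(r'={2,}([^=]+)={2,}', sample)
sections = {parts[i].strip(): parts[i + 1].strip()
            for i in range(1, len(parts), 2)}
print(sections)
# {'History': 'Early days.', 'Details': 'Fine print.', 'Usage': 'How it is used.'}

Note that the preamble before the first heading (parts[0]) is discarded, since the loop starts at index 1, and that heading levels (== vs ===) are flattened into a single dictionary.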