import re
import urllib.parse
import xml.etree.ElementTree as ET
from typing import Any, Dict, Optional

import requests
import wikipedia
# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given Wikipedia URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.

    Raises:
        ValueError: If the URL is not a Wikipedia URL.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    # Parse the URL and take its path (e.g. "/wiki/Python_(programming_language)")
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path
    # The wiki id is the last segment of the path
    wiki_id = path.split('/')[-1]
    # Defensively strip any query parameters or fragment identifiers
    wiki_id = wiki_id.split('?')[0].split('#')[0]
    # Decode percent-encoded characters (e.g. "%28" -> "(")
    wiki_id = urllib.parse.unquote(wiki_id)
    return wiki_id
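# Example (illustrative, assuming a standard /wiki/ URL): the call below is
# expected to return "Python_(programming_language)".
#
#   extract_wiki_id("https://en.wikipedia.org/wiki/Python_(programming_language)")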
# Function to get a details dictionary for a given wiki id
def get_wiki_details(wiki_id: str) -> Optional[Dict[str, Any]]:
    """
    Gets a dictionary of page details for a given wiki id.

    Args:
        wiki_id (str): The wiki id to get the details from.

    Returns:
        Optional[dict]: The details dictionary, or None if the page or its
        XML could not be fetched.
    """
    # Get the page object. wikipedia.page() is what raises PageError and
    # DisambiguationError, so those are handled here rather than around the
    # plain HTTP request in get_wiki_xml().
    try:
        page = wikipedia.page(wiki_id)
    except wikipedia.exceptions.PageError:
        print(f"Error: page '{wiki_id}' does not exist")
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        return None
    # Fetch the raw XML for the page
    wiki_xml, error = get_wiki_xml(wiki_id)
    if error or not wiki_xml:
        print(f"Error fetching XML data: {error}")
        return None
    # Assemble the details dictionary
    details = {
        "title": page.title,
        "wiki_xml": wiki_xml,
        "pageid": page.pageid,
        "url": page.url,
        "content": page.content,
        "summary": page.summary,
        "images": page.images,
        "links": page.links,
        "categories": page.categories,
        "references": page.references,
        "sections": page.sections,
    }
    return details
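# Example (illustrative): get_wiki_details("Python_(programming_language)")
# should return a dict with keys such as "title", "summary", and "wiki_xml",
# or None if the page cannot be resolved.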
# Function to get the raw XML revision data for a given page title
def get_wiki_xml(page_title: str):
    """
    Fetches the latest revision content of a page from the MediaWiki API
    in XML format.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Request the content of the latest revision in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml",
        }
        # Make the request and fail fast on HTTP errors
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.text, None
    except requests.RequestException as e:
        return None, {"error": f"Request failed: {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {e}"}
# Function to split content into sections using the "== Section Name ==" heading pattern
def split_content_into_sections(content: str) -> Dict[str, str]:
    """
    Splits the content into sections keyed by their "== Section Name ==" headings.

    Args:
        content (str): The content to split.

    Returns:
        dict: A mapping from section name to section content.
    """
    sections_dict = {}
    # Split on headings of the form "== Name ==" (two or more '=' on each
    # side). Because re.split is given a capturing group, the heading names
    # are kept in the result, so the list alternates:
    # [preamble, name1, content1, name2, content2, ...]
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # Pair each heading name (odd indices) with the content that follows it
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict
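# A short usage sketch under stated assumptions: the URL below is
# illustrative, and network access plus the `wikipedia` and `requests`
# packages are required for this to run.
if __name__ == "__main__":
    sample_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    wiki_id = extract_wiki_id(sample_url)
    print(f"wiki id: {wiki_id}")
    details = get_wiki_details(wiki_id)
    if details:
        sections = split_content_into_sections(details["content"])
        print(f"title: {details['title']}")
        # Section names recovered from the page content, e.g. "History"
        print(f"sections: {list(sections.keys())}")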