import wikipedia
from typing import Dict, Any, Optional, Tuple
import urllib.parse
import requests
import re


# Function to extract wiki id from a given url
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given url.

    Args:
        url (str): The url to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    
    # validate the url is from wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Extract the path from the parsed URL
    path = parsed_url.path

    # Split the path into parts
    path_parts = path.split('/')

    # The wiki id is the last part of the path
    wiki_id = path_parts[-1]

    # urlparse has already separated the query string and fragment from the
    # path, so wiki_id needs no further stripping of '?' or '#'
    
    # URL decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)
    
    # Replace underscores with spaces as Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')
    
    return wiki_id
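
# A minimal usage sketch for extract_wiki_id (illustrative; the URL below is
# only an example input, and parsing is purely local with no network access):
# >>> extract_wiki_id("https://en.wikipedia.org/wiki/Python_(programming_language)")
# 'Python (programming language)'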

# Function to get a dictionary of all details for a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets a dictionary of all details for a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary or None if there was an error.
    """
    try:
        # Check if input is a URL and extract wiki_id if it is
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url
            
        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If direct page lookup fails, try search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            
            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Get the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections
        }

        return details
    
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print(f"Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
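
# Example usage (illustrative; performs live network requests, and the page
# title below is only an example):
# details = get_wiki_details("Alan Turing")
# if details is not None:
#     print(details["summary"][:200])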

# Function to get XML data for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the latest revision content for a page from the MediaWiki API,
    returned as an XML string.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, error), where exactly one element is None.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        
        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }
        
        # Make the request and fail fast on HTTP errors
        response = requests.get(url, params=params)
        response.raise_for_status()
        xml_content = response.text
        
        return xml_content, None
    
    except requests.exceptions.RequestException as e:
        # Only requests is called here, so network/HTTP failures are the
        # errors worth naming explicitly
        return None, {"error": f"Request failed: {str(e)}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}
    
# Function to split content into sections using the == Section Name == heading pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the == Section Name == heading pattern.

    Args:
        content (str): The content to split.
        content_format (str): The format of the content ("Plain Text" or "XML").
            Currently unused.

    Returns:
        dict: A dictionary mapping each section name to its content.
    """

    sections_dict = {}

    # Split the content into sections using regex
    sections = re.split(r'={2,}([^=]+)={2,}', content)
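    # re.split keeps the captured heading names: index 0 is any lead text
    # before the first heading, odd indices are section names, and each
    # name's body follows immediately after it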

    # Iterate over the sections and add them to the dictionary
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict
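

# A minimal end-to-end sketch (illustrative; runs live requests against the
# Wikipedia API, and the article chosen here is only an example):
if __name__ == "__main__":
    details = get_wiki_details("https://en.wikipedia.org/wiki/Alan_Turing")
    if details:
        print(f"Title: {details['title']}")
        print(f"URL: {details['url']}")
        sections = split_content_into_sections(details["content"])
        print(f"First few sections: {list(sections.keys())[:5]}")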