File size: 3,680 Bytes
e424603
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
import urllib.parse
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional

import requests
import wikipedia


# Function to extract wiki id from a given url
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given url.

    Args:
        url (str): The url to extract the wiki id from.

    Returns:
        str: The extracted wiki id (the last segment of the URL path).

    Raises:
        ValueError: If the url does not point at wikipedia.org.
    """

    # Only Wikipedia URLs are supported.
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")

    # The wiki id is the final segment of the URL path; urlparse already
    # separates the path from the query string and fragment.
    last_segment = urllib.parse.urlparse(url).path.split('/')[-1]

    # Defensively drop any query string or fragment marker that might
    # still be embedded in the segment. str.split is a no-op when the
    # separator is absent, so this matches the original conditional logic.
    for separator in ('?', '#'):
        last_segment = last_segment.split(separator)[0]

    return last_segment

# Function to get all details dictionary from a given wiki id
def get_wiki_details(wiki_id: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id.

    Args:
        wiki_id (str): The wiki id (page title) to get the details from.

    Returns:
        dict: The details dictionary, or None if the page or its raw
            XML could not be fetched.
    """

    # Fetch the page object. Handle the wikipedia library's lookup
    # errors the same way the XML-fetch failure below is handled
    # (report and return None) instead of letting them escape.
    try:
        page = wikipedia.page(wiki_id)
    except wikipedia.exceptions.PageError:
        print(f"Error fetching page: '{wiki_id}' does not exist")
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Error fetching page: disambiguation error: {e}")
        return None

    wiki_xml, has_error = get_wiki_xml(wiki_id)
    if has_error or not wiki_xml:
        print(f"Error fetching XML data: {has_error}")
        return None

    # Collect the commonly used page attributes into one dictionary.
    details = {
        "title": page.title,
        "wiki_xml": wiki_xml,
        "pageid": page.pageid,
        "url": page.url,
        "content": page.content,
        "summary": page.summary,
        "images": page.images,
        "links": page.links,
        "categories": page.categories,
        "references": page.references,
        "sections": page.sections
    }

    return details

# function to get XML data for a given wiki id from the MediaWiki API
def get_wiki_xml(page_title: str):
    """
    Fetches a page's latest revision content from the MediaWiki API in XML format.

    Args:
        page_title (str): The Wikipedia page title to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, {"error": message})
            on failure — the same error-tuple convention callers already rely on.
    """
    try:

        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"

        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }

        # Make the request. The timeout prevents hanging forever on a
        # stalled connection; raise_for_status turns HTTP error statuses
        # (404/500/...) into exceptions instead of silently returning the
        # error body as if it were valid XML.
        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()

        return response.text, None

    except Exception as e:
        # Covers network failures, timeouts, and HTTP error statuses.
        # NOTE(review): the previous wikipedia.exceptions.* handlers were
        # dead code — this function never calls the wikipedia library.
        return None, {"error": f"An error occurred: {str(e)}"}
    
# function to split content into sections using === [SECTION NAME] === regex pattern
def split_content_into_sections(content: str) -> Dict[str, str]:
    """
    Splits the content into sections using the === [SECTION NAME] === regex pattern.

    Headings delimited by two or more '=' signs on each side are treated as
    section names. Note: any text before the first heading (e.g. the lead
    paragraph) is not included in the result.

    Args:
        content (str): The content to split.

    Returns:
        dict: Mapping of stripped section name to stripped section body.
            A later section with a duplicate name overwrites an earlier one.
    """

    sections_dict: Dict[str, str] = {}

    # re.split with one capturing group yields:
    # [text_before, heading_1, body_1, heading_2, body_2, ..., body_last]
    parts = re.split(r'={2,}([^=]+)={2,}', content)

    # Headings sit at the odd indices; a body always follows, because
    # re.split appends a (possibly empty) trailing segment after the
    # last match, so parts[i + 1] cannot go out of range.
    for i in range(1, len(parts), 2):
        section_name = parts[i].strip()
        section_content = parts[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict