import wikipedia
from typing import Dict, Any, Optional, Tuple
import urllib.parse
import requests
import re


# Function to extract wiki id from a given url
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given url.

    Args:
        url (str): The url to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    
    # validate the url is from wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)

    # Extract the path from the parsed URL
    path = parsed_url.path

    # Split the path into parts
    path_parts = path.split('/')

    # The wiki id is the last part of the path
    wiki_id = path_parts[-1]

    # urlparse has already separated the query string and fragment from the
    # path, so wiki_id needs no further stripping of '?' or '#'
    
    # URL decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)
    
    # Replace underscores with spaces as Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')
    
    return wiki_id
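
# A minimal usage sketch for extract_wiki_id (illustrative; the URL below is
# only an example input, and parsing is purely local with no network access):
# >>> extract_wiki_id("https://en.wikipedia.org/wiki/Python_(programming_language)")
# 'Python (programming language)'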

# Function to get a dictionary of all details for a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets a dictionary of all details for a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary or None if there was an error.
    """
    try:
        # Check if input is a URL and extract wiki_id if it is
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url
            
        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If direct page lookup fails, try search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            
            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Get the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections
        }

        return details
    
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print(f"Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
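
# Example usage (illustrative; performs live network requests, and the page
# title below is only an example):
# details = get_wiki_details("Alan Turing")
# if details is not None:
#     print(details["summary"][:200])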

# Function to get XML data for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the latest revision content for a page from the MediaWiki API,
    returned as an XML string.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, error), where exactly one element is None.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        
        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }
        
        # Make the request and fail fast on HTTP errors
        response = requests.get(url, params=params)
        response.raise_for_status()
        xml_content = response.text
        
        return xml_content, None
    
    except requests.exceptions.RequestException as e:
        # Only requests is called here, so network/HTTP failures are the
        # errors worth naming explicitly
        return None, {"error": f"Request failed: {str(e)}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}
    
# Function to split content into sections using the == Section Name == heading pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the == Section Name == heading pattern.

    Args:
        content (str): The content to split.
        content_format (str): The format of the content ("Plain Text" or "XML").
            Currently unused.

    Returns:
        dict: A dictionary mapping each section name to its content.
    """

    sections_dict = {}

    # Split the content into sections using regex
    sections = re.split(r'={2,}([^=]+)={2,}', content)
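    # re.split keeps the captured heading names: index 0 is any lead text
    # before the first heading, odd indices are section names, and each
    # name's body follows immediately after it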

    # Iterate over the sections and add them to the dictionary
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict
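

# A minimal end-to-end sketch (illustrative; runs live requests against the
# Wikipedia API, and the article chosen here is only an example):
if __name__ == "__main__":
    details = get_wiki_details("https://en.wikipedia.org/wiki/Alan_Turing")
    if details:
        print(f"Title: {details['title']}")
        print(f"URL: {details['url']}")
        sections = split_content_into_sections(details["content"])
        print(f"First few sections: {list(sections.keys())[:5]}")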