import wikipedia
from typing import List, Dict, Any
import urllib.parse
import requests
import xml.etree.ElementTree as ET
import re
# Function to extract wiki id from a given url
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given url.
    Args:
        url (str): The url to extract the wiki id from.
    Returns:
        str: The extracted wiki id.
    """
    # Validate that the url is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)
    # Extract the path from the parsed URL
    path = parsed_url.path
    # Split the path into parts
    path_parts = path.split('/')
    # The wiki id is the last part of the path
    wiki_id = path_parts[-1]
    # Remove any query parameters
    if '?' in wiki_id:
        wiki_id = wiki_id.split('?')[0]
    # Remove any fragment identifiers
    if '#' in wiki_id:
        wiki_id = wiki_id.split('#')[0]
    return wiki_id
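# Illustrative usage (not in the original source; the URL below is an assumed example):
# extract_wiki_id("https://en.wikipedia.org/wiki/Alan_Turing") -> "Alan_Turing"
# extract_wiki_id("https://en.wikipedia.org/wiki/Alan_Turing#Education") -> "Alan_Turing"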
# Function to get all details dictionary from a given wiki id
def get_wiki_details(wiki_id: str) -> Dict[str, Any]:
    """
    Gets all details dictionary from a given wiki id.
    Args:
        wiki_id (str): The wiki id to get the details from.
    Returns:
        dict: The details dictionary, or None if the XML data could not be fetched.
    """
    # Get the page object
    page = wikipedia.page(wiki_id)
    # Fetch the raw XML for the page
    wiki_xml, has_error = get_wiki_xml(wiki_id)
    if has_error or not wiki_xml:
        print(f"Error fetching XML data: {has_error}")
        return None
    # Build the details dictionary
    details = {
        "title": page.title,
        "wiki_xml": wiki_xml,
        "pageid": page.pageid,
        "url": page.url,
        "content": page.content,
        "summary": page.summary,
        "images": page.images,
        "links": page.links,
        "categories": page.categories,
        "references": page.references,
        "sections": page.sections
    }
    return details
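# Illustrative usage (not in the original source; assumes network access and that
# "Alan_Turing" resolves to a page):
# details = get_wiki_details("Alan_Turing")
# On success, details contains keys such as "title", "content", "summary", and "wiki_xml";
# on an XML fetch failure, the function returns None.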
# Function to get raw XML data for a given wiki id from the MediaWiki API
def get_wiki_xml(page_title: str):
    """
    Fetches the latest revision content of a page as XML from the MediaWiki API.
    Args:
        page_title (str): The page title (wiki id) to fetch.
    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }
        # Make the request and raise on HTTP errors
        response = requests.get(url, params=params)
        response.raise_for_status()
        xml_content = response.text
        return xml_content, None
    except requests.exceptions.RequestException as e:
        return None, {"error": f"Request failed: {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}
# Function to split content into sections using the === [SECTION NAME] === heading pattern
def split_content_into_sections(content: str) -> Dict[str, str]:
    """
    Splits the content into sections using the === [SECTION NAME] === regex pattern.
    Args:
        content (str): The content to split.
    Returns:
        dict: A dictionary mapping section names to section content.
    """
    sections_dict = {}
    # Split the content on headings; the capture group keeps the section names
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # sections[0] is the lead text before the first heading and is skipped;
    # the remaining elements alternate between section name and section content
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict
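# Minimal end-to-end sketch (not part of the original file): exercises the helpers
# above on an assumed example URL. Requires network access and the `wikipedia`
# and `requests` packages; the page title used here is a hypothetical example.
if __name__ == "__main__":
    example_url = "https://en.wikipedia.org/wiki/Alan_Turing"
    # Extract the wiki id, fetch the page details, then split the plain-text content
    wiki_id = extract_wiki_id(example_url)
    details = get_wiki_details(wiki_id)
    if details is not None:
        sections = split_content_into_sections(details["content"])
        print(f"Fetched '{details['title']}' with {len(sections)} sections")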