# NOTE: "Spaces: Running" is leftover page chrome from the hosting UI this
# file was scraped from — it is not part of the module.
import wikipedia
from typing import List, Dict, Any, Optional, Tuple
import urllib.parse
import requests
import xml.etree.ElementTree as ET
import re
# Function to extract a wiki id (page title) from a given url
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id (page title) from a given Wikipedia URL.

    Args:
        url (str): The url to extract the wiki id from.

    Returns:
        str: The extracted wiki id, URL-decoded and with underscores
            replaced by spaces (the form the Wikipedia API expects).

    Raises:
        ValueError: If the URL is not from Wikipedia or contains no
            usable page title.
    """
    # validate the url is from wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    # Parse the URL; urlparse already separates the query ("?...") and
    # fragment ("#...") components from the path.
    parsed_url = urllib.parse.urlparse(url)
    # Split the path and keep only non-empty segments so a trailing
    # slash (".../Title/") does not yield an empty wiki id.
    path_parts = [part for part in parsed_url.path.split('/') if part]
    if not path_parts:
        raise ValueError("URL does not contain a page title")
    # The wiki id is the last non-empty part of the path
    wiki_id = path_parts[-1]
    # Defensive: strip any query/fragment text that survived parsing
    wiki_id = wiki_id.split('?')[0].split('#')[0]
    # URL decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)
    # Replace underscores with spaces as Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')
    return wiki_id
# Function to get all details dictionary from a given wiki id
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary or None if there was an error.
    """
    try:
        # URLs are reduced to a plain page title first; bare ids pass through.
        wiki_id = (
            extract_wiki_id(wiki_id_or_url)
            if "wikipedia.org" in wiki_id_or_url
            else wiki_id_or_url
        )
        # Direct page lookup; on a miss, fall back to a fuzzy search.
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            # First search hit is taken as the closest match.
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None
        # Fetch the raw XML for the resolved title; bail out on any error.
        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None
        # Assemble and return every page attribute we expose to callers.
        return {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print(f"Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
# Function to get xml data for a page title via the MediaWiki API
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the latest revision content of a Wikipedia page in XML format.

    Args:
        page_title (str): The Wikipedia page title to fetch.

    Returns:
        tuple: (xml_content, None) on success, or
            (None, {"error": <message>}) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }
        # Make the request. The timeout prevents hanging forever on a
        # stalled connection; raise_for_status surfaces HTTP errors
        # (4xx/5xx) instead of treating an error page as valid XML.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        return response.text, None
    # NOTE(review): requests never raises wikipedia exceptions; these two
    # handlers are kept only for compatibility with existing behavior.
    except wikipedia.exceptions.PageError:
        return None, {"error": f"Page '{page_title}' does not exist"}
    except wikipedia.exceptions.DisambiguationError as e:
        return None, {"error": f"Disambiguation error: {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}
# function to split content into sections using === [SECTION NAME] === regex pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the == [SECTION NAME] == heading pattern.

    Any lead text before the first heading is not included in the result.

    Args:
        content (str): The content to split.
        content_format (str): Unused; kept for backward compatibility with
            existing callers ("Plain Text" or "XML").

    Returns:
        dict: Mapping of section name to section content.
    """
    sections_dict: Dict[str, str] = {}
    # re.split with one capture group yields
    # [lead, name1, body1, name2, body2, ...] — always an odd length —
    # so pairing index i with i + 1 below can never go out of range.
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # Walk the (name, body) pairs, skipping the lead text at index 0.
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict