import re
import urllib.parse
from typing import Any, Dict, Optional, Tuple

import requests
import wikipedia

# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")

    # Parse the URL and take the last segment of the path,
    # e.g. "/wiki/Alan_Turing" -> "Alan_Turing"
    parsed_url = urllib.parse.urlparse(url)
    wiki_id = parsed_url.path.split('/')[-1]

    # Strip any query parameters or fragment identifiers
    wiki_id = wiki_id.split('?')[0].split('#')[0]

    # URL-decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)

    # Replace underscores with spaces, as the Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')
    return wiki_id
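
# A quick illustration of extract_wiki_id (the URL below is an example, not
# one referenced elsewhere in this module):
#
#   >>> extract_wiki_id("https://en.wikipedia.org/wiki/Python_(programming_language)")
#   'Python (programming language)'
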
# Function to get a details dictionary from a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets a details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary, or None if there was an error.
    """
    try:
        # If the input is a URL, extract the wiki id from it
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If the direct page lookup fails, fall back to a search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        # Fetch the raw page XML via the MediaWiki API
        wiki_xml, error = get_wiki_xml(page.title)
        if error or not wiki_xml:
            print(f"Error fetching XML data: {error}")
            return None

        # Build the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
        return details
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
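
# Example usage of get_wiki_details (a sketch; the page name is illustrative
# and the call performs live network requests):
#
#   details = get_wiki_details("https://en.wikipedia.org/wiki/Alan_Turing")
#   if details:
#       print(details["title"], details["url"])
#       print(details["summary"][:200])
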
# Function to get the raw XML data from the MediaWiki API for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the page's latest revision content from the MediaWiki API in XML format.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Parameters requesting the latest revision content in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml",
        }
        # Make the request
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.text, None
    except requests.exceptions.RequestException as e:
        return None, {"error": f"Request failed: {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}
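
# Example usage of get_wiki_xml (illustrative title; hits the live API):
#
#   xml_content, error = get_wiki_xml("Alan Turing")
#   if error:
#       print(error["error"])
#   else:
#       print(xml_content[:500])
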
# Function to split content into sections using the == [SECTION NAME] == heading pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the == [SECTION NAME] == heading pattern.

    Args:
        content (str): The content to split.
        content_format (str): The format to return the content in
            ("Plain Text" or "XML"). Currently unused.

    Returns:
        dict: A dictionary mapping section names to section content.
    """
    sections_dict = {}
    # Split on headings such as "== History ==" or "=== Early life ===";
    # the capture group keeps each heading name in the result list
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # re.split returns [lead, name1, body1, name2, body2, ...], so pair up
    # names and bodies; any lead text before the first heading is skipped
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict
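
# A minimal, runnable demo of the module's pipeline (a sketch under the
# assumption that the "Alan Turing" page exists; the sample text passed to
# split_content_into_sections is made up for illustration):
if __name__ == "__main__":
    # Pure string splitting, no network needed
    sample = "Intro text.\n== History ==\nSome history.\n== Usage ==\nSome usage."
    print(split_content_into_sections(sample))
    # {'History': 'Some history.', 'Usage': 'Some usage.'}

    # Full fetch; performs live requests against Wikipedia
    details = get_wiki_details("https://en.wikipedia.org/wiki/Alan_Turing")
    if details:
        print(details["title"])
        print(list(split_content_into_sections(details["content"]).keys()))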