import re
import urllib.parse
import xml.etree.ElementTree as ET
from typing import Any, Dict, Optional

import requests
import wikipedia
# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given Wikipedia URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.

    Raises:
        ValueError: If the URL is not a Wikipedia URL.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    # Parse the URL and take its path (e.g. "/wiki/Python_(programming_language)")
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path
    # The wiki id is the last segment of the path
    wiki_id = path.split('/')[-1]
    # Defensively strip any query parameters or fragment identifiers
    wiki_id = wiki_id.split('?')[0].split('#')[0]
    # Decode percent-encoded characters (e.g. "%28" -> "(")
    wiki_id = urllib.parse.unquote(wiki_id)
    return wiki_id
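# Example (illustrative, assuming a standard /wiki/ URL): the call below is
# expected to return "Python_(programming_language)".
#
#   extract_wiki_id("https://en.wikipedia.org/wiki/Python_(programming_language)")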
# Function to get a details dictionary for a given wiki id
def get_wiki_details(wiki_id: str) -> Optional[Dict[str, Any]]:
    """
    Gets a dictionary of page details for a given wiki id.

    Args:
        wiki_id (str): The wiki id to get the details from.

    Returns:
        Optional[dict]: The details dictionary, or None if the page or its
        XML could not be fetched.
    """
    # Get the page object. wikipedia.page() is what raises PageError and
    # DisambiguationError, so those are handled here rather than around the
    # plain HTTP request in get_wiki_xml().
    try:
        page = wikipedia.page(wiki_id)
    except wikipedia.exceptions.PageError:
        print(f"Error: page '{wiki_id}' does not exist")
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        return None
    # Fetch the raw XML for the page
    wiki_xml, error = get_wiki_xml(wiki_id)
    if error or not wiki_xml:
        print(f"Error fetching XML data: {error}")
        return None
    # Assemble the details dictionary
    details = {
        "title": page.title,
        "wiki_xml": wiki_xml,
        "pageid": page.pageid,
        "url": page.url,
        "content": page.content,
        "summary": page.summary,
        "images": page.images,
        "links": page.links,
        "categories": page.categories,
        "references": page.references,
        "sections": page.sections,
    }
    return details
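# Example (illustrative): get_wiki_details("Python_(programming_language)")
# should return a dict with keys such as "title", "summary", and "wiki_xml",
# or None if the page cannot be resolved.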
# Function to get the raw XML revision data for a given page title
def get_wiki_xml(page_title: str):
    """
    Fetches the latest revision content of a page from the MediaWiki API
    in XML format.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Request the content of the latest revision in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml",
        }
        # Make the request and fail fast on HTTP errors
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.text, None
    except requests.RequestException as e:
        return None, {"error": f"Request failed: {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {e}"}
# Function to split content into sections using the "== Section Name ==" heading pattern
def split_content_into_sections(content: str) -> Dict[str, str]:
    """
    Splits the content into sections keyed by their "== Section Name ==" headings.

    Args:
        content (str): The content to split.

    Returns:
        dict: A mapping from section name to section content.
    """
    sections_dict = {}
    # Split on headings of the form "== Name ==" (two or more '=' on each
    # side). Because re.split is given a capturing group, the heading names
    # are kept in the result, so the list alternates:
    # [preamble, name1, content1, name2, content2, ...]
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # Pair each heading name (odd indices) with the content that follows it
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict
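# A short usage sketch under stated assumptions: the URL below is
# illustrative, and network access plus the `wikipedia` and `requests`
# packages are required for this to run.
if __name__ == "__main__":
    sample_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    wiki_id = extract_wiki_id(sample_url)
    print(f"wiki id: {wiki_id}")
    details = get_wiki_details(wiki_id)
    if details:
        sections = split_content_into_sections(details["content"])
        print(f"title: {details['title']}")
        # Section names recovered from the page content, e.g. "History"
        print(f"sections: {list(sections.keys())}")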