# wikipedia-translator/utils/wikipedia_extractor.py
import re
import urllib.parse
from typing import Any, Dict, Optional, Tuple

import requests
import wikipedia

# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")

    # Parse the URL and take its path
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path

    # The wiki id is the last part of the path
    path_parts = path.split('/')
    wiki_id = path_parts[-1]

    # Strip any query parameters and fragment identifiers
    if '?' in wiki_id:
        wiki_id = wiki_id.split('?')[0]
    if '#' in wiki_id:
        wiki_id = wiki_id.split('#')[0]

    # URL-decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)

    # Replace underscores with spaces, as the Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')

    return wiki_id
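
# Illustrative usage sketch (not part of the original module): the URL below
# is an assumed example. The demo is invoked from the __main__ block at the
# end of the file.
def _demo_extract_wiki_id() -> None:
    demo_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    # Prints "Python (programming language)": the last path segment,
    # URL-decoded, with underscores replaced by spaces
    print(extract_wiki_id(demo_url))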

# Function to get the full details dictionary for a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets the full details dictionary for a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details for.

    Returns:
        dict: The details dictionary, or None if there was an error.
    """
    try:
        # If the input is a URL, extract the wiki id from it
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If the direct page lookup fails, fall back to search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None

            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Build the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
        return details

    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
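
# Illustrative usage sketch (not part of the original module): requires
# network access, and the page title is an assumed example. Invoked from the
# __main__ block at the end of the file, after get_wiki_xml is defined.
def _demo_get_wiki_details() -> None:
    details = get_wiki_details("Alan Turing")
    if details is not None:
        print(details["title"], "->", details["url"])
        print(details["summary"][:200])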

# Function to get the raw XML data for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the raw page content from the MediaWiki API in XML format.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"

        # Parameters requesting the latest revision content in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }

        # Make the request and fail fast on HTTP errors
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        xml_content = response.text
        return xml_content, None
    except requests.exceptions.RequestException as e:
        # requests.get raises RequestException subclasses, not the wikipedia
        # library's exceptions, so that is what we catch here
        return None, {"error": f"Request failed for '{page_title}': {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}

# Function to split content into sections on == [SECTION NAME] == headings
def split_content_into_sections(content: str, content_format: Optional[str] = None) -> Dict[str, str]:
    """
    Splits the content into sections on == [SECTION NAME] == headings
    (two or more '=' signs on each side).

    Args:
        content (str): The content to split.
        content_format (str): The format of the content ("Plain Text" or "XML").
            Currently unused.

    Returns:
        dict: A dictionary mapping section names to section content.
    """
    sections_dict = {}

    # Split the content on headings; the capture group keeps the section
    # names in the resulting list, alternating with the section bodies
    sections = re.split(r'={2,}([^=]+)={2,}', content)

    # The list looks like [preamble, name1, body1, name2, body2, ...],
    # so iterate over the (name, body) pairs starting at index 1
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict
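
# Illustrative usage sketch (not part of the original module) with inline
# sample text; the section names are made up for the example.
def _demo_split_content_into_sections() -> None:
    sample = (
        "Intro text that precedes any heading.\n"
        "== History ==\n"
        "Some history content.\n"
        "== Usage ==\n"
        "Some usage content.\n"
    )
    # Expected: {'History': 'Some history content.', 'Usage': 'Some usage content.'}
    print(split_content_into_sections(sample))


if __name__ == "__main__":
    _demo_extract_wiki_id()
    _demo_get_wiki_details()
    _demo_get_wiki_xml()
    _demo_split_content_into_sections()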