# wikipedia-translator/utils/wikipedia_extractor.py
import wikipedia
from typing import List, Dict, Any, Optional, Tuple
import urllib.parse
import requests
import xml.etree.ElementTree as ET
import re

# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")

    # Parse the URL; urlparse already separates the query string and the
    # fragment, so the path holds only the article location
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path

    # The wiki id is the last segment of the path
    wiki_id = path.split('/')[-1]

    # URL-decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)

    # Replace underscores with spaces, as the Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')

    return wiki_id
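
# Illustrative usage (not in the original module): the query string and
# fragment are dropped by urlparse, so only the article title survives.
#   extract_wiki_id("https://en.wikipedia.org/wiki/Alan_Turing#Early_life")
#   -> "Alan Turing"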

# Function to get the full details dictionary for a given wiki id
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets the full details dictionary for a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary, or None if there was an error.
    """
    try:
        # Check if the input is a URL and extract the wiki id if it is
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If the direct page lookup fails, fall back to a search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None

            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Build the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections
        }
        return details

    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
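
# Illustrative usage (not in the original module): a bare title and a full
# article URL are both accepted.
#   details = get_wiki_details("https://en.wikipedia.org/wiki/Alan_Turing")
#   if details:
#       print(details["title"], details["pageid"], len(details["sections"]))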

# Function to get the XML data for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the raw XML revision data for a page from the MediaWiki API.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"

        # Parameters requesting the latest revision content in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }

        # Make the request
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.text, None
    except requests.exceptions.RequestException as e:
        # requests.get raises RequestException subclasses, not wikipedia
        # exceptions, so this is the error path that can actually trigger here
        return None, {"error": f"Request failed for '{page_title}': {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}

# Function to split content into sections using the == Section Name == heading pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the == Section Name == heading
    pattern (two or more equals signs on each side).

    Args:
        content (str): The content to split.
        content_format (str): The format to return the content in
            ("Plain Text" or "XML"). Currently unused.

    Returns:
        dict: A dictionary mapping section names to section content.
    """
    sections_dict = {}

    # Split the content on heading markers; the capture group keeps the
    # section names in the result list at odd indices
    sections = re.split(r'={2,}([^=]+)={2,}', content)

    # Iterate over (name, content) pairs; note that any text before the
    # first heading (sections[0]) is not included
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict
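
# Minimal demo sketch (not in the original module); the sample text below is
# hand-written wikitext-style content, not fetched from Wikipedia.
if __name__ == "__main__":
    sample = (
        "Intro paragraph (dropped by the splitter).\n"
        "== History ==\n"
        "Some history.\n"
        "== Usage ==\n"
        "Some usage notes.\n"
    )
    for name, body in split_content_into_sections(sample).items():
        print(f"{name!r}: {body!r}")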