# wikipedia-translator/utils/wikipedia_extractor.py
import re
import urllib.parse
from typing import Any, Dict, Optional, Tuple

import requests
import wikipedia

# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")

    # Parse the URL and take its path
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path

    # The wiki id is the last part of the path
    path_parts = path.split('/')
    wiki_id = path_parts[-1]

    # Strip any query parameters and fragment identifiers
    if '?' in wiki_id:
        wiki_id = wiki_id.split('?')[0]
    if '#' in wiki_id:
        wiki_id = wiki_id.split('#')[0]

    # URL-decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)

    # Replace underscores with spaces, as the Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')

    return wiki_id
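
# Illustrative usage sketch (not part of the original module): the URL below
# is an assumed example. The demo is invoked from the __main__ block at the
# end of the file.
def _demo_extract_wiki_id() -> None:
    demo_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    # Prints "Python (programming language)": the last path segment,
    # URL-decoded, with underscores replaced by spaces
    print(extract_wiki_id(demo_url))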

# Function to get the full details dictionary for a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets the full details dictionary for a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details for.

    Returns:
        dict: The details dictionary, or None if there was an error.
    """
    try:
        # If the input is a URL, extract the wiki id from it
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If the direct page lookup fails, fall back to search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None

            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Build the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
        return details

    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
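
# Illustrative usage sketch (not part of the original module): requires
# network access, and the page title is an assumed example. Invoked from the
# __main__ block at the end of the file, after get_wiki_xml is defined.
def _demo_get_wiki_details() -> None:
    details = get_wiki_details("Alan Turing")
    if details is not None:
        print(details["title"], "->", details["url"])
        print(details["summary"][:200])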

# Function to get the raw XML data for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the raw page content from the MediaWiki API in XML format.

    Args:
        page_title (str): The title of the page to fetch.

    Returns:
        tuple: (xml_content, None) on success, or (None, error_dict) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"

        # Parameters requesting the latest revision content in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml"
        }

        # Make the request and fail fast on HTTP errors
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        xml_content = response.text
        return xml_content, None
    except requests.exceptions.RequestException as e:
        # requests.get raises RequestException subclasses, not the wikipedia
        # library's exceptions, so that is what we catch here
        return None, {"error": f"Request failed for '{page_title}': {e}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}

# Function to split content into sections on == [SECTION NAME] == headings
def split_content_into_sections(content: str, content_format: Optional[str] = None) -> Dict[str, str]:
    """
    Splits the content into sections on == [SECTION NAME] == headings
    (two or more '=' signs on each side).

    Args:
        content (str): The content to split.
        content_format (str): The format of the content ("Plain Text" or "XML").
            Currently unused.

    Returns:
        dict: A dictionary mapping section names to section content.
    """
    sections_dict = {}

    # Split the content on headings; the capture group keeps the section
    # names in the resulting list, alternating with the section bodies
    sections = re.split(r'={2,}([^=]+)={2,}', content)

    # The list looks like [preamble, name1, body1, name2, body2, ...],
    # so iterate over the (name, body) pairs starting at index 1
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict
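
# Illustrative usage sketch (not part of the original module) with inline
# sample text; the section names are made up for the example.
def _demo_split_content_into_sections() -> None:
    sample = (
        "Intro text that precedes any heading.\n"
        "== History ==\n"
        "Some history content.\n"
        "== Usage ==\n"
        "Some usage content.\n"
    )
    # Expected: {'History': 'Some history content.', 'Usage': 'Some usage content.'}
    print(split_content_into_sections(sample))


if __name__ == "__main__":
    _demo_extract_wiki_id()
    _demo_get_wiki_details()
    _demo_get_wiki_xml()
    _demo_split_content_into_sections()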