import re
import urllib.parse
import xml.etree.ElementTree as ET
from typing import Any, Dict, Optional, Tuple

import requests
import wikipedia


# Function to extract the wiki id from a given URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given URL.

    Args:
        url (str): The URL to extract the wiki id from.

    Returns:
        str: The extracted wiki id.
    """
    # Validate that the URL is from Wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")

    # Parse the URL; urlparse already separates the query string and the
    # fragment from the path, so neither needs to be stripped by hand
    parsed_url = urllib.parse.urlparse(url)

    # The wiki id is the last segment of the path (e.g. /wiki/<id>)
    wiki_id = parsed_url.path.split('/')[-1]

    # URL-decode the wiki id to handle special characters
    wiki_id = urllib.parse.unquote(wiki_id)

    # Replace underscores with spaces, as the Wikipedia API expects spaces
    wiki_id = wiki_id.replace('_', ' ')

    return wiki_id


# Function to get a details dictionary for a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets a details dictionary for a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details for.

    Returns:
        dict: The details dictionary, or None if there was an error.
    """
    try:
        # If the input is a URL, extract the wiki id from it
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If the direct page lookup fails, fall back to a search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError,
                    wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        wiki_xml, error = get_wiki_xml(page.title)
        if error or not wiki_xml:
            print(f"Error fetching XML data: {error}")
            return None

        # Build the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
        return details

    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None


# Function to get the raw XML data for a given page title
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"

        # Parameters requesting the latest revision content in XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml",
        }

        # Make the request and surface any HTTP error status
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.text, None

    except requests.exceptions.RequestException as e:
        # requests raises RequestException subclasses, not wikipedia errors
        return None, {"error": f"Request failed: {str(e)}"}
    except Exception as e:
        return None, {"error": f"An error occurred: {str(e)}"}
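
# The XML returned by get_wiki_xml wraps the page's wikitext inside a <rev>
# element. The helper below is a minimal sketch of how that content could be
# unpacked with ElementTree; it assumes the legacy response shape (no rvslots
# parameter), and extract_wikitext is a name introduced here for illustration.
def extract_wikitext(wiki_xml: str) -> Optional[str]:
    """Pulls the revision wikitext out of a MediaWiki XML query response."""
    root = ET.fromstring(wiki_xml)
    # The revision content is the text of the first <rev> element, if any
    rev = root.find(".//rev")
    return rev.text if rev is not None else None
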
# Function to split content into sections using the == SECTION NAME == heading pattern
def split_content_into_sections(content: str, content_format: Optional[str] = None) -> Dict[str, str]:
    """
    Splits the content into sections using the ==+ [SECTION NAME] ==+ regex pattern.

    Args:
        content (str): The content to split.
        content_format (str): The format of the content ("Plain Text" or "XML");
            currently unused.

    Returns:
        dict: A dictionary mapping section names to their content.
    """
    sections_dict = {}

    # Split the content on headings of two or more '=' characters, such as
    # "== History ==" or "=== Early life ===". With one capture group,
    # re.split returns [preamble, name1, body1, name2, body2, ...]
    sections = re.split(r'={2,}([^=]+)={2,}', content)

    # Odd indices hold section names; each is followed by its body
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content

    return sections_dict
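
# A small usage sketch tying the helpers together: fetch the details for a
# page and split its plain-text content into named sections. The article URL
# below is illustrative only; any English Wikipedia article URL should work,
# and the calls require network access.
if __name__ == "__main__":
    details = get_wiki_details("https://en.wikipedia.org/wiki/Alan_Turing")
    if details:
        sections = split_content_into_sections(details["content"])
        print(f"Fetched '{details['title']}' with {len(sections)} sections:")
        for name in sections:
            print(f"  - {name}")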