from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch_wikipedia_tables(
    url: str,
    handle_special_chars: bool = True,
    timeout: float = 30.0,
) -> list[pd.DataFrame]:
    """
    Fetch tables from a Wikipedia URL with robust error handling.

    Any failure is reported with ``print`` and results in an empty list;
    this function does not raise.

    Parameters:
    -----------
    url : str
        The Wikipedia URL to fetch tables from.
    handle_special_chars : bool, default True
        Whether to clean special characters in data before parsing.
    timeout : float, default 30.0
        Seconds to wait for the HTTP request before giving up.

    Returns:
    --------
    list of pd.DataFrame
        A list of pandas DataFrames containing the tables found on the page.
        Empty when the page has no ``wikitable`` tables or an error occurred.
    """
    try:
        tables = _fetch_tables_with_bs4(url, timeout=timeout)
        if handle_special_chars:
            # Clean tables to handle special characters and formatting issues
            tables = [_clean_table(table) for table in tables]
    except Exception as e:
        # Top-level boundary: callers rely on getting a list back, never an
        # exception.
        print(f"Error fetching tables: {e}")
        return []

    if not tables:
        print(f"No tables found at {url}")
    return tables


def _fetch_tables_with_bs4(url: str, timeout: float = 30.0) -> list[pd.DataFrame]:
    """
    Download ``url`` and parse every ``<table class="wikitable">`` on the page.

    Parameters:
    -----------
    url : str
        Page to download.
    timeout : float, default 30.0
        Seconds passed to ``requests.get``; without it a stalled connection
        would block indefinitely.

    Returns:
    --------
    list of pd.DataFrame
        One DataFrame per table that has at least one non-empty data row.
        Empty on any error (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        tables = []
        for table in soup.find_all("table", {"class": "wikitable"}):
            frame = _parse_wikitable(table)
            if frame is not None:
                tables.append(frame)
        return tables
    except Exception as e:
        print(f"Error in BeautifulSoup fallback: {e}")
        return []


def _parse_wikitable(table) -> Optional[pd.DataFrame]:
    """
    Convert one BeautifulSoup ``<table>`` element into a DataFrame.

    The first ``<tr>`` is treated as the header row and every later row with
    at least one cell is treated as data. Header text is used for the column
    names only when its length equals the widest data row; otherwise pandas'
    default integer column labels are used.

    Returns ``None`` when the table has no non-empty data rows.
    """
    # NOTE(review): colspan/rowspan are not expanded, so tables that use
    # merged cells may come out with ragged rows.
    rows = [
        [cell.text.strip() for cell in tr.find_all(["td", "th"])]
        for tr in table.find_all("tr")
    ]
    if not rows:
        return None

    # Only the first row supplies headers. Collecting every <th> in the table
    # would also pick up row-header cells from body rows.
    headers, *body = rows
    data = [row for row in body if row]  # Skip empty rows
    if not data:
        return None

    # Compare against the widest row, not just the first, so a single longer
    # row cannot make DataFrame construction raise.
    width = max(len(row) for row in data)
    if headers and len(headers) == width:
        padded = [row + [None] * (width - len(row)) for row in data]
        return pd.DataFrame(padded, columns=headers)
    return pd.DataFrame(data)


def _clean_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a table by handling special characters and formatting issues.

    For every text column (object or string dtype) this removes ``;``,
    replaces the Unicode minus sign with ``-``, replaces non-breaking spaces
    and newlines with a regular space, removes reference tags such as ``[1]``
    and finally strips surrounding whitespace. Missing cells stay missing
    instead of becoming the text ``"None"`` / ``"nan"``.

    Parameters:
    -----------
    df : pd.DataFrame
        Table to clean. It is not modified.

    Returns:
    --------
    pd.DataFrame
        A cleaned copy of ``df``.
    """
    # Make a copy to avoid modifying the original
    cleaned = df.copy()

    # Address columns by position: with duplicated header names, label-based
    # access returns a DataFrame rather than a single column.
    for position in range(cleaned.shape[1]):
        column = cleaned.iloc[:, position]
        is_text = pd.api.types.is_object_dtype(
            column.dtype
        ) or pd.api.types.is_string_dtype(column.dtype)
        if not is_text:
            continue

        text = (
            column.astype(str)
            .str.replace(";", "", regex=False)
            .str.replace("\u2212", "-", regex=False)  # Replace minus sign
            .str.replace("\xa0", " ", regex=False)  # Replace non-breaking space
            .str.replace("\n", " ", regex=False)  # Replace newlines
            # Remove reference tags like [1], [2], etc.
            .str.replace(r"\[\d+\]", "", regex=True)
            # Strip last so whitespace left behind by a removed tag goes too
            .str.strip()
        )

        # Restore originally-missing cells; to_numpy() avoids index alignment
        # problems when the frame's index has duplicate labels.
        cleaned.iloc[:, position] = text.where(column.notna()).to_numpy()

    return cleaned