import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch_wikipedia_tables(
    url: str,
    handle_special_chars: bool = True,
) -> list[pd.DataFrame]:
    """
    Fetch tables from a Wikipedia URL with robust error handling.

    Parameters
    ----------
    url : str
        The Wikipedia URL to fetch tables from.
    handle_special_chars : bool, default True
        Whether to clean special characters in data before parsing.

    Returns
    -------
    list of pd.DataFrame
        A list of pandas DataFrames containing the tables found on the page.
    """
    try:
        all_tables = _fetch_tables_with_bs4(url)

        if handle_special_chars:
            for i, table in enumerate(all_tables):
                all_tables[i] = _clean_table(table)

        if all_tables:
            return all_tables
        else:
            print(f"No tables found at {url}")
            return []
    except Exception as e:
        print(f"Error fetching tables: {e}")
        return []
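

# Example usage (a sketch; the URL is illustrative, and any article whose
# tables carry the "wikitable" class is handled the same way):
#
#     tables = fetch_wikipedia_tables(
#         "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
#     )
#     for df in tables:
#         print(df.head())
#
# Failures print a message and yield an empty list, so callers can iterate
# over the result without a try/except of their own.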


def _fetch_tables_with_bs4(url: str) -> list[pd.DataFrame]:
    """Fetch "wikitable" tables from the page using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        tables = []

        for table in soup.find_all("table", {"class": "wikitable"}):
            data = []

            # Read headers from the first row only; scanning the whole table
            # would also sweep up row-header <th> cells from the body.
            first_row = table.find("tr")
            headers = [th.text.strip() for th in first_row.find_all("th")] if first_row else []

            # Some tables mark the header row with <td> cells instead.
            if not headers and first_row:
                headers = [cell.text.strip() for cell in first_row.find_all(["th", "td"])]

            # Collect data rows, skipping the first row when it supplied headers.
            rows = table.find_all("tr")
            for row in (rows[1:] if headers else rows):
                row_data = [cell.text.strip() for cell in row.find_all(["td", "th"])]
                if row_data:
                    data.append(row_data)

            if data:
                # Use the parsed headers only when they line up with the data.
                if headers and len(headers) == len(data[0]):
                    df = pd.DataFrame(data, columns=headers)
                else:
                    df = pd.DataFrame(data)
                tables.append(df)

        return tables
    except Exception as e:
        print(f"Error fetching tables with BeautifulSoup: {e}")
        return []
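

# The parser above assumes the common "wikitable" markup, roughly (a
# simplified sketch; real pages add <tbody>, scope and style attributes):
#
#     <table class="wikitable">
#       <tr><th>Rank</th><th>Country</th></tr>
#       <tr><td>1</td><td>India</td></tr>
#     </table>
#
# Cells spanning rows or columns (rowspan/colspan) are not expanded, so such
# tables may come back with misaligned columns.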


def _clean_table(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a table by handling special characters and formatting issues."""
    df = df.copy()

    for col in df.columns:
        if df[col].dtype == "object":
            # Normalize characters that commonly break downstream parsing.
            df[col] = df[col].astype(str).str.replace(";", "", regex=False)
            df[col] = df[col].str.replace("\u2212", "-", regex=False)  # Unicode minus sign
            df[col] = df[col].str.replace("\xa0", " ", regex=False)  # non-breaking space
            df[col] = df[col].str.replace("\n", " ", regex=False)
            df[col] = df[col].str.strip()

            # Strip Wikipedia citation markers such as "[1]".
            df[col] = df[col].str.replace(r"\[\d+\]", "", regex=True)

    return df
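

if __name__ == "__main__":
    # Minimal smoke test (the URL is an assumption for illustration only).
    demo_url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
    for i, df in enumerate(fetch_wikipedia_tables(demo_url)):
        print(f"Table {i}: {df.shape[0]} rows x {df.shape[1]} columns")
        print(df.head())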
|