Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from datasets import load_dataset | |
import pyarrow.parquet as pq | |
# Load the dataset using direct parquet file loading | |
print("Loading dataset...") | |
# Load directly from Hugging Face using pandas | |
train_dfs = [] | |
test_dfs = [] | |
# Base URL for the dataset files | |
base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data" | |
# List of expected files based on the dataset description | |
print("Loading training data...") | |
for year in range(2021, 2025): | |
for quarter in range(1, 5): | |
if year == 2021 and quarter == 1: | |
continue # Data starts from 2021 Q2 | |
try: | |
url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet" | |
df = pd.read_parquet(url) | |
train_dfs.append(df) | |
print(f"Loaded {year} Q{quarter} train data ({len(df)} records)") | |
except Exception as e: | |
print(f"Could not load {year} Q{quarter}: {e}") | |
# Load 2025 test data | |
print("\nLoading test data...") | |
for quarter in range(1, 5): | |
try: | |
url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet" | |
df = pd.read_parquet(url) | |
test_dfs.append(df) | |
print(f"Loaded 2025 Q{quarter} test data ({len(df)} records)") | |
except Exception as e: | |
print(f"Could not load 2025 Q{quarter}: {e}") | |
# Combine all dataframes | |
if train_dfs and test_dfs: | |
expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True) | |
elif train_dfs: | |
expenditures_df = pd.concat(train_dfs, ignore_index=True) | |
else: | |
# Create dummy data for testing | |
print("Creating dummy data for demonstration") | |
expenditures_df = pd.DataFrame({ | |
'Id': ['1', '2', '3'], | |
'MemberId': ['m1', 'm2', 'm3'], | |
'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'], | |
'Constituency': ['Riding A', 'Riding B', 'Riding C'], | |
'Party': ['Liberal', 'Conservative', 'NDP'], | |
'Category': ['Travel', 'Hospitality', 'Contract'], | |
'Amount': [1000.0, 2000.0, 1500.0], | |
'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'], | |
'Location': ['Toronto', 'Vancouver', 'Montreal'], | |
'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'], | |
'PeriodYear': [2024, 2024, 2024], | |
'PeriodQuarter': [1, 1, 2], | |
'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'], | |
'ClaimId': ['c1', 'c2', 'c3'], | |
'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'], | |
'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'] | |
}) | |
# Convert column names to lowercase | |
expenditures_df.columns = expenditures_df.columns.str.lower() | |
# Convert data types | |
expenditures_df['dateincurred'] = pd.to_datetime(expenditures_df['dateincurred'], errors='coerce') | |
expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'], errors='coerce') | |
expenditures_df['periodyear'] = pd.to_numeric(expenditures_df['periodyear'], errors='coerce') | |
expenditures_df['periodquarter'] = pd.to_numeric(expenditures_df['periodquarter'], errors='coerce') | |
print(f"\nLoaded {len(expenditures_df)} total expenditure records") | |
print(f"Columns: {list(expenditures_df.columns)}") | |
def create_overview_plots(year_filter, party_filter, category_filter): | |
# Apply filters | |
filtered_df = expenditures_df.copy() | |
if year_filter: | |
filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)] | |
if party_filter: | |
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)] | |
if category_filter: | |
filtered_df = filtered_df[filtered_df['category'].isin(category_filter)] | |
# Calculate metrics | |
total_spending = filtered_df['amount'].sum() | |
num_records = len(filtered_df) | |
avg_expense = filtered_df['amount'].mean() if num_records > 0 else 0 | |
num_members = filtered_df['memberid'].nunique() | |
metrics_text = f""" | |
### Key Metrics | |
- **Total Spending**: ${total_spending:,.2f} | |
- **Number of Records**: {num_records:,} | |
- **Average Expense**: ${avg_expense:,.2f} | |
- **Active Members**: {num_members} | |
""" | |
# Create spending by category pie chart | |
if len(filtered_df) > 0: | |
category_spending = filtered_df.groupby('category')['amount'].sum().reset_index() | |
fig_category = px.pie( | |
category_spending, | |
values='amount', | |
names='category', | |
title='Spending by Category' | |
) | |
else: | |
fig_category = px.pie(title='No data for selected filters') | |
# Create spending by party bar chart | |
if len(filtered_df) > 0: | |
party_spending = filtered_df.groupby('party')['amount'].sum().sort_values(ascending=False).reset_index() | |
fig_party = px.bar( | |
party_spending, | |
x='party', | |
y='amount', | |
title='Total Spending by Party', | |
labels={'amount': 'Total Amount ($)', 'party': 'Party'} | |
) | |
else: | |
fig_party = px.bar(title='No data for selected filters') | |
# Create quarterly trend line chart | |
if len(filtered_df) > 0: | |
quarterly = filtered_df.groupby(['periodyear', 'periodquarter'])['amount'].sum().reset_index() | |
quarterly['period'] = quarterly['periodyear'].astype(str) + '-Q' + quarterly['periodquarter'].astype(str) | |
fig_trend = px.line( | |
quarterly, | |
x='period', | |
y='amount', | |
title='Quarterly Spending Trend', | |
labels={'amount': 'Total Amount ($)', 'period': 'Period'}, | |
markers=True | |
) | |
else: | |
fig_trend = px.line(title='No data for selected filters') | |
return metrics_text, fig_category, fig_party, fig_trend | |
def get_top_spenders(n_top, year_filter, party_filter, category_filter): | |
# Apply filters | |
filtered_df = expenditures_df.copy() | |
if year_filter: | |
filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)] | |
if party_filter: | |
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)] | |
if category_filter: | |
filtered_df = filtered_df[filtered_df['category'].isin(category_filter)] | |
if len(filtered_df) > 0: | |
# Get top spenders | |
top_spenders = filtered_df.groupby(['membername', 'party'])['amount'].sum().sort_values(ascending=False).head(n_top).reset_index() | |
fig = px.bar( | |
top_spenders, | |
x='amount', | |
y='membername', | |
color='party', | |
orientation='h', | |
title=f'Top {n_top} Spenders', | |
labels={'amount': 'Total Amount ($)', 'membername': 'Member'}, | |
height=max(400, n_top * 25) | |
) | |
fig.update_layout(yaxis={'categoryorder': 'total ascending'}) | |
else: | |
fig = px.bar(title='No data for selected filters') | |
return fig | |
def analyze_member(member_name): | |
if not member_name: | |
return "Please select a member", None | |
member_df = expenditures_df[expenditures_df['membername'] == member_name] | |
if member_df.empty: | |
return "No data found for this member", None | |
# Calculate metrics | |
total = member_df['amount'].sum() | |
count = len(member_df) | |
avg = member_df['amount'].mean() | |
party = member_df['party'].iloc[0] | |
info = f""" | |
### {member_name} ({party}) | |
- **Total Expenses**: ${total:,.2f} | |
- **Number of Expenses**: {count:,} | |
- **Average Expense**: ${avg:,.2f} | |
""" | |
# Create category breakdown | |
category_breakdown = member_df.groupby('category')['amount'].sum().reset_index() | |
fig = px.pie( | |
category_breakdown, | |
values='amount', | |
names='category', | |
title=f'Expense Categories for {member_name}' | |
) | |
return info, fig | |
def search_expenses(member_search, min_amount, max_amount, category_filter): | |
filtered_df = expenditures_df.copy() | |
if member_search: | |
filtered_df = filtered_df[filtered_df['membername'].str.contains(member_search, case=False, na=False)] | |
filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)] | |
if category_filter and category_filter != "All": | |
filtered_df = filtered_df[filtered_df['category'] == category_filter] | |
# Get top 100 results | |
result = filtered_df.nlargest(100, 'amount')[['membername', 'party', 'category', 'amount', 'description', 'supplier', 'dateincurred']] | |
return result | |
# Get unique values for filters | |
years = sorted(expenditures_df['periodyear'].dropna().unique().tolist()) | |
parties = sorted(expenditures_df['party'].dropna().unique().tolist()) | |
categories = sorted(expenditures_df['category'].dropna().unique().tolist()) | |
member_names = sorted(expenditures_df['membername'].dropna().unique().tolist()) | |
# Create Gradio interface | |
with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo: | |
gr.Markdown("# π Canadian Parliamentary Expenditures Explorer") | |
gr.Markdown("Explore spending data from the Canadian House of Commons (2021-2025)") | |
# Filters | |
with gr.Row(): | |
year_filter = gr.CheckboxGroup( | |
choices=years, | |
value=years[-2:] if len(years) >= 2 else years, | |
label="Select Years" | |
) | |
party_filter = gr.CheckboxGroup( | |
choices=parties, | |
value=parties, | |
label="Select Parties" | |
) | |
category_filter = gr.CheckboxGroup( | |
choices=categories, | |
value=categories, | |
label="Select Categories" | |
) | |
# Overview Tab | |
with gr.Tab("Overview"): | |
overview_btn = gr.Button("Update Overview", variant="primary") | |
metrics_display = gr.Markdown() | |
with gr.Row(): | |
category_plot = gr.Plot() | |
party_plot = gr.Plot() | |
trend_plot = gr.Plot() | |
overview_btn.click( | |
create_overview_plots, | |
inputs=[year_filter, party_filter, category_filter], | |
outputs=[metrics_display, category_plot, party_plot, trend_plot] | |
) | |
# Top Spenders Tab | |
with gr.Tab("Top Spenders"): | |
n_slider = gr.Slider(10, 50, value=20, step=5, label="Number of top spenders") | |
spenders_btn = gr.Button("Show Top Spenders", variant="primary") | |
spenders_plot = gr.Plot() | |
spenders_btn.click( | |
get_top_spenders, | |
inputs=[n_slider, year_filter, party_filter, category_filter], | |
outputs=spenders_plot | |
) | |
# Member Analysis Tab | |
with gr.Tab("Member Analysis"): | |
member_dropdown = gr.Dropdown( | |
choices=member_names, | |
label="Select a Member", | |
interactive=True | |
) | |
member_info = gr.Markdown() | |
member_plot = gr.Plot() | |
member_dropdown.change( | |
analyze_member, | |
inputs=member_dropdown, | |
outputs=[member_info, member_plot] | |
) | |
# Search Tab | |
with gr.Tab("Search Expenses"): | |
with gr.Row(): | |
search_member = gr.Textbox(label="Member Name (partial match)", placeholder="e.g., Trudeau") | |
search_category = gr.Dropdown( | |
choices=["All"] + categories, | |
value="All", | |
label="Category" | |
) | |
with gr.Row(): | |
min_amount_input = gr.Number(value=0, label="Minimum Amount ($)") | |
max_amount_input = gr.Number(value=1000000, label="Maximum Amount ($)") | |
search_btn = gr.Button("Search", variant="primary") | |
results_table = gr.Dataframe( | |
headers=["Member", "Party", "Category", "Amount", "Description", "Supplier", "Date"], | |
datatype=["str", "str", "str", "number", "str", "str", "str"] | |
) | |
search_btn.click( | |
search_expenses, | |
inputs=[search_member, min_amount_input, max_amount_input, search_category], | |
outputs=results_table | |
) | |
# Footer | |
gr.Markdown(""" | |
--- | |
**Data Source**: Canadian House of Commons | |
**Dataset**: [irf23/canadian-parliamentary-expenditures](https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures) | |
**License**: CC0-1.0 (Public Domain) | |
""") | |
# Load initial overview | |
demo.load( | |
create_overview_plots, | |
inputs=[year_filter, party_filter, category_filter], | |
outputs=[metrics_display, category_plot, party_plot, trend_plot] | |
) | |
if __name__ == "__main__": | |
demo.launch() |