import gradio as gr import pandas as pd import plotly.express as px import plotly.graph_objects as go from datasets import load_dataset import pyarrow.parquet as pq # Load the dataset using direct parquet file loading print("Loading dataset...") # Load directly from Hugging Face using pandas train_dfs = [] test_dfs = [] # Base URL for the dataset files base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data" # List of expected files based on the dataset description print("Loading training data...") for year in range(2021, 2025): for quarter in range(1, 5): if year == 2021 and quarter == 1: continue # Data starts from 2021 Q2 try: url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet" df = pd.read_parquet(url) train_dfs.append(df) print(f"Loaded {year} Q{quarter} train data ({len(df)} records)") except Exception as e: print(f"Could not load {year} Q{quarter}: {e}") # Load 2025 test data print("\nLoading test data...") for quarter in range(1, 5): try: url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet" df = pd.read_parquet(url) test_dfs.append(df) print(f"Loaded 2025 Q{quarter} test data ({len(df)} records)") except Exception as e: print(f"Could not load 2025 Q{quarter}: {e}") # Combine all dataframes if train_dfs and test_dfs: expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True) elif train_dfs: expenditures_df = pd.concat(train_dfs, ignore_index=True) else: # Create dummy data for testing print("Creating dummy data for demonstration") expenditures_df = pd.DataFrame({ 'Id': ['1', '2', '3'], 'MemberId': ['m1', 'm2', 'm3'], 'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'], 'Constituency': ['Riding A', 'Riding B', 'Riding C'], 'Party': ['Liberal', 'Conservative', 'NDP'], 'Category': ['Travel', 'Hospitality', 'Contract'], 'Amount': [1000.0, 2000.0, 1500.0], 'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'], 'Location': ['Toronto', 'Vancouver', 'Montreal'], 'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'], 'PeriodYear': [2024, 2024, 2024], 'PeriodQuarter': [1, 1, 2], 'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'], 'ClaimId': ['c1', 'c2', 'c3'], 'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'], 'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'] }) # Convert column names to lowercase expenditures_df.columns = expenditures_df.columns.str.lower() # Convert data types expenditures_df['dateincurred'] = pd.to_datetime(expenditures_df['dateincurred'], errors='coerce') expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'], errors='coerce') expenditures_df['periodyear'] = pd.to_numeric(expenditures_df['periodyear'], errors='coerce') expenditures_df['periodquarter'] = pd.to_numeric(expenditures_df['periodquarter'], errors='coerce') print(f"\nLoaded {len(expenditures_df)} total expenditure records") print(f"Columns: {list(expenditures_df.columns)}") def create_overview_plots(year_filter, party_filter, category_filter): # Apply filters filtered_df = expenditures_df.copy() if year_filter: filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)] if party_filter: filtered_df = filtered_df[filtered_df['party'].isin(party_filter)] if category_filter: filtered_df = filtered_df[filtered_df['category'].isin(category_filter)] # Calculate metrics total_spending = filtered_df['amount'].sum() num_records = len(filtered_df) avg_expense = filtered_df['amount'].mean() if num_records > 0 else 0 num_members = filtered_df['memberid'].nunique() metrics_text = f""" ### Key Metrics - **Total Spending**: ${total_spending:,.2f} - **Number of Records**: {num_records:,} - **Average Expense**: ${avg_expense:,.2f} - **Active Members**: {num_members} """ # Create spending by category pie chart if len(filtered_df) > 0: category_spending = filtered_df.groupby('category')['amount'].sum().reset_index() fig_category = px.pie( category_spending, values='amount', names='category', title='Spending by Category' ) else: fig_category = px.pie(title='No data for selected filters') # Create spending by party bar chart if len(filtered_df) > 0: party_spending = filtered_df.groupby('party')['amount'].sum().sort_values(ascending=False).reset_index() fig_party = px.bar( party_spending, x='party', y='amount', title='Total Spending by Party', labels={'amount': 'Total Amount ($)', 'party': 'Party'} ) else: fig_party = px.bar(title='No data for selected filters') # Create quarterly trend line chart if len(filtered_df) > 0: quarterly = filtered_df.groupby(['periodyear', 'periodquarter'])['amount'].sum().reset_index() quarterly['period'] = quarterly['periodyear'].astype(str) + '-Q' + quarterly['periodquarter'].astype(str) fig_trend = px.line( quarterly, x='period', y='amount', title='Quarterly Spending Trend', labels={'amount': 'Total Amount ($)', 'period': 'Period'}, markers=True ) else: fig_trend = px.line(title='No data for selected filters') return metrics_text, fig_category, fig_party, fig_trend def get_top_spenders(n_top, year_filter, party_filter, category_filter): # Apply filters filtered_df = expenditures_df.copy() if year_filter: filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)] if party_filter: filtered_df = filtered_df[filtered_df['party'].isin(party_filter)] if category_filter: filtered_df = filtered_df[filtered_df['category'].isin(category_filter)] if len(filtered_df) > 0: # Get top spenders top_spenders = filtered_df.groupby(['membername', 'party'])['amount'].sum().sort_values(ascending=False).head(n_top).reset_index() fig = px.bar( top_spenders, x='amount', y='membername', color='party', orientation='h', title=f'Top {n_top} Spenders', labels={'amount': 'Total Amount ($)', 'membername': 'Member'}, height=max(400, n_top * 25) ) fig.update_layout(yaxis={'categoryorder': 'total ascending'}) else: fig = px.bar(title='No data for selected filters') return fig def analyze_member(member_name): if not member_name: return "Please select a member", None member_df = expenditures_df[expenditures_df['membername'] == member_name] if member_df.empty: return "No data found for this member", None # Calculate metrics total = member_df['amount'].sum() count = len(member_df) avg = member_df['amount'].mean() party = member_df['party'].iloc[0] info = f""" ### {member_name} ({party}) - **Total Expenses**: ${total:,.2f} - **Number of Expenses**: {count:,} - **Average Expense**: ${avg:,.2f} """ # Create category breakdown category_breakdown = member_df.groupby('category')['amount'].sum().reset_index() fig = px.pie( category_breakdown, values='amount', names='category', title=f'Expense Categories for {member_name}' ) return info, fig def search_expenses(member_search, min_amount, max_amount, category_filter): filtered_df = expenditures_df.copy() if member_search: filtered_df = filtered_df[filtered_df['membername'].str.contains(member_search, case=False, na=False)] filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)] if category_filter and category_filter != "All": filtered_df = filtered_df[filtered_df['category'] == category_filter] # Get top 100 results result = filtered_df.nlargest(100, 'amount')[['membername', 'party', 'category', 'amount', 'description', 'supplier', 'dateincurred']] return result # Get unique values for filters years = sorted(expenditures_df['periodyear'].dropna().unique().tolist()) parties = sorted(expenditures_df['party'].dropna().unique().tolist()) categories = sorted(expenditures_df['category'].dropna().unique().tolist()) member_names = sorted(expenditures_df['membername'].dropna().unique().tolist()) # Create Gradio interface with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo: gr.Markdown("# 🍁 Canadian Parliamentary Expenditures Explorer") gr.Markdown("Explore spending data from the Canadian House of Commons (2021-2025)") # Filters with gr.Row(): year_filter = gr.CheckboxGroup( choices=years, value=years[-2:] if len(years) >= 2 else years, label="Select Years" ) party_filter = gr.CheckboxGroup( choices=parties, value=parties, label="Select Parties" ) category_filter = gr.CheckboxGroup( choices=categories, value=categories, label="Select Categories" ) # Overview Tab with gr.Tab("Overview"): overview_btn = gr.Button("Update Overview", variant="primary") metrics_display = gr.Markdown() with gr.Row(): category_plot = gr.Plot() party_plot = gr.Plot() trend_plot = gr.Plot() overview_btn.click( create_overview_plots, inputs=[year_filter, party_filter, category_filter], outputs=[metrics_display, category_plot, party_plot, trend_plot] ) # Top Spenders Tab with gr.Tab("Top Spenders"): n_slider = gr.Slider(10, 50, value=20, step=5, label="Number of top spenders") spenders_btn = gr.Button("Show Top Spenders", variant="primary") spenders_plot = gr.Plot() spenders_btn.click( get_top_spenders, inputs=[n_slider, year_filter, party_filter, category_filter], outputs=spenders_plot ) # Member Analysis Tab with gr.Tab("Member Analysis"): member_dropdown = gr.Dropdown( choices=member_names, label="Select a Member", interactive=True ) member_info = gr.Markdown() member_plot = gr.Plot() member_dropdown.change( analyze_member, inputs=member_dropdown, outputs=[member_info, member_plot] ) # Search Tab with gr.Tab("Search Expenses"): with gr.Row(): search_member = gr.Textbox(label="Member Name (partial match)", placeholder="e.g., Trudeau") search_category = gr.Dropdown( choices=["All"] + categories, value="All", label="Category" ) with gr.Row(): min_amount_input = gr.Number(value=0, label="Minimum Amount ($)") max_amount_input = gr.Number(value=1000000, label="Maximum Amount ($)") search_btn = gr.Button("Search", variant="primary") results_table = gr.Dataframe( headers=["Member", "Party", "Category", "Amount", "Description", "Supplier", "Date"], datatype=["str", "str", "str", "number", "str", "str", "str"] ) search_btn.click( search_expenses, inputs=[search_member, min_amount_input, max_amount_input, search_category], outputs=results_table ) # Footer gr.Markdown(""" --- **Data Source**: Canadian House of Commons **Dataset**: [irf23/canadian-parliamentary-expenditures](https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures) **License**: CC0-1.0 (Public Domain) """) # Load initial overview demo.load( create_overview_plots, inputs=[year_filter, party_filter, category_filter], outputs=[metrics_display, category_plot, party_plot, trend_plot] ) if __name__ == "__main__": demo.launch()