Spaces:
Sleeping
Sleeping
File size: 12,613 Bytes
4b5e136 975185d 4b5e136 4dcee19 4b5e136 4dcee19 975185d 4dcee19 4b5e136 4dcee19 b7b327f 975185d b7b327f 4b5e136 975185d 4b5e136 4dcee19 975185d 4b5e136 b7b327f 4b5e136 b7b327f 4b5e136 975185d 4b5e136 975185d 4b5e136 975185d 4b5e136 b7b327f 4b5e136 975185d 4b5e136 b7b327f 4b5e136 b7b327f 4b5e136 b7b327f 4b5e136 975185d 4b5e136 975185d 4b5e136 33f5253 4b5e136 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 |
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset
import pyarrow.parquet as pq
# Load the dataset using direct parquet file loading
print("Loading dataset...")
# Load directly from Hugging Face using pandas
train_dfs = []
test_dfs = []
# Base URL for the dataset files
base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"
# List of expected files based on the dataset description
print("Loading training data...")
for year in range(2021, 2025):
for quarter in range(1, 5):
if year == 2021 and quarter == 1:
continue # Data starts from 2021 Q2
try:
url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet"
df = pd.read_parquet(url)
train_dfs.append(df)
print(f"Loaded {year} Q{quarter} train data ({len(df)} records)")
except Exception as e:
print(f"Could not load {year} Q{quarter}: {e}")
# Load 2025 test data
print("\nLoading test data...")
for quarter in range(1, 5):
try:
url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet"
df = pd.read_parquet(url)
test_dfs.append(df)
print(f"Loaded 2025 Q{quarter} test data ({len(df)} records)")
except Exception as e:
print(f"Could not load 2025 Q{quarter}: {e}")
# Combine all dataframes
if train_dfs and test_dfs:
expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True)
elif train_dfs:
expenditures_df = pd.concat(train_dfs, ignore_index=True)
else:
# Create dummy data for testing
print("Creating dummy data for demonstration")
expenditures_df = pd.DataFrame({
'Id': ['1', '2', '3'],
'MemberId': ['m1', 'm2', 'm3'],
'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'],
'Constituency': ['Riding A', 'Riding B', 'Riding C'],
'Party': ['Liberal', 'Conservative', 'NDP'],
'Category': ['Travel', 'Hospitality', 'Contract'],
'Amount': [1000.0, 2000.0, 1500.0],
'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'],
'Location': ['Toronto', 'Vancouver', 'Montreal'],
'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'],
'PeriodYear': [2024, 2024, 2024],
'PeriodQuarter': [1, 1, 2],
'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'],
'ClaimId': ['c1', 'c2', 'c3'],
'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'],
'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15']
})
# Convert column names to lowercase
expenditures_df.columns = expenditures_df.columns.str.lower()
# Convert data types
expenditures_df['dateincurred'] = pd.to_datetime(expenditures_df['dateincurred'], errors='coerce')
expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'], errors='coerce')
expenditures_df['periodyear'] = pd.to_numeric(expenditures_df['periodyear'], errors='coerce')
expenditures_df['periodquarter'] = pd.to_numeric(expenditures_df['periodquarter'], errors='coerce')
print(f"\nLoaded {len(expenditures_df)} total expenditure records")
print(f"Columns: {list(expenditures_df.columns)}")
def create_overview_plots(year_filter, party_filter, category_filter):
# Apply filters
filtered_df = expenditures_df.copy()
if year_filter:
filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)]
if party_filter:
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
if category_filter:
filtered_df = filtered_df[filtered_df['category'].isin(category_filter)]
# Calculate metrics
total_spending = filtered_df['amount'].sum()
num_records = len(filtered_df)
avg_expense = filtered_df['amount'].mean() if num_records > 0 else 0
num_members = filtered_df['memberid'].nunique()
metrics_text = f"""
### Key Metrics
- **Total Spending**: ${total_spending:,.2f}
- **Number of Records**: {num_records:,}
- **Average Expense**: ${avg_expense:,.2f}
- **Active Members**: {num_members}
"""
# Create spending by category pie chart
if len(filtered_df) > 0:
category_spending = filtered_df.groupby('category')['amount'].sum().reset_index()
fig_category = px.pie(
category_spending,
values='amount',
names='category',
title='Spending by Category'
)
else:
fig_category = px.pie(title='No data for selected filters')
# Create spending by party bar chart
if len(filtered_df) > 0:
party_spending = filtered_df.groupby('party')['amount'].sum().sort_values(ascending=False).reset_index()
fig_party = px.bar(
party_spending,
x='party',
y='amount',
title='Total Spending by Party',
labels={'amount': 'Total Amount ($)', 'party': 'Party'}
)
else:
fig_party = px.bar(title='No data for selected filters')
# Create quarterly trend line chart
if len(filtered_df) > 0:
quarterly = filtered_df.groupby(['periodyear', 'periodquarter'])['amount'].sum().reset_index()
quarterly['period'] = quarterly['periodyear'].astype(str) + '-Q' + quarterly['periodquarter'].astype(str)
fig_trend = px.line(
quarterly,
x='period',
y='amount',
title='Quarterly Spending Trend',
labels={'amount': 'Total Amount ($)', 'period': 'Period'},
markers=True
)
else:
fig_trend = px.line(title='No data for selected filters')
return metrics_text, fig_category, fig_party, fig_trend
def get_top_spenders(n_top, year_filter, party_filter, category_filter):
# Apply filters
filtered_df = expenditures_df.copy()
if year_filter:
filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)]
if party_filter:
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
if category_filter:
filtered_df = filtered_df[filtered_df['category'].isin(category_filter)]
if len(filtered_df) > 0:
# Get top spenders
top_spenders = filtered_df.groupby(['membername', 'party'])['amount'].sum().sort_values(ascending=False).head(n_top).reset_index()
fig = px.bar(
top_spenders,
x='amount',
y='membername',
color='party',
orientation='h',
title=f'Top {n_top} Spenders',
labels={'amount': 'Total Amount ($)', 'membername': 'Member'},
height=max(400, n_top * 25)
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
else:
fig = px.bar(title='No data for selected filters')
return fig
def analyze_member(member_name):
if not member_name:
return "Please select a member", None
member_df = expenditures_df[expenditures_df['membername'] == member_name]
if member_df.empty:
return "No data found for this member", None
# Calculate metrics
total = member_df['amount'].sum()
count = len(member_df)
avg = member_df['amount'].mean()
party = member_df['party'].iloc[0]
info = f"""
### {member_name} ({party})
- **Total Expenses**: ${total:,.2f}
- **Number of Expenses**: {count:,}
- **Average Expense**: ${avg:,.2f}
"""
# Create category breakdown
category_breakdown = member_df.groupby('category')['amount'].sum().reset_index()
fig = px.pie(
category_breakdown,
values='amount',
names='category',
title=f'Expense Categories for {member_name}'
)
return info, fig
def search_expenses(member_search, min_amount, max_amount, category_filter):
filtered_df = expenditures_df.copy()
if member_search:
filtered_df = filtered_df[filtered_df['membername'].str.contains(member_search, case=False, na=False)]
filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)]
if category_filter and category_filter != "All":
filtered_df = filtered_df[filtered_df['category'] == category_filter]
# Get top 100 results
result = filtered_df.nlargest(100, 'amount')[['membername', 'party', 'category', 'amount', 'description', 'supplier', 'dateincurred']]
return result
# Get unique values for filters
years = sorted(expenditures_df['periodyear'].dropna().unique().tolist())
parties = sorted(expenditures_df['party'].dropna().unique().tolist())
categories = sorted(expenditures_df['category'].dropna().unique().tolist())
member_names = sorted(expenditures_df['membername'].dropna().unique().tolist())
# Create Gradio interface
with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo:
gr.Markdown("# π Canadian Parliamentary Expenditures Explorer")
gr.Markdown("Explore spending data from the Canadian House of Commons (2021-2025)")
# Filters
with gr.Row():
year_filter = gr.CheckboxGroup(
choices=years,
value=years[-2:] if len(years) >= 2 else years,
label="Select Years"
)
party_filter = gr.CheckboxGroup(
choices=parties,
value=parties,
label="Select Parties"
)
category_filter = gr.CheckboxGroup(
choices=categories,
value=categories,
label="Select Categories"
)
# Overview Tab
with gr.Tab("Overview"):
overview_btn = gr.Button("Update Overview", variant="primary")
metrics_display = gr.Markdown()
with gr.Row():
category_plot = gr.Plot()
party_plot = gr.Plot()
trend_plot = gr.Plot()
overview_btn.click(
create_overview_plots,
inputs=[year_filter, party_filter, category_filter],
outputs=[metrics_display, category_plot, party_plot, trend_plot]
)
# Top Spenders Tab
with gr.Tab("Top Spenders"):
n_slider = gr.Slider(10, 50, value=20, step=5, label="Number of top spenders")
spenders_btn = gr.Button("Show Top Spenders", variant="primary")
spenders_plot = gr.Plot()
spenders_btn.click(
get_top_spenders,
inputs=[n_slider, year_filter, party_filter, category_filter],
outputs=spenders_plot
)
# Member Analysis Tab
with gr.Tab("Member Analysis"):
member_dropdown = gr.Dropdown(
choices=member_names,
label="Select a Member",
interactive=True
)
member_info = gr.Markdown()
member_plot = gr.Plot()
member_dropdown.change(
analyze_member,
inputs=member_dropdown,
outputs=[member_info, member_plot]
)
# Search Tab
with gr.Tab("Search Expenses"):
with gr.Row():
search_member = gr.Textbox(label="Member Name (partial match)", placeholder="e.g., Trudeau")
search_category = gr.Dropdown(
choices=["All"] + categories,
value="All",
label="Category"
)
with gr.Row():
min_amount_input = gr.Number(value=0, label="Minimum Amount ($)")
max_amount_input = gr.Number(value=1000000, label="Maximum Amount ($)")
search_btn = gr.Button("Search", variant="primary")
results_table = gr.Dataframe(
headers=["Member", "Party", "Category", "Amount", "Description", "Supplier", "Date"],
datatype=["str", "str", "str", "number", "str", "str", "str"]
)
search_btn.click(
search_expenses,
inputs=[search_member, min_amount_input, max_amount_input, search_category],
outputs=results_table
)
# Footer
gr.Markdown("""
---
**Data Source**: Canadian House of Commons
**Dataset**: [irf23/canadian-parliamentary-expenditures](https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures)
**License**: CC0-1.0 (Public Domain)
""")
# Load initial overview
demo.load(
create_overview_plots,
inputs=[year_filter, party_filter, category_filter],
outputs=[metrics_display, category_plot, party_plot, trend_plot]
)
if __name__ == "__main__":
demo.launch() |