Spaces:
Sleeping
Sleeping
Irfan Savji
committed on
Commit
·
b7b327f
1
Parent(s):
4b5e136
Fix column name issues with dataset
Browse files
Updated app.py to handle CamelCase column names from the dataset
and properly load data from train/test splits. All column references
are now lowercase for consistency.
app.py
CHANGED
|
@@ -8,12 +8,16 @@ from datasets import load_dataset
|
|
| 8 |
print("Loading dataset...")
|
| 9 |
dataset = load_dataset("irf23/canadian-parliamentary-expenditures")
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Convert date columns
|
| 16 |
-
expenditures_df['
|
| 17 |
expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'])
|
| 18 |
|
| 19 |
print(f"Loaded {len(expenditures_df)} expenditure records")
|
|
@@ -22,7 +26,7 @@ def create_overview_plots(year_filter, party_filter, category_filter):
|
|
| 22 |
# Apply filters
|
| 23 |
filtered_df = expenditures_df.copy()
|
| 24 |
if year_filter:
|
| 25 |
-
filtered_df = filtered_df[filtered_df['
|
| 26 |
if party_filter:
|
| 27 |
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
|
| 28 |
if category_filter:
|
|
@@ -31,8 +35,8 @@ def create_overview_plots(year_filter, party_filter, category_filter):
|
|
| 31 |
# Calculate metrics
|
| 32 |
total_spending = filtered_df['amount'].sum()
|
| 33 |
num_records = len(filtered_df)
|
| 34 |
-
avg_expense = filtered_df['amount'].mean()
|
| 35 |
-
num_members = filtered_df['
|
| 36 |
|
| 37 |
metrics_text = f"""
|
| 38 |
### Key Metrics
|
|
@@ -62,8 +66,8 @@ def create_overview_plots(year_filter, party_filter, category_filter):
|
|
| 62 |
)
|
| 63 |
|
| 64 |
# Create quarterly trend line chart
|
| 65 |
-
quarterly = filtered_df.groupby(['
|
| 66 |
-
quarterly['period'] = quarterly['
|
| 67 |
fig_trend = px.line(
|
| 68 |
quarterly,
|
| 69 |
x='period',
|
|
@@ -79,23 +83,23 @@ def get_top_spenders(n_top, year_filter, party_filter, category_filter):
|
|
| 79 |
# Apply filters
|
| 80 |
filtered_df = expenditures_df.copy()
|
| 81 |
if year_filter:
|
| 82 |
-
filtered_df = filtered_df[filtered_df['
|
| 83 |
if party_filter:
|
| 84 |
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
|
| 85 |
if category_filter:
|
| 86 |
filtered_df = filtered_df[filtered_df['category'].isin(category_filter)]
|
| 87 |
|
| 88 |
# Get top spenders
|
| 89 |
-
top_spenders = filtered_df.groupby(['
|
| 90 |
|
| 91 |
fig = px.bar(
|
| 92 |
top_spenders,
|
| 93 |
x='amount',
|
| 94 |
-
y='
|
| 95 |
color='party',
|
| 96 |
orientation='h',
|
| 97 |
title=f'Top {n_top} Spenders',
|
| 98 |
-
labels={'amount': 'Total Amount ($)', '
|
| 99 |
height=max(400, n_top * 25)
|
| 100 |
)
|
| 101 |
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
|
|
@@ -106,7 +110,7 @@ def analyze_member(member_name):
|
|
| 106 |
if not member_name:
|
| 107 |
return "Please select a member", None
|
| 108 |
|
| 109 |
-
member_df = expenditures_df[expenditures_df['
|
| 110 |
|
| 111 |
if member_df.empty:
|
| 112 |
return "No data found for this member", None
|
|
@@ -139,7 +143,7 @@ def search_expenses(member_search, min_amount, max_amount, category_filter):
|
|
| 139 |
filtered_df = expenditures_df.copy()
|
| 140 |
|
| 141 |
if member_search:
|
| 142 |
-
filtered_df = filtered_df[filtered_df['
|
| 143 |
|
| 144 |
filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)]
|
| 145 |
|
|
@@ -147,15 +151,15 @@ def search_expenses(member_search, min_amount, max_amount, category_filter):
|
|
| 147 |
filtered_df = filtered_df[filtered_df['category'] == category_filter]
|
| 148 |
|
| 149 |
# Get top 100 results
|
| 150 |
-
result = filtered_df.nlargest(100, 'amount')[['
|
| 151 |
|
| 152 |
return result
|
| 153 |
|
| 154 |
# Get unique values for filters
|
| 155 |
-
years = sorted(expenditures_df['
|
| 156 |
parties = sorted(expenditures_df['party'].unique().tolist())
|
| 157 |
categories = sorted(expenditures_df['category'].unique().tolist())
|
| 158 |
-
member_names = sorted(expenditures_df['
|
| 159 |
|
| 160 |
# Create Gradio interface
|
| 161 |
with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo:
|
|
|
|
| 8 |
print("Loading dataset...")
|
| 9 |
dataset = load_dataset("irf23/canadian-parliamentary-expenditures")
|
| 10 |
|
| 11 |
+
# Combine train and test splits
|
| 12 |
+
train_df = dataset['train'].to_pandas()
|
| 13 |
+
test_df = dataset['test'].to_pandas()
|
| 14 |
+
expenditures_df = pd.concat([train_df, test_df], ignore_index=True)
|
| 15 |
+
|
| 16 |
+
# The dataset uses CamelCase column names, let's rename them to lowercase for consistency
|
| 17 |
+
expenditures_df.columns = expenditures_df.columns.str.lower()
|
| 18 |
|
| 19 |
# Convert date columns
|
| 20 |
+
expenditures_df['dateincurred'] = pd.to_datetime(expenditures_df['dateincurred'])
|
| 21 |
expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'])
|
| 22 |
|
| 23 |
print(f"Loaded {len(expenditures_df)} expenditure records")
|
|
|
|
| 26 |
# Apply filters
|
| 27 |
filtered_df = expenditures_df.copy()
|
| 28 |
if year_filter:
|
| 29 |
+
filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)]
|
| 30 |
if party_filter:
|
| 31 |
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
|
| 32 |
if category_filter:
|
|
|
|
| 35 |
# Calculate metrics
|
| 36 |
total_spending = filtered_df['amount'].sum()
|
| 37 |
num_records = len(filtered_df)
|
| 38 |
+
avg_expense = filtered_df['amount'].mean() if num_records > 0 else 0
|
| 39 |
+
num_members = filtered_df['memberid'].nunique()
|
| 40 |
|
| 41 |
metrics_text = f"""
|
| 42 |
### Key Metrics
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Create quarterly trend line chart
|
| 69 |
+
quarterly = filtered_df.groupby(['periodyear', 'periodquarter'])['amount'].sum().reset_index()
|
| 70 |
+
quarterly['period'] = quarterly['periodyear'].astype(str) + '-Q' + quarterly['periodquarter'].astype(str)
|
| 71 |
fig_trend = px.line(
|
| 72 |
quarterly,
|
| 73 |
x='period',
|
|
|
|
| 83 |
# Apply filters
|
| 84 |
filtered_df = expenditures_df.copy()
|
| 85 |
if year_filter:
|
| 86 |
+
filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)]
|
| 87 |
if party_filter:
|
| 88 |
filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
|
| 89 |
if category_filter:
|
| 90 |
filtered_df = filtered_df[filtered_df['category'].isin(category_filter)]
|
| 91 |
|
| 92 |
# Get top spenders
|
| 93 |
+
top_spenders = filtered_df.groupby(['membername', 'party'])['amount'].sum().sort_values(ascending=False).head(n_top).reset_index()
|
| 94 |
|
| 95 |
fig = px.bar(
|
| 96 |
top_spenders,
|
| 97 |
x='amount',
|
| 98 |
+
y='membername',
|
| 99 |
color='party',
|
| 100 |
orientation='h',
|
| 101 |
title=f'Top {n_top} Spenders',
|
| 102 |
+
labels={'amount': 'Total Amount ($)', 'membername': 'Member'},
|
| 103 |
height=max(400, n_top * 25)
|
| 104 |
)
|
| 105 |
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
|
|
|
|
| 110 |
if not member_name:
|
| 111 |
return "Please select a member", None
|
| 112 |
|
| 113 |
+
member_df = expenditures_df[expenditures_df['membername'] == member_name]
|
| 114 |
|
| 115 |
if member_df.empty:
|
| 116 |
return "No data found for this member", None
|
|
|
|
| 143 |
filtered_df = expenditures_df.copy()
|
| 144 |
|
| 145 |
if member_search:
|
| 146 |
+
filtered_df = filtered_df[filtered_df['membername'].str.contains(member_search, case=False, na=False)]
|
| 147 |
|
| 148 |
filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)]
|
| 149 |
|
|
|
|
| 151 |
filtered_df = filtered_df[filtered_df['category'] == category_filter]
|
| 152 |
|
| 153 |
# Get top 100 results
|
| 154 |
+
result = filtered_df.nlargest(100, 'amount')[['membername', 'party', 'category', 'amount', 'description', 'supplier', 'dateincurred']]
|
| 155 |
|
| 156 |
return result
|
| 157 |
|
| 158 |
# Get unique values for filters
|
| 159 |
+
years = sorted(expenditures_df['periodyear'].unique().tolist())
|
| 160 |
parties = sorted(expenditures_df['party'].unique().tolist())
|
| 161 |
categories = sorted(expenditures_df['category'].unique().tolist())
|
| 162 |
+
member_names = sorted(expenditures_df['membername'].unique().tolist())
|
| 163 |
|
| 164 |
# Create Gradio interface
|
| 165 |
with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo:
|