Irfan Savji committed on
Commit
b7b327f
·
1 Parent(s): 4b5e136

Fix column name issues with dataset

Browse files

Updated app.py to handle CamelCase column names from the dataset
and properly load data from train/test splits. All column references
are now lowercase for consistency.

Files changed (1) hide show
  1. app.py +22 -18
app.py CHANGED
@@ -8,12 +8,16 @@ from datasets import load_dataset
8
  print("Loading dataset...")
9
  dataset = load_dataset("irf23/canadian-parliamentary-expenditures")
10
 
11
- # Convert to pandas DataFrames
12
- expenditures_df = dataset['expenditures'].to_pandas()
13
- members_df = dataset['members'].to_pandas()
 
 
 
 
14
 
15
  # Convert date columns
16
- expenditures_df['date_incurred'] = pd.to_datetime(expenditures_df['date_incurred'])
17
  expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'])
18
 
19
  print(f"Loaded {len(expenditures_df)} expenditure records")
@@ -22,7 +26,7 @@ def create_overview_plots(year_filter, party_filter, category_filter):
22
  # Apply filters
23
  filtered_df = expenditures_df.copy()
24
  if year_filter:
25
- filtered_df = filtered_df[filtered_df['period_year'].isin(year_filter)]
26
  if party_filter:
27
  filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
28
  if category_filter:
@@ -31,8 +35,8 @@ def create_overview_plots(year_filter, party_filter, category_filter):
31
  # Calculate metrics
32
  total_spending = filtered_df['amount'].sum()
33
  num_records = len(filtered_df)
34
- avg_expense = filtered_df['amount'].mean()
35
- num_members = filtered_df['member_id'].nunique()
36
 
37
  metrics_text = f"""
38
  ### Key Metrics
@@ -62,8 +66,8 @@ def create_overview_plots(year_filter, party_filter, category_filter):
62
  )
63
 
64
  # Create quarterly trend line chart
65
- quarterly = filtered_df.groupby(['period_year', 'period_quarter'])['amount'].sum().reset_index()
66
- quarterly['period'] = quarterly['period_year'].astype(str) + '-Q' + quarterly['period_quarter'].astype(str)
67
  fig_trend = px.line(
68
  quarterly,
69
  x='period',
@@ -79,23 +83,23 @@ def get_top_spenders(n_top, year_filter, party_filter, category_filter):
79
  # Apply filters
80
  filtered_df = expenditures_df.copy()
81
  if year_filter:
82
- filtered_df = filtered_df[filtered_df['period_year'].isin(year_filter)]
83
  if party_filter:
84
  filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
85
  if category_filter:
86
  filtered_df = filtered_df[filtered_df['category'].isin(category_filter)]
87
 
88
  # Get top spenders
89
- top_spenders = filtered_df.groupby(['member_name', 'party'])['amount'].sum().sort_values(ascending=False).head(n_top).reset_index()
90
 
91
  fig = px.bar(
92
  top_spenders,
93
  x='amount',
94
- y='member_name',
95
  color='party',
96
  orientation='h',
97
  title=f'Top {n_top} Spenders',
98
- labels={'amount': 'Total Amount ($)', 'member_name': 'Member'},
99
  height=max(400, n_top * 25)
100
  )
101
  fig.update_layout(yaxis={'categoryorder': 'total ascending'})
@@ -106,7 +110,7 @@ def analyze_member(member_name):
106
  if not member_name:
107
  return "Please select a member", None
108
 
109
- member_df = expenditures_df[expenditures_df['member_name'] == member_name]
110
 
111
  if member_df.empty:
112
  return "No data found for this member", None
@@ -139,7 +143,7 @@ def search_expenses(member_search, min_amount, max_amount, category_filter):
139
  filtered_df = expenditures_df.copy()
140
 
141
  if member_search:
142
- filtered_df = filtered_df[filtered_df['member_name'].str.contains(member_search, case=False, na=False)]
143
 
144
  filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)]
145
 
@@ -147,15 +151,15 @@ def search_expenses(member_search, min_amount, max_amount, category_filter):
147
  filtered_df = filtered_df[filtered_df['category'] == category_filter]
148
 
149
  # Get top 100 results
150
- result = filtered_df.nlargest(100, 'amount')[['member_name', 'party', 'category', 'amount', 'description', 'supplier', 'date_incurred']]
151
 
152
  return result
153
 
154
  # Get unique values for filters
155
- years = sorted(expenditures_df['period_year'].unique().tolist())
156
  parties = sorted(expenditures_df['party'].unique().tolist())
157
  categories = sorted(expenditures_df['category'].unique().tolist())
158
- member_names = sorted(expenditures_df['member_name'].unique().tolist())
159
 
160
  # Create Gradio interface
161
  with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo:
 
8
  print("Loading dataset...")
9
  dataset = load_dataset("irf23/canadian-parliamentary-expenditures")
10
 
11
+ # Combine train and test splits
12
+ train_df = dataset['train'].to_pandas()
13
+ test_df = dataset['test'].to_pandas()
14
+ expenditures_df = pd.concat([train_df, test_df], ignore_index=True)
15
+
16
+ # The dataset uses CamelCase column names; rename them to lowercase for consistency
17
+ expenditures_df.columns = expenditures_df.columns.str.lower()
18
 
19
  # Convert date columns
20
+ expenditures_df['dateincurred'] = pd.to_datetime(expenditures_df['dateincurred'])
21
  expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'])
22
 
23
  print(f"Loaded {len(expenditures_df)} expenditure records")
 
26
  # Apply filters
27
  filtered_df = expenditures_df.copy()
28
  if year_filter:
29
+ filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)]
30
  if party_filter:
31
  filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
32
  if category_filter:
 
35
  # Calculate metrics
36
  total_spending = filtered_df['amount'].sum()
37
  num_records = len(filtered_df)
38
+ avg_expense = filtered_df['amount'].mean() if num_records > 0 else 0
39
+ num_members = filtered_df['memberid'].nunique()
40
 
41
  metrics_text = f"""
42
  ### Key Metrics
 
66
  )
67
 
68
  # Create quarterly trend line chart
69
+ quarterly = filtered_df.groupby(['periodyear', 'periodquarter'])['amount'].sum().reset_index()
70
+ quarterly['period'] = quarterly['periodyear'].astype(str) + '-Q' + quarterly['periodquarter'].astype(str)
71
  fig_trend = px.line(
72
  quarterly,
73
  x='period',
 
83
  # Apply filters
84
  filtered_df = expenditures_df.copy()
85
  if year_filter:
86
+ filtered_df = filtered_df[filtered_df['periodyear'].isin(year_filter)]
87
  if party_filter:
88
  filtered_df = filtered_df[filtered_df['party'].isin(party_filter)]
89
  if category_filter:
90
  filtered_df = filtered_df[filtered_df['category'].isin(category_filter)]
91
 
92
  # Get top spenders
93
+ top_spenders = filtered_df.groupby(['membername', 'party'])['amount'].sum().sort_values(ascending=False).head(n_top).reset_index()
94
 
95
  fig = px.bar(
96
  top_spenders,
97
  x='amount',
98
+ y='membername',
99
  color='party',
100
  orientation='h',
101
  title=f'Top {n_top} Spenders',
102
+ labels={'amount': 'Total Amount ($)', 'membername': 'Member'},
103
  height=max(400, n_top * 25)
104
  )
105
  fig.update_layout(yaxis={'categoryorder': 'total ascending'})
 
110
  if not member_name:
111
  return "Please select a member", None
112
 
113
+ member_df = expenditures_df[expenditures_df['membername'] == member_name]
114
 
115
  if member_df.empty:
116
  return "No data found for this member", None
 
143
  filtered_df = expenditures_df.copy()
144
 
145
  if member_search:
146
+ filtered_df = filtered_df[filtered_df['membername'].str.contains(member_search, case=False, na=False)]
147
 
148
  filtered_df = filtered_df[(filtered_df['amount'] >= min_amount) & (filtered_df['amount'] <= max_amount)]
149
 
 
151
  filtered_df = filtered_df[filtered_df['category'] == category_filter]
152
 
153
  # Get top 100 results
154
+ result = filtered_df.nlargest(100, 'amount')[['membername', 'party', 'category', 'amount', 'description', 'supplier', 'dateincurred']]
155
 
156
  return result
157
 
158
  # Get unique values for filters
159
+ years = sorted(expenditures_df['periodyear'].unique().tolist())
160
  parties = sorted(expenditures_df['party'].unique().tolist())
161
  categories = sorted(expenditures_df['category'].unique().tolist())
162
+ member_names = sorted(expenditures_df['membername'].unique().tolist())
163
 
164
  # Create Gradio interface
165
  with gr.Blocks(title="Canadian Parliamentary Expenditures", theme=gr.themes.Soft()) as demo: