Irfan Savji committed
Commit 4dcee19 · 1 Parent(s): 975185d

Simplify data loading to avoid dataset schema conflicts

- Remove datasets library loading to avoid schema mismatch with the members table
- Load parquet files directly from URLs using pandas (see the sketch below)
- This avoids the conflict between the expenditures and members schemas
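
In practice the new approach is just pandas reading each quarter's parquet file straight from the repo's resolve URL, with no datasets schema inference in the way. A minimal sketch of that pattern, assuming the same file layout and naming used in the diff below (and a parquet engine such as pyarrow installed):

import pandas as pd

# Raw-file "resolve" URL for the dataset repo, as used in app.py below
base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"

# pandas reads parquet directly over HTTPS, so one file can be spot-checked in isolation
df = pd.read_parquet(f"{base_url}/train/expenditures-2021-q2.parquet")
print(df.shape, list(df.columns))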

Files changed (1)
  1. app.py +59 -91
app.py CHANGED
@@ -5,99 +5,67 @@ import plotly.graph_objects as go
 from datasets import load_dataset
 import pyarrow.parquet as pq
 
-# Try to load the dataset with error handling
+# Load the dataset using direct parquet file loading
 print("Loading dataset...")
-try:
-    # Try loading with streaming first to check structure
-    dataset = load_dataset("irf23/canadian-parliamentary-expenditures", streaming=True)
-    print(f"Dataset keys: {list(dataset.keys())}")
-
-    # Try to peek at the data structure
-    for split_name in dataset.keys():
-        print(f"\nChecking split: {split_name}")
-        for i, example in enumerate(dataset[split_name]):
-            print(f"Columns in {split_name}: {list(example.keys())}")
-            break
-
-    # Now load the full dataset
-    dataset = load_dataset("irf23/canadian-parliamentary-expenditures", streaming=False, trust_remote_code=True)
-
-except Exception as e:
-    print(f"Error loading dataset directly: {e}")
-    print("Trying alternative loading method...")
-
-    # Alternative: Try loading the parquet files directly
+
+# Load directly from Hugging Face using pandas
+train_dfs = []
+test_dfs = []
+
+# Base URL for the dataset files
+base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"
+
+# List of expected files based on the dataset description
+print("Loading training data...")
+for year in range(2021, 2025):
+    for quarter in range(1, 5):
+        if year == 2021 and quarter == 1:
+            continue # Data starts from 2021 Q2
+        try:
+            url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet"
+            df = pd.read_parquet(url)
+            train_dfs.append(df)
+            print(f"Loaded {year} Q{quarter} train data ({len(df)} records)")
+        except Exception as e:
+            print(f"Could not load {year} Q{quarter}: {e}")
+
+# Load 2025 test data
+print("\nLoading test data...")
+for quarter in range(1, 5):
     try:
-        # Load directly from Hugging Face using pandas
-        import requests
-
-        # Base URL for the dataset files
-        base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"
-
-        # Try to load train and test data
-        train_dfs = []
-        test_dfs = []
-
-        # List of expected files based on the dataset description
-        for year in range(2021, 2025):
-            for quarter in range(1, 5):
-                if year == 2021 and quarter == 1:
-                    continue # Data starts from 2021 Q2
-                try:
-                    url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet"
-                    df = pd.read_parquet(url)
-                    train_dfs.append(df)
-                    print(f"Loaded {year} Q{quarter} train data")
-                except:
-                    pass
-
-        # Load 2025 test data
-        for quarter in range(1, 5):
-            try:
-                url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet"
-                df = pd.read_parquet(url)
-                test_dfs.append(df)
-                print(f"Loaded 2025 Q{quarter} test data")
-            except:
-                pass
-
-        # Combine all dataframes
-        if train_dfs and test_dfs:
-            expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True)
-        elif train_dfs:
-            expenditures_df = pd.concat(train_dfs, ignore_index=True)
-        else:
-            raise Exception("Could not load any data files")
-
-    except Exception as e2:
-        print(f"Alternative loading also failed: {e2}")
-        # Create dummy data for testing
-        expenditures_df = pd.DataFrame({
-            'Id': ['1', '2', '3'],
-            'MemberId': ['m1', 'm2', 'm3'],
-            'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'],
-            'Constituency': ['Riding A', 'Riding B', 'Riding C'],
-            'Party': ['Liberal', 'Conservative', 'NDP'],
-            'Category': ['Travel', 'Hospitality', 'Contract'],
-            'Amount': [1000.0, 2000.0, 1500.0],
-            'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'],
-            'Location': ['Toronto', 'Vancouver', 'Montreal'],
-            'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'],
-            'PeriodYear': [2024, 2024, 2024],
-            'PeriodQuarter': [1, 1, 2],
-            'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'],
-            'ClaimId': ['c1', 'c2', 'c3'],
-            'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'],
-            'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15']
-        })
-        print("Using dummy data for demonstration")
+        url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet"
+        df = pd.read_parquet(url)
+        test_dfs.append(df)
+        print(f"Loaded 2025 Q{quarter} test data ({len(df)} records)")
+    except Exception as e:
+        print(f"Could not load 2025 Q{quarter}: {e}")
 
-# If we successfully loaded data, process it
-if 'dataset' in locals() and not isinstance(dataset, Exception):
-    # Combine train and test splits
-    train_df = dataset['train'].to_pandas()
-    test_df = dataset['test'].to_pandas()
-    expenditures_df = pd.concat([train_df, test_df], ignore_index=True)
+# Combine all dataframes
+if train_dfs and test_dfs:
+    expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True)
+elif train_dfs:
+    expenditures_df = pd.concat(train_dfs, ignore_index=True)
+else:
+    # Create dummy data for testing
+    print("Creating dummy data for demonstration")
+    expenditures_df = pd.DataFrame({
+        'Id': ['1', '2', '3'],
+        'MemberId': ['m1', 'm2', 'm3'],
+        'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'],
+        'Constituency': ['Riding A', 'Riding B', 'Riding C'],
+        'Party': ['Liberal', 'Conservative', 'NDP'],
+        'Category': ['Travel', 'Hospitality', 'Contract'],
+        'Amount': [1000.0, 2000.0, 1500.0],
+        'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'],
+        'Location': ['Toronto', 'Vancouver', 'Montreal'],
+        'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'],
+        'PeriodYear': [2024, 2024, 2024],
+        'PeriodQuarter': [1, 1, 2],
+        'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'],
+        'ClaimId': ['c1', 'c2', 'c3'],
+        'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'],
+        'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15']
+    })
 
 # Convert column names to lowercase
 expenditures_df.columns = expenditures_df.columns.str.lower()
@@ -108,7 +76,7 @@ expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'], errors='coerce')
 expenditures_df['periodyear'] = pd.to_numeric(expenditures_df['periodyear'], errors='coerce')
 expenditures_df['periodquarter'] = pd.to_numeric(expenditures_df['periodquarter'], errors='coerce')
 
-print(f"Loaded {len(expenditures_df)} expenditure records")
+print(f"\nLoaded {len(expenditures_df)} total expenditure records")
 print(f"Columns: {list(expenditures_df.columns)}")
 
 def create_overview_plots(year_filter, party_filter, category_filter):
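
If the loading loops ever need further tightening, the same URL pattern could be factored into a single helper shared by the train and test splits. This is only an illustrative sketch under the diff's file layout and naming; the helper name and structure are not part of the commit:

import pandas as pd

BASE_URL = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"

def load_split(split, periods):
    """Read each (year, quarter) parquet file for a split, skipping files that fail to load."""
    frames = []
    for year, quarter in periods:
        url = f"{BASE_URL}/{split}/expenditures-{year}-q{quarter}.parquet"
        try:
            frames.append(pd.read_parquet(url))
        except Exception as e:
            print(f"Could not load {split} {year} Q{quarter}: {e}")
    return frames

# Same coverage as app.py: 2021 Q2 through 2024 Q4 for train, all 2025 quarters for test
train_periods = [(y, q) for y in range(2021, 2025) for q in range(1, 5) if (y, q) != (2021, 1)]
test_periods = [(2025, q) for q in range(1, 5)]
frames = load_split("train", train_periods) + load_split("test", test_periods)
expenditures_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()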