Irfan Savji committed
Commit 4dcee19 · 1 Parent(s): 975185d

Simplify data loading to avoid dataset schema conflicts

- Remove datasets library loading to avoid schema mismatch with the members table
- Load parquet files directly from URLs using pandas (see the sketch below)
- This avoids the conflict between the expenditures and members schemas
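
In practice the new approach is just pandas reading each quarter's parquet file straight from the repo's resolve URL, with no datasets schema inference in the way. A minimal sketch of that pattern, assuming the same file layout and naming used in the diff below (and a parquet engine such as pyarrow installed):

import pandas as pd

# Raw-file "resolve" URL for the dataset repo, as used in app.py below
base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"

# pandas reads parquet directly over HTTPS, so one file can be spot-checked in isolation
df = pd.read_parquet(f"{base_url}/train/expenditures-2021-q2.parquet")
print(df.shape, list(df.columns))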

Files changed (1)
  1. app.py +59 -91
app.py CHANGED
@@ -5,99 +5,67 @@ import plotly.graph_objects as go
 from datasets import load_dataset
 import pyarrow.parquet as pq
 
-# Try to load the dataset with error handling
+# Load the dataset using direct parquet file loading
 print("Loading dataset...")
-try:
-    # Try loading with streaming first to check structure
-    dataset = load_dataset("irf23/canadian-parliamentary-expenditures", streaming=True)
-    print(f"Dataset keys: {list(dataset.keys())}")
-
-    # Try to peek at the data structure
-    for split_name in dataset.keys():
-        print(f"\nChecking split: {split_name}")
-        for i, example in enumerate(dataset[split_name]):
-            print(f"Columns in {split_name}: {list(example.keys())}")
-            break
-
-    # Now load the full dataset
-    dataset = load_dataset("irf23/canadian-parliamentary-expenditures", streaming=False, trust_remote_code=True)
-
-except Exception as e:
-    print(f"Error loading dataset directly: {e}")
-    print("Trying alternative loading method...")
-
-    # Alternative: Try loading the parquet files directly
+
+# Load directly from Hugging Face using pandas
+train_dfs = []
+test_dfs = []
+
+# Base URL for the dataset files
+base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"
+
+# List of expected files based on the dataset description
+print("Loading training data...")
+for year in range(2021, 2025):
+    for quarter in range(1, 5):
+        if year == 2021 and quarter == 1:
+            continue # Data starts from 2021 Q2
+        try:
+            url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet"
+            df = pd.read_parquet(url)
+            train_dfs.append(df)
+            print(f"Loaded {year} Q{quarter} train data ({len(df)} records)")
+        except Exception as e:
+            print(f"Could not load {year} Q{quarter}: {e}")
+
+# Load 2025 test data
+print("\nLoading test data...")
+for quarter in range(1, 5):
     try:
-        # Load directly from Hugging Face using pandas
-        import requests
-
-        # Base URL for the dataset files
-        base_url = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"
-
-        # Try to load train and test data
-        train_dfs = []
-        test_dfs = []
-
-        # List of expected files based on the dataset description
-        for year in range(2021, 2025):
-            for quarter in range(1, 5):
-                if year == 2021 and quarter == 1:
-                    continue # Data starts from 2021 Q2
-                try:
-                    url = f"{base_url}/train/expenditures-{year}-q{quarter}.parquet"
-                    df = pd.read_parquet(url)
-                    train_dfs.append(df)
-                    print(f"Loaded {year} Q{quarter} train data")
-                except:
-                    pass
-
-        # Load 2025 test data
-        for quarter in range(1, 5):
-            try:
-                url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet"
-                df = pd.read_parquet(url)
-                test_dfs.append(df)
-                print(f"Loaded 2025 Q{quarter} test data")
-            except:
-                pass
-
-        # Combine all dataframes
-        if train_dfs and test_dfs:
-            expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True)
-        elif train_dfs:
-            expenditures_df = pd.concat(train_dfs, ignore_index=True)
-        else:
-            raise Exception("Could not load any data files")
-
-    except Exception as e2:
-        print(f"Alternative loading also failed: {e2}")
-        # Create dummy data for testing
-        expenditures_df = pd.DataFrame({
-            'Id': ['1', '2', '3'],
-            'MemberId': ['m1', 'm2', 'm3'],
-            'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'],
-            'Constituency': ['Riding A', 'Riding B', 'Riding C'],
-            'Party': ['Liberal', 'Conservative', 'NDP'],
-            'Category': ['Travel', 'Hospitality', 'Contract'],
-            'Amount': [1000.0, 2000.0, 1500.0],
-            'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'],
-            'Location': ['Toronto', 'Vancouver', 'Montreal'],
-            'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'],
-            'PeriodYear': [2024, 2024, 2024],
-            'PeriodQuarter': [1, 1, 2],
-            'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'],
-            'ClaimId': ['c1', 'c2', 'c3'],
-            'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'],
-            'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15']
-        })
-        print("Using dummy data for demonstration")
+        url = f"{base_url}/test/expenditures-2025-q{quarter}.parquet"
+        df = pd.read_parquet(url)
+        test_dfs.append(df)
+        print(f"Loaded 2025 Q{quarter} test data ({len(df)} records)")
+    except Exception as e:
+        print(f"Could not load 2025 Q{quarter}: {e}")
 
-# If we successfully loaded data, process it
-if 'dataset' in locals() and not isinstance(dataset, Exception):
-    # Combine train and test splits
-    train_df = dataset['train'].to_pandas()
-    test_df = dataset['test'].to_pandas()
-    expenditures_df = pd.concat([train_df, test_df], ignore_index=True)
+# Combine all dataframes
+if train_dfs and test_dfs:
+    expenditures_df = pd.concat(train_dfs + test_dfs, ignore_index=True)
+elif train_dfs:
+    expenditures_df = pd.concat(train_dfs, ignore_index=True)
+else:
+    # Create dummy data for testing
+    print("Creating dummy data for demonstration")
+    expenditures_df = pd.DataFrame({
+        'Id': ['1', '2', '3'],
+        'MemberId': ['m1', 'm2', 'm3'],
+        'MemberName': ['John Doe', 'Jane Smith', 'Bob Johnson'],
+        'Constituency': ['Riding A', 'Riding B', 'Riding C'],
+        'Party': ['Liberal', 'Conservative', 'NDP'],
+        'Category': ['Travel', 'Hospitality', 'Contract'],
+        'Amount': [1000.0, 2000.0, 1500.0],
+        'Description': ['Flight to Ottawa', 'Meeting expenses', 'Consulting'],
+        'Location': ['Toronto', 'Vancouver', 'Montreal'],
+        'Supplier': ['Air Canada', 'Hotel XYZ', 'Consultant ABC'],
+        'PeriodYear': [2024, 2024, 2024],
+        'PeriodQuarter': [1, 1, 2],
+        'DateIncurred': ['2024-01-15', '2024-02-20', '2024-04-10'],
+        'ClaimId': ['c1', 'c2', 'c3'],
+        'CreatedAt': ['2024-01-20', '2024-02-25', '2024-04-15'],
+        'UpdatedAt': ['2024-01-20', '2024-02-25', '2024-04-15']
+    })
 
 # Convert column names to lowercase
 expenditures_df.columns = expenditures_df.columns.str.lower()
@@ -108,7 +76,7 @@ expenditures_df['amount'] = pd.to_numeric(expenditures_df['amount'], errors='coerce')
 expenditures_df['periodyear'] = pd.to_numeric(expenditures_df['periodyear'], errors='coerce')
 expenditures_df['periodquarter'] = pd.to_numeric(expenditures_df['periodquarter'], errors='coerce')
 
-print(f"Loaded {len(expenditures_df)} expenditure records")
+print(f"\nLoaded {len(expenditures_df)} total expenditure records")
 print(f"Columns: {list(expenditures_df.columns)}")
 
 def create_overview_plots(year_filter, party_filter, category_filter):
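
If the loading loops ever need further tightening, the same URL pattern could be factored into a single helper shared by the train and test splits. This is only an illustrative sketch under the diff's file layout and naming; the helper name and structure are not part of the commit:

import pandas as pd

BASE_URL = "https://huggingface.co/datasets/irf23/canadian-parliamentary-expenditures/resolve/main/data"

def load_split(split, periods):
    """Read each (year, quarter) parquet file for a split, skipping files that fail to load."""
    frames = []
    for year, quarter in periods:
        url = f"{BASE_URL}/{split}/expenditures-{year}-q{quarter}.parquet"
        try:
            frames.append(pd.read_parquet(url))
        except Exception as e:
            print(f"Could not load {split} {year} Q{quarter}: {e}")
    return frames

# Same coverage as app.py: 2021 Q2 through 2024 Q4 for train, all 2025 quarters for test
train_periods = [(y, q) for y in range(2021, 2025) for q in range(1, 5) if (y, q) != (2021, 1)]
test_periods = [(2025, q) for q in range(1, 5)]
frames = load_split("train", train_periods) + load_split("test", test_periods)
expenditures_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()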