|
|
|
""" |
|
Data Validation Script |
|
Test the economic indicators and identify math issues |
|
""" |
|
|
|
import os |
|
import sys |
|
import pandas as pd |
|
import numpy as np |
|
from datetime import datetime |
|
|
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) |
|
|
|
from src.core.enhanced_fred_client import EnhancedFREDClient |
|
|
|
def _infer_frequency(series):
    """Return the frequency string inferred from ``series.index``, or None.

    ``pd.infer_freq`` raises ``ValueError`` when the index holds fewer than
    three timestamps and ``TypeError`` when the index is not datetime-like.
    Both cases are reported as ``None`` ("could not infer") so that one short
    or odd series does not abort the whole validation run.
    """
    try:
        return pd.infer_freq(series.index)
    except (TypeError, ValueError):
        return None


def _unit_messages(indicator, mean_val):
    """Return ``(warning_line, issue_text)`` for a suspicious mean, else None.

    Keeping both messages next to a single copy of the thresholds guarantees
    that the section-7 warning and the final issue summary cannot disagree.
    """
    if mean_val > 1000000:
        return (
            f" WARNING: {indicator} has very large values - may need unit conversion",
            f"Unit scale issue: {indicator} has very large values ({mean_val:.0f})",
        )
    if mean_val < 1 and indicator in ('FEDFUNDS', 'DGS10'):
        return (
            f" WARNING: {indicator} has small values - may be in decimal form instead of percentage",
            f"Unit format issue: {indicator} may be in decimal form instead of percentage",
        )
    return None


def test_data_validation(client=None):
    """Fetch a fixed set of indicators and print a data-validation report.

    The report covers: fetched shape/date range/columns, the last rows,
    descriptive statistics, missing-value counts, per-series index frequency,
    the latest period-over-period percent change, mean/std with unit-scale
    warnings, the client's quality report, inferred frequencies, and finally
    a summary of potential issues.

    Args:
        client: Optional object providing
            ``fetch_economic_data(indicators=, start_date=, end_date=,
            frequency=)`` (returning a DataFrame with one column per
            indicator) and ``validate_data_quality(data)`` (returning a dict
            whose ``'missing_data'`` entry maps series names to dicts with
            ``'completeness'`` and ``'missing_percentage'``). When omitted,
            an ``EnhancedFREDClient`` is created with the key from the
            ``FRED_API_KEY`` environment variable, falling back to ``"demo"``.

    Returns:
        None. All results are printed. Any exception raised during the run
        is caught, reported with its traceback, and not re-raised.
    """
    print("=== ECONOMIC DATA VALIDATION TEST ===\n")

    try:
        # Client construction stays inside the try block so that a failure
        # here is reported the same way as any other validation error.
        if client is None:
            api_key = os.environ.get("FRED_API_KEY", "demo")
            client = EnhancedFREDClient(api_key)

        indicators = ['GDPC1', 'CPIAUCSL', 'INDPRO', 'RSAFS', 'FEDFUNDS', 'DGS10']

        print("1. Testing data fetching...")
        data = client.fetch_economic_data(
            indicators=indicators,
            start_date='2020-01-01',
            end_date='2024-12-31',
            frequency='auto',
        )

        print(f"Data shape: {data.shape}")
        print(f"Date range: {data.index.min()} to {data.index.max()}")
        print(f"Columns: {list(data.columns)}")

        print("\n2. Raw data sample (last 5 observations):")
        print(data.tail())

        print("\n3. Data statistics:")
        print(data.describe())

        print("\n4. Missing data analysis:")
        print(data.isnull().sum())

        # Drop missing values once per indicator; every later section works
        # on the same cleaned series, in the requested indicator order.
        series_by_indicator = {
            indicator: data[indicator].dropna()
            for indicator in indicators
            if indicator in data.columns
        }

        print("\n5. Testing frequency standardization...")
        for indicator, series in series_by_indicator.items():
            print(f"{indicator}: {len(series)} observations, freq: {series.index.freq}")

        print("\n6. Testing growth rate calculation...")
        for indicator, series in series_by_indicator.items():
            if len(series) > 1:
                pct_change = series.pct_change().dropna()
                latest_change = pct_change.iloc[-1] * 100 if len(pct_change) > 0 else 0
                print(f"{indicator}: Latest change = {latest_change:.2f}%")
                print(f" Raw values: {series.iloc[-2]:.2f} -> {series.iloc[-1]:.2f}")

        print("\n7. Testing unit normalization...")
        issues = []
        for indicator, series in series_by_indicator.items():
            if series.empty:
                continue
            mean_val = series.mean()
            std_val = series.std()
            print(f"{indicator}: Mean={mean_val:.2f}, Std={std_val:.2f}")

            messages = _unit_messages(indicator, mean_val)
            if messages is not None:
                warning_line, issue_text = messages
                print(warning_line)
                # Remembered here, printed in the final summary.
                issues.append(issue_text)

        print("\n8. Testing data quality validation...")
        quality_report = client.validate_data_quality(data)
        print("Quality report summary:")
        for name, metrics in quality_report['missing_data'].items():
            print(f" {name}: {metrics['completeness']:.1f}% complete")

        print("\n9. Testing frequency alignment...")
        frequencies = {}
        for indicator, series in series_by_indicator.items():
            if not series.empty:
                freq = _infer_frequency(series)
                frequencies[indicator] = freq
                print(f" {indicator}: {freq}")

        # None ("could not infer") counts as its own frequency value here.
        unique_freqs = set(frequencies.values())
        if len(unique_freqs) > 1:
            print(f" WARNING: Multiple frequencies detected: {unique_freqs}")
            print(" This may cause issues in modeling and forecasting")

        print("\n=== VALIDATION COMPLETE ===")

        print("\n=== POTENTIAL ISSUES IDENTIFIED ===")

        if len(unique_freqs) > 1:
            issues.append(f"Frequency mismatch: Series have different frequencies {unique_freqs}")

        for name, metrics in quality_report['missing_data'].items():
            if metrics['missing_percentage'] > 10:
                issues.append(f"Missing data: {name} has {metrics['missing_percentage']:.1f}% missing values")

        if issues:
            for issue in issues:
                print(f" • {issue}")
        else:
            print(" No major issues detected")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"Error during validation: {e}")
        import traceback
        traceback.print_exc()
|
|
|
# Script entry point: run the validation report only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_data_validation()