# Author: Edwin Salguero
# Commit: Enhanced FRED ML with improved Reports & Insights page, fixed
#         alignment analysis, and comprehensive analytics improvements
# Commit hash: 2469150
#!/usr/bin/env python3
"""
Data Validation Script
Test the economic indicators and identify math issues
"""
import os | |
import sys | |
import pandas as pd | |
import numpy as np | |
from datetime import datetime | |
# Add src to path | |
sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) | |
from src.core.enhanced_fred_client import EnhancedFREDClient | |
def test_data_validation(): | |
"""Test data validation and identify issues""" | |
# Use a demo API key for testing (FRED allows limited access without key) | |
api_key = "demo" # FRED demo key for testing | |
print("=== ECONOMIC DATA VALIDATION TEST ===\n") | |
try: | |
# Initialize client | |
client = EnhancedFREDClient(api_key) | |
# Test indicators | |
indicators = ['GDPC1', 'CPIAUCSL', 'INDPRO', 'RSAFS', 'FEDFUNDS', 'DGS10'] | |
print("1. Testing data fetching...") | |
data = client.fetch_economic_data( | |
indicators=indicators, | |
start_date='2020-01-01', | |
end_date='2024-12-31', | |
frequency='auto' | |
) | |
print(f"Data shape: {data.shape}") | |
print(f"Date range: {data.index.min()} to {data.index.max()}") | |
print(f"Columns: {list(data.columns)}") | |
print("\n2. Raw data sample (last 5 observations):") | |
print(data.tail()) | |
print("\n3. Data statistics:") | |
print(data.describe()) | |
print("\n4. Missing data analysis:") | |
missing_data = data.isnull().sum() | |
print(missing_data) | |
print("\n5. Testing frequency standardization...") | |
# Test the frequency standardization | |
for indicator in indicators: | |
if indicator in data.columns: | |
series = data[indicator].dropna() | |
print(f"{indicator}: {len(series)} observations, freq: {series.index.freq}") | |
print("\n6. Testing growth rate calculation...") | |
# Test growth rate calculation | |
for indicator in indicators: | |
if indicator in data.columns: | |
series = data[indicator].dropna() | |
if len(series) > 1: | |
# Calculate percent change | |
pct_change = series.pct_change().dropna() | |
latest_change = pct_change.iloc[-1] * 100 if len(pct_change) > 0 else 0 | |
print(f"{indicator}: Latest change = {latest_change:.2f}%") | |
print(f" Raw values: {series.iloc[-2]:.2f} -> {series.iloc[-1]:.2f}") | |
print("\n7. Testing unit normalization...") | |
# Test unit normalization | |
for indicator in indicators: | |
if indicator in data.columns: | |
series = data[indicator].dropna() | |
if len(series) > 0: | |
mean_val = series.mean() | |
std_val = series.std() | |
print(f"{indicator}: Mean={mean_val:.2f}, Std={std_val:.2f}") | |
# Check for potential unit issues | |
if mean_val > 1000000: # Likely in billions/trillions | |
print(f" WARNING: {indicator} has very large values - may need unit conversion") | |
elif mean_val < 1 and indicator in ['FEDFUNDS', 'DGS10']: | |
print(f" WARNING: {indicator} has small values - may be in decimal form instead of percentage") | |
print("\n8. Testing data quality validation...") | |
quality_report = client.validate_data_quality(data) | |
print("Quality report summary:") | |
for series, metrics in quality_report['missing_data'].items(): | |
print(f" {series}: {metrics['completeness']:.1f}% complete") | |
print("\n9. Testing frequency alignment...") | |
# Check if all series have the same frequency | |
frequencies = {} | |
for indicator in indicators: | |
if indicator in data.columns: | |
series = data[indicator].dropna() | |
if len(series) > 0: | |
freq = pd.infer_freq(series.index) | |
frequencies[indicator] = freq | |
print(f" {indicator}: {freq}") | |
# Check for frequency mismatches | |
unique_freqs = set(frequencies.values()) | |
if len(unique_freqs) > 1: | |
print(f" WARNING: Multiple frequencies detected: {unique_freqs}") | |
print(" This may cause issues in modeling and forecasting") | |
print("\n=== VALIDATION COMPLETE ===") | |
# Summary of potential issues | |
print("\n=== POTENTIAL ISSUES IDENTIFIED ===") | |
issues = [] | |
# Check for unit scale issues | |
for indicator in indicators: | |
if indicator in data.columns: | |
series = data[indicator].dropna() | |
if len(series) > 0: | |
mean_val = series.mean() | |
if mean_val > 1000000: | |
issues.append(f"Unit scale issue: {indicator} has very large values ({mean_val:.0f})") | |
elif mean_val < 1 and indicator in ['FEDFUNDS', 'DGS10']: | |
issues.append(f"Unit format issue: {indicator} may be in decimal form instead of percentage") | |
# Check for frequency issues | |
if len(unique_freqs) > 1: | |
issues.append(f"Frequency mismatch: Series have different frequencies {unique_freqs}") | |
# Check for missing data | |
for series, metrics in quality_report['missing_data'].items(): | |
if metrics['missing_percentage'] > 10: | |
issues.append(f"Missing data: {series} has {metrics['missing_percentage']:.1f}% missing values") | |
if issues: | |
for issue in issues: | |
print(f" • {issue}") | |
else: | |
print(" No major issues detected") | |
except Exception as e: | |
print(f"Error during validation: {e}") | |
import traceback | |
traceback.print_exc() | |
if __name__ == "__main__": | |
test_data_validation() |