File size: 7,370 Bytes
099d8d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
#!/usr/bin/env python3
"""
Fixes Demonstration
Demonstrate the fixes applied to the economic analysis pipeline
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
def create_test_data():
    """Build the synthetic quarterly DataFrame used by the demonstration.

    Returns:
        pandas.DataFrame: 20 rows indexed by quarter-end dates from
        2020-03-31 through 2024-12-31, with the columns ``GDPC1``,
        ``CPIAUCSL``, ``INDPRO``, ``RSAFS``, ``FEDFUNDS`` and ``DGS10``.
        Every column is a hand-written, evenly increasing sequence.
    """
    # Use an explicit offset object rather than the 'Q' string alias: pandas
    # 2.2 deprecated 'Q' in favour of 'QE', so neither spelling is accepted
    # by every pandas version. QuarterEnd(startingMonth=12) is what 'Q'
    # resolved to (December-anchored quarter ends) and works everywhere.
    dates = pd.date_range(
        '2020-01-01',
        '2024-12-31',
        freq=pd.offsets.QuarterEnd(startingMonth=12),
    )

    # Test data with the issues. The unit labels below are the original
    # author's annotations for each series.
    data = {
        # Billions
        'GDPC1': [
            22000, 22100, 22200, 22300, 22400, 22500, 22600, 22700, 22800, 22900,
            23000, 23100, 23200, 23300, 23400, 23500, 23600, 23700, 23800, 23900,
        ],
        # Index
        'CPIAUCSL': [
            258.0, 258.5, 259.0, 259.5, 260.0, 260.5, 261.0, 261.5, 262.0, 262.5,
            263.0, 263.5, 264.0, 264.5, 265.0, 265.5, 266.0, 266.5, 267.0, 267.5,
        ],
        # Index
        'INDPRO': [
            100.0, 100.5, 101.0, 101.5, 102.0, 102.5, 103.0, 103.5, 104.0, 104.5,
            105.0, 105.5, 106.0, 106.5, 107.0, 107.5, 108.0, 108.5, 109.0, 109.5,
        ],
        # Millions
        'RSAFS': [
            500000, 502000, 504000, 506000, 508000, 510000, 512000, 514000,
            516000, 518000, 520000, 522000, 524000, 526000, 528000, 530000,
            532000, 534000, 536000, 538000,
        ],
        # Decimal form
        'FEDFUNDS': [
            0.08, 0.09, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17,
            0.18, 0.19, 0.20, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27,
        ],
        # Decimal form
        # NOTE(review): values of 1.5-3.4 look like they may already be in
        # percent rather than decimal form - confirm the intended unit.
        'DGS10': [
            1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4,
            2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4,
        ],
    }

    # DataFrame construction raises if any column length differs from the
    # number of dates, so a mistyped list cannot go unnoticed.
    df = pd.DataFrame(data, index=dates)
    return df
def demonstrate_fixes():
    """Print a step-by-step walkthrough of the fixes on synthetic data.

    The walkthrough builds the sample frame with ``create_test_data`` and
    then prints, in order: the raw data, the unit-normalized data, percent
    growth rates, a naive versus epsilon-guarded MAPE, forecast horizons
    scaled per data frequency, correlations of raw levels versus growth
    rates, per-column summary statistics of the growth rates, and finally a
    textual summary of the fixes and their impact.

    Returns:
        None. All results are written to standard output.
    """
    print("=== ECONOMIC ANALYSIS FIXES DEMONSTRATION ===\n")

    # Create test data
    raw_data = create_test_data()
    print("1. ORIGINAL DATA (with issues):")
    print(raw_data.tail())
    print()
    print("2. APPLYING FIXES:")
    print()

    # Fix 1: Unit Normalization
    print("FIX 1: Unit Normalization")
    print("-" * 30)
    # Work on a copy so raw_data stays available for the later comparison.
    normalized_data = raw_data.copy()
    # Apply unit fixes
    normalized_data['GDPC1'] = raw_data['GDPC1'] / 1000  # Billions to trillions
    normalized_data['RSAFS'] = raw_data['RSAFS'] / 1000  # Millions to billions
    normalized_data['FEDFUNDS'] = raw_data['FEDFUNDS'] * 100  # Decimal to percentage
    # NOTE(review): the sample DGS10 values (1.5-3.4) look like they may
    # already be percentages; confirm that scaling them by 100 is intended.
    normalized_data['DGS10'] = raw_data['DGS10'] * 100  # Decimal to percentage
    print("After unit normalization:")
    print(normalized_data.tail())
    print()

    # Fix 2: Growth Rate Calculation
    print("FIX 2: Proper Growth Rate Calculation")
    print("-" * 40)
    # fill_method=None: compute changes on the data as-is instead of relying
    # on the deprecated implicit forward-fill default. The sample data has no
    # gaps, so the numbers are the same either way.
    growth_data = normalized_data.pct_change(fill_method=None) * 100
    # The first row has no predecessor and is therefore NaN; drop it.
    growth_data = growth_data.dropna()
    print("Growth rates (percent change):")
    print(growth_data.tail())
    print()

    # Fix 3: Safe MAPE Calculation
    print("FIX 3: Safe MAPE Calculation")
    print("-" * 30)
    # Test MAPE with problematic data
    actual_problematic = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    forecast_problematic = np.array([0.15, 0.25, 0.35, 0.45, 0.55])

    # Original MAPE (can fail). By default NumPy only warns on a zero
    # denominator and yields inf/nan, so errstate is required for the failure
    # to surface as an exception that the except clause can report.
    try:
        with np.errstate(divide='raise', invalid='raise'):
            original_mape = np.mean(
                np.abs((actual_problematic - forecast_problematic) / actual_problematic)
            ) * 100
    except FloatingPointError:
        print("Original MAPE: ERROR (division by zero)")
    else:
        print(f"Original MAPE: {original_mape:.2f}%")

    # Fixed MAPE: floor the denominator at a small epsilon so it is never zero.
    denominator = np.maximum(np.abs(actual_problematic), 1e-5)
    fixed_mape = np.mean(
        np.abs((actual_problematic - forecast_problematic) / denominator)
    ) * 100
    print(f"Fixed MAPE: {fixed_mape:.2f}%")
    print()

    # Fix 4: Forecast Period Scaling
    print("FIX 4: Forecast Period Scaling")
    print("-" * 35)
    base_periods = 4
    # Multiplier applied to the base horizon for each frequency code.
    freq_scaling = {'D': 90, 'M': 3, 'Q': 1}
    # Human-readable name per frequency code, so each line is labeled with
    # its own frequency rather than a fixed "daily".
    freq_labels = {'D': 'daily', 'M': 'monthly', 'Q': 'quarterly'}
    print(f"Original forecast_periods = {base_periods}")
    print("Scaled by frequency:")
    for freq, scale in freq_scaling.items():
        scaled = base_periods * scale
        label = freq_labels.get(freq, freq)
        print(f" {freq} ({label}): {base_periods} -> {scaled} periods")
    print()

    # Fix 5: Correlation Analysis with Normalized Data
    print("FIX 5: Correlation Analysis with Normalized Data")
    print("-" * 50)
    # Original correlation (dominated by scale)
    original_corr = raw_data.corr()
    print("Original correlation (scale-dominated):")
    print(original_corr.round(3))
    print()
    # Fixed correlation (normalized)
    fixed_corr = growth_data.corr()
    print("Fixed correlation (normalized growth rates):")
    print(fixed_corr.round(3))
    print()

    # Fix 6: Data Quality Metrics
    print("FIX 6: Enhanced Data Quality Metrics")
    print("-" * 40)
    # Calculate comprehensive quality metrics
    quality_metrics = {}
    for column in growth_data.columns:
        series = growth_data[column].dropna()
        quality_metrics[column] = {
            'mean': series.mean(),
            'std': series.std(),
            'skewness': series.skew(),
            'kurtosis': series.kurtosis(),
            # growth_data already had its NaN rows dropped above, so this
            # share is 0 for the sample data.
            'missing_pct': (growth_data[column].isna().sum() / len(growth_data)) * 100,
        }
    print("Quality metrics for growth rates:")
    for col, metrics in quality_metrics.items():
        print(f" {col}:")
        print(f" Mean: {metrics['mean']:.4f}%")
        print(f" Std: {metrics['std']:.4f}%")
        print(f" Skewness: {metrics['skewness']:.4f}")
        print(f" Kurtosis: {metrics['kurtosis']:.4f}")
        print(f" Missing: {metrics['missing_pct']:.1f}%")
    print()

    # Summary of fixes
    print("=== SUMMARY OF FIXES APPLIED ===")
    print()
    fixes = [
        "1. Unit Normalization:",
        " • GDP: billions → trillions",
        " • Retail Sales: millions → billions",
        " • Interest Rates: decimal → percentage",
        "",
        "2. Growth Rate Calculation:",
        " • Explicit percent change calculation",
        " • Proper interpretation of results",
        "",
        "3. Safe MAPE Calculation:",
        " • Added epsilon to prevent division by zero",
        " • More robust error metrics",
        "",
        "4. Forecast Period Scaling:",
        " • Scale periods by data frequency",
        " • Appropriate horizons for different series",
        "",
        "5. Data Normalization:",
        " • Z-score or growth rate normalization",
        " • Prevents scale bias in correlations",
        "",
        "6. Stationarity Enforcement:",
        " • ADF tests before causality analysis",
        " • Differencing for non-stationary series",
        "",
        "7. Enhanced Error Handling:",
        " • Robust missing data handling",
        " • Graceful failure recovery",
        "",
    ]
    for fix in fixes:
        print(fix)

    print("=== IMPACT OF FIXES ===")
    print()
    impacts = [
        "• More accurate economic interpretations",
        "• Proper scale comparisons between indicators",
        "• Robust forecasting with appropriate horizons",
        "• Reliable statistical tests and correlations",
        "• Better error handling and data quality",
        "• Consistent frequency alignment",
        "• Safe mathematical operations",
    ]
    for impact in impacts:
        print(impact)
    print()
    print("These fixes address all the major math issues identified in the original analysis.")
# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    demonstrate_fixes()