|
"""data_handler.py module.""" |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from typing import Tuple, Dict |
|
import tensorflow as tf |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
class FinancialDataHandler:
    """Generate, scale and batch synthetic financial records for TensorFlow.

    The handler owns a single ``StandardScaler`` instance; it is (re)fitted
    every time :meth:`preprocess_data` runs and is handed back to the caller
    by :meth:`get_client_data`.
    """

    def __init__(self, config: Dict):
        """Initialize the data handler with configuration.

        Args:
            config: Nested mapping whose ``'data'`` entry provides the keys
                ``'batch_size'``, ``'shuffle_buffer'`` and
                ``'prefetch_buffer'``.

        Raises:
            KeyError: If ``'data'`` or one of the three keys is missing.
        """
        data_config = config['data']
        self.batch_size = data_config['batch_size']
        self.shuffle_buffer = data_config['shuffle_buffer']
        self.prefetch_buffer = data_config['prefetch_buffer']
        self.scaler = StandardScaler()

    def simulate_financial_data(
        self, num_samples: int = 1000, seed: int = 42
    ) -> pd.DataFrame:
        """Generate synthetic financial data for testing.

        Args:
            num_samples: Number of rows to generate.
            seed: Seed for the private random generator. The default of 42
                reproduces the values previously obtained by seeding the
                global NumPy generator with 42.

        Returns:
            A DataFrame with the columns ``transaction_amount``,
            ``account_balance``, ``transaction_frequency``, ``credit_score``
            (clipped to the range 300-850) and
            ``days_since_last_transaction``.
        """
        # A private RandomState keeps the output reproducible without
        # reseeding the process-wide ``np.random`` state, which would
        # silently affect unrelated code. The draw order below must stay
        # fixed: it determines which values land in which column.
        rng = np.random.RandomState(seed)

        data = {
            'transaction_amount': rng.lognormal(mean=4.0, sigma=1.0, size=num_samples),
            'account_balance': rng.normal(loc=10000, scale=5000, size=num_samples),
            'transaction_frequency': rng.poisson(lam=5, size=num_samples),
            'credit_score': rng.normal(loc=700, scale=50, size=num_samples).clip(300, 850),
            'days_since_last_transaction': rng.exponential(scale=7, size=num_samples),
        }

        return pd.DataFrame(data)

    def preprocess_data(self, data: pd.DataFrame) -> "tf.data.Dataset":
        """Preprocess the data and convert to TensorFlow dataset.

        Args:
            data: Feature table to standardize.

        Returns:
            A dataset built from the scaled rows, then shuffled, batched and
            prefetched using the buffer sizes given at construction time.
        """
        # fit_transform refits self.scaler on every call, so the scaler
        # always reflects the most recently processed frame.
        scaled_data = self.scaler.fit_transform(data)

        dataset = tf.data.Dataset.from_tensor_slices(scaled_data)

        # Shuffle before batching so individual rows, not whole batches,
        # are permuted.
        dataset = dataset.shuffle(self.shuffle_buffer)
        dataset = dataset.batch(self.batch_size)
        dataset = dataset.prefetch(self.prefetch_buffer)

        return dataset

    def get_client_data(
        self, num_samples: int = 1000
    ) -> "Tuple[tf.data.Dataset, StandardScaler]":
        """Get preprocessed client data and scaler.

        Args:
            num_samples: Number of synthetic rows to simulate before
                preprocessing.

        Returns:
            A ``(dataset, scaler)`` pair: the batched dataset and the
            handler's scaler as fitted on the simulated data.
        """
        raw_data = self.simulate_financial_data(num_samples)
        dataset = self.preprocess_data(raw_data)
        return dataset, self.scaler
|
|
|
|