# Edwin Salguero
# Initial commit: Enhanced Algorithmic Trading System with Synthetic Data Generation, Comprehensive Logging, and Extensive Testing
# 859af74
import os
import tempfile
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
import pytest

from agentic_ai_system.data_ingestion import (
    _generate_synthetic_data,
    _load_csv_data,
    load_data,
    validate_data,
)
class TestDataIngestion:
    """Test cases for the data ingestion module."""

    # Patch targets shared by several tests.
    _GENERATE_TARGET = 'agentic_ai_system.data_ingestion._generate_synthetic_data'
    _GENERATOR_CLASS_TARGET = (
        'agentic_ai_system.synthetic_data_generator.SyntheticDataGenerator'
    )

    @staticmethod
    def _ohlcv_frame(periods=10, **overrides):
        """Build a small, internally consistent OHLCV frame.

        Args:
            periods: Number of one-minute rows to generate.
            **overrides: Column name -> replacement values; lets a test inject
                a single defect into otherwise consistent data.

        Returns:
            DataFrame with timestamp, open, high, low, close and volume columns.
        """
        columns = {
            'timestamp': pd.date_range('2024-01-01', periods=periods, freq='1min'),
            'open': [150] * periods,
            'high': [155] * periods,
            'low': [145] * periods,
            'close': [152] * periods,
            'volume': [1000] * periods,
        }
        columns.update(overrides)
        return pd.DataFrame(columns)

    @pytest.fixture
    def config(self):
        """Sample configuration for testing.

        A fresh dict is returned for every test, so tests may mutate it freely.
        """
        return {
            'data_source': {
                'type': 'csv',
                'path': 'data/market_data.csv',
            },
            'synthetic_data': {
                'base_price': 150.0,
                'volatility': 0.02,
                'trend': 0.001,
                'noise_level': 0.005,
                'data_path': 'data/synthetic_market_data.csv',
            },
            'trading': {
                'symbol': 'AAPL',
                'timeframe': '1min',
            },
        }

    @pytest.fixture
    def sample_csv_data(self):
        """Create 100 rows of deterministic sample OHLCV data.

        The generator is seeded so every run sees the same frame, and high/low
        are derived from open/close so each row satisfies
        low <= min(open, close) and high >= max(open, close). Drawing the four
        prices independently can produce rows where open exceeds high, which
        would make the "valid data" test flaky.
        NOTE(review): assumes validate_data checks these OHLC relationships,
        as test_validate_data_invalid_ohlc suggests -- confirm.
        """
        periods = 100
        rng = np.random.default_rng(42)
        base_price = 150.0 + 0.1 * np.arange(periods)
        open_price = base_price + rng.normal(0, 1, periods)
        close_price = base_price + rng.normal(0, 1, periods)
        high_price = np.maximum(open_price, close_price) + np.abs(rng.normal(0, 2, periods))
        low_price = np.minimum(open_price, close_price) - np.abs(rng.normal(0, 2, periods))
        return pd.DataFrame({
            'timestamp': pd.date_range(start='2024-01-01', periods=periods, freq='1min'),
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': rng.integers(1000, 100000, periods),
        })

    def test_load_data_csv_type(self, config, sample_csv_data, tmp_path):
        """Test loading data with CSV type."""
        csv_path = tmp_path / 'market_data.csv'
        sample_csv_data.to_csv(csv_path, index=False)
        config['data_source']['path'] = str(csv_path)

        result = load_data(config)

        assert isinstance(result, pd.DataFrame)
        assert len(result) == len(sample_csv_data)
        assert list(result.columns) == list(sample_csv_data.columns)

    def test_load_data_synthetic_type(self, config):
        """Test loading data with synthetic type."""
        config['data_source']['type'] = 'synthetic'
        with patch(self._GENERATE_TARGET) as mock_generate:
            mock_generate.return_value = self._ohlcv_frame()
            result = load_data(config)

        assert isinstance(result, pd.DataFrame)
        mock_generate.assert_called_once_with(config)

    def test_load_data_invalid_type(self, config):
        """Test loading data with invalid type."""
        config['data_source']['type'] = 'invalid_type'
        with pytest.raises(ValueError, match="Unsupported data source type"):
            load_data(config)

    def test_load_csv_data_file_exists(self, config, sample_csv_data, tmp_path):
        """Test loading CSV data when file exists."""
        csv_path = tmp_path / 'market_data.csv'
        sample_csv_data.to_csv(csv_path, index=False)
        config['data_source']['path'] = str(csv_path)

        result = _load_csv_data(config)

        assert isinstance(result, pd.DataFrame)
        assert len(result) == len(sample_csv_data)
        # Accept any datetime64 resolution instead of pinning nanoseconds.
        assert pd.api.types.is_datetime64_any_dtype(result['timestamp'])

    def test_load_csv_data_file_not_exists(self, config, tmp_path):
        """Test loading CSV data when file doesn't exist."""
        # A path inside the fresh tmp_path directory is guaranteed to be absent.
        config['data_source']['path'] = str(tmp_path / 'nonexistent_file.csv')
        with patch(self._GENERATE_TARGET) as mock_generate:
            mock_df = pd.DataFrame({'test': [1, 2, 3]})
            mock_generate.return_value = mock_df
            result = _load_csv_data(config)

        assert result is mock_df
        mock_generate.assert_called_once_with(config)

    def test_load_csv_data_missing_columns(self, config, tmp_path):
        """Test loading CSV data with missing columns."""
        # Create CSV with missing columns (no high, low, volume).
        incomplete_data = pd.DataFrame({
            'timestamp': pd.date_range('2024-01-01', periods=10, freq='1min'),
            'open': [150] * 10,
            'close': [152] * 10,
        })
        csv_path = tmp_path / 'incomplete.csv'
        incomplete_data.to_csv(csv_path, index=False)
        config['data_source']['path'] = str(csv_path)

        with patch(self._GENERATE_TARGET) as mock_generate:
            mock_df = pd.DataFrame({'test': [1, 2, 3]})
            mock_generate.return_value = mock_df
            result = _load_csv_data(config)

        assert result is mock_df
        mock_generate.assert_called_once_with(config)

    def test_generate_synthetic_data(self, config):
        """Test synthetic data generation."""
        with patch(self._GENERATOR_CLASS_TARGET) as mock_generator_class:
            mock_generator = MagicMock()
            mock_generator_class.return_value = mock_generator
            mock_generator.generate_ohlcv_data.return_value = self._ohlcv_frame()
            result = _generate_synthetic_data(config)

        assert isinstance(result, pd.DataFrame)
        mock_generator.generate_ohlcv_data.assert_called_once()
        mock_generator.save_to_csv.assert_called_once()

    def test_validate_data_valid(self, sample_csv_data):
        """Test data validation with valid data."""
        assert validate_data(sample_csv_data)

    def test_validate_data_missing_columns(self):
        """Test data validation with missing columns."""
        invalid_data = pd.DataFrame({
            'timestamp': pd.date_range('2024-01-01', periods=10, freq='1min'),
            'open': [150] * 10,
            # Missing required columns
        })
        assert not validate_data(invalid_data)

    def test_validate_data_negative_prices(self):
        """Test data validation with negative prices."""
        invalid_data = self._ohlcv_frame(low=[-145] * 10)  # Negative low price
        assert not validate_data(invalid_data)

    def test_validate_data_negative_volumes(self):
        """Test data validation with negative volumes."""
        invalid_data = self._ohlcv_frame(volume=[-1000] * 10)  # Negative volume
        assert not validate_data(invalid_data)

    def test_validate_data_invalid_ohlc(self):
        """Test data validation with invalid OHLC relationships."""
        invalid_data = self._ohlcv_frame(high=[145] * 10)  # High < Open
        assert not validate_data(invalid_data)

    def test_validate_data_null_values(self):
        """Test data validation with null values."""
        # Build the column with NaN up front rather than assigning None into
        # an integer column, which relies on implicit dtype upcasting.
        invalid_data = self._ohlcv_frame(open=[np.nan] + [150.0] * 9)
        assert not validate_data(invalid_data)

    def test_validate_data_empty_dataframe(self):
        """Test data validation with empty DataFrame."""
        assert not validate_data(pd.DataFrame())

    def test_load_data_error_handling(self, config, tmp_path):
        """Test error handling in load_data."""
        config['data_source']['type'] = 'csv'
        config['data_source']['path'] = str(tmp_path / 'nonexistent_file.csv')
        with patch(self._GENERATE_TARGET, side_effect=Exception("Test error")):
            with pytest.raises(Exception, match="Test error"):
                load_data(config)

    def test_csv_data_timestamp_conversion(self, config, sample_csv_data, tmp_path):
        """Test timestamp conversion in CSV loading."""
        # Convert timestamp to string for CSV; work on a copy so the fixture
        # frame itself is left untouched.
        stringified = sample_csv_data.assign(
            timestamp=sample_csv_data['timestamp'].astype(str)
        )
        csv_path = tmp_path / 'market_data.csv'
        stringified.to_csv(csv_path, index=False)
        config['data_source']['path'] = str(csv_path)

        result = _load_csv_data(config)

        # Check that timestamp is converted to datetime
        assert pd.api.types.is_datetime64_any_dtype(result['timestamp'])

    def test_synthetic_data_directory_creation(self, config):
        """Test that synthetic data directory is created if it doesn't exist."""
        with patch('os.makedirs') as mock_makedirs:
            with patch(self._GENERATOR_CLASS_TARGET) as mock_generator_class:
                mock_generator = MagicMock()
                mock_generator_class.return_value = mock_generator
                mock_generator.generate_ohlcv_data.return_value = pd.DataFrame(
                    {'test': [1, 2, 3]}
                )
                _generate_synthetic_data(config)

        # Check that makedirs was called
        mock_makedirs.assert_called_once()

    def test_data_validation_edge_cases(self):
        """Test data validation with edge cases."""
        # Test with single row
        single_row_data = pd.DataFrame({
            'timestamp': [pd.Timestamp('2024-01-01')],
            'open': [150],
            'high': [155],
            'low': [145],
            'close': [152],
            'volume': [1000],
        })
        assert validate_data(single_row_data)

        # Test with very large numbers
        large_data = pd.DataFrame({
            'timestamp': pd.date_range('2024-01-01', periods=5, freq='1min'),
            'open': [1e6] * 5,
            'high': [1e6 + 100] * 5,
            'low': [1e6 - 100] * 5,
            'close': [1e6 + 50] * 5,
            'volume': [1e9] * 5,
        })
        assert validate_data(large_data)