mboukabous committed
Commit 829e3ac · 1 Parent(s): 3fee6e0

Add application file

Files changed (36)
  1. app.py +296 -0
  2. data/README.md +1 -0
  3. data/datasets/README.md +21 -0
  4. data/datasets/kaggle_data.py +115 -0
  5. data/preprocessing/README.md +1 -0
  6. data/raw/README.md +1 -0
  7. data/utils/README.md +1 -0
  8. models/README.md +1 -0
  9. models/computer_vision/README.md +1 -0
  10. models/deep_learning/README.md +1 -0
  11. models/nlp/README.md +1 -0
  12. models/reinforcement_learning/README.md +1 -0
  13. models/supervised/classification/README.md +1 -0
  14. models/supervised/regression/README.md +1 -0
  15. models/supervised/regression/adaboost_regressor.py +30 -0
  16. models/supervised/regression/catboost_regressor.py +32 -0
  17. models/supervised/regression/decision_tree_regressor.py +31 -0
  18. models/supervised/regression/elasticnet_regression.py +35 -0
  19. models/supervised/regression/extra_trees_regressor.py +33 -0
  20. models/supervised/regression/gradient_boosting_regressor.py +33 -0
  21. models/supervised/regression/knn_regressor.py +34 -0
  22. models/supervised/regression/lasso_regression.py +33 -0
  23. models/supervised/regression/lightgbm_regressor.py +37 -0
  24. models/supervised/regression/linear_regression.py +29 -0
  25. models/supervised/regression/mlp_regressor.py +35 -0
  26. models/supervised/regression/random_forest_regressor.py +32 -0
  27. models/supervised/regression/ridge_regression.py +33 -0
  28. models/supervised/regression/support_vector_regressor.py +34 -0
  29. models/supervised/regression/xgboost_regressor.py +35 -0
  30. models/unsupervised/README.md +1 -0
  31. requirements.txt +12 -0
  32. scripts/README.md +52 -0
  33. scripts/train_classification_model.py +185 -0
  34. scripts/train_regression_model.py +200 -0
  35. utils/README.md +45 -0
  36. utils/supervised_hyperparameter_tuning.py +115 -0
app.py ADDED
@@ -0,0 +1,296 @@
+ """
+ Gradio Interface for Training Regression Models
+
+ This script provides a Gradio-based user interface to train regression models using various datasets
+ and algorithms. It enables seamless interaction by allowing users to select models, preprocess data,
+ and specify hyperparameters through an intuitive UI.
+
+ Features:
+ - **Model Selection**: Choose from a list of available regression algorithms located in `models/supervised/regression`.
+ - **Dataset Input Options**:
+   - Upload a local CSV file.
+   - Specify a path to a dataset.
+   - Download datasets from Kaggle using the `kaggle.json` API credentials.
+ - **Hyperparameter Customization**: Modify model parameters like test size, random state, cross-validation folds,
+   and more directly in the interface.
+ - **Visualizations**: Generate plots like actual vs. predicted graphs after training.
+ - **Live Feedback**: Outputs training metrics, best hyperparameters, and paths to saved models.
+
+ Structure:
+ 1. **Helper Functions**:
+    - `get_model_modules`: Dynamically fetches available regression models.
+    - `download_kaggle_data`: Handles Kaggle dataset downloads.
+    - `train_model`: Constructs and executes the command for training models.
+    - `get_columns_from_data`: Extracts column names from the dataset for UI selection.
+
+ 2. **Gradio UI Components**:
+    - Allows users to toggle between different dataset input methods.
+    - Updates column dropdowns dynamically based on the dataset.
+    - Executes the training script and displays results and visualizations.
+
+ Usage:
+ - Place this script in the `interfaces/gradio/` directory of the project.
+ - Ensure that the project structure adheres to the specified layout.
+ - Run the script, and a Gradio interface will be launched for training models interactively.
+
+ Requirements:
+ - Python 3.7 or higher
+ - Required Python libraries specified in `requirements.txt`
+ - Properly structured project with `train_regression_model.py` and model modules.
+ """
+
+ import gradio as gr
+ import pandas as pd
+ import os
+ import subprocess
+ import sys
+ import glob
+ import re
+
+ # Add the project root directory to the Python path
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ project_root = os.path.abspath(os.path.join(current_dir, '../../'))
+ sys.path.append(project_root)
+
+ def get_model_modules():
+     # Get the list of available model modules
+     models_dir = os.path.join(project_root, 'models', 'supervised', 'regression')
+     model_files = glob.glob(os.path.join(models_dir, '*.py'))
+
+     # Debugging: print the models directory and found files
+     print(f"Looking for model files in: {models_dir}")
+     print(f"Found model files: {model_files}")
+
+     models = [os.path.splitext(os.path.basename(f))[0] for f in model_files if not f.endswith('__init__.py')]
+     model_modules = [f"{model}" for model in models]
+     return model_modules
+
+ def download_kaggle_data(json_path, competition_name):
+     # Import the get_kaggle_data function
+     from data.datasets.kaggle_data import get_kaggle_data
+
+     data_path = get_kaggle_data(json_path=json_path, data_name=competition_name, is_competition=True)
+     return data_path
+
+ def train_model(model_module, data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name,
+                 target_variable, drop_columns, test_size, random_state, log_transform, cv_folds,
+                 scoring_metric, model_save_path, results_save_path, visualize):
+
+     # Determine data_path
+     if data_option == 'Upload Data File':
+         if data_file is None:
+             return "Please upload a data file.", None
+         data_path = data_file  # data_file is the path to the uploaded file
+     elif data_option == 'Provide Data Path':
+         if not os.path.exists(data_path):
+             return "Provided data path does not exist.", None
+     elif data_option == 'Download from Kaggle':
+         if kaggle_json_file is None:
+             return "Please upload your kaggle.json file.", None
+         else:
+             # Save the kaggle.json file to ~/.kaggle/kaggle.json
+             import shutil
+             kaggle_config_dir = os.path.expanduser('~/.kaggle')
+             os.makedirs(kaggle_config_dir, exist_ok=True)
+             kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
+             # gr.File with type="filepath" passes a path string, not a file object
+             shutil.copy(kaggle_json_file, kaggle_json_path)
+             os.chmod(kaggle_json_path, 0o600)
+             data_dir = download_kaggle_data(json_path=kaggle_json_path, competition_name=competition_name)
+             if data_dir is None:
+                 return "Failed to download data from Kaggle.", None
+             # Use the specified data_name_kaggle
+             data_path = os.path.join(data_dir, data_name_kaggle)
+             if not os.path.exists(data_path):
+                 return f"{data_name_kaggle} not found in the downloaded Kaggle data.", None
+     else:
+         return "Invalid data option selected.", None
+
+     # Prepare command-line arguments
+     cmd = [sys.executable, os.path.join(project_root, 'scripts', 'train_regression_model.py')]
+     cmd.extend(['--model_module', model_module])
+     cmd.extend(['--data_path', data_path])
+     cmd.extend(['--target_variable', target_variable])
+
+     if drop_columns:
+         cmd.extend(['--drop_columns', ','.join(drop_columns)])
+     if test_size != 0.2:
+         cmd.extend(['--test_size', str(test_size)])
+     if random_state != 42:
+         cmd.extend(['--random_state', str(int(random_state))])
+     if log_transform:
+         cmd.append('--log_transform')
+     if cv_folds != 5:
+         cmd.extend(['--cv_folds', str(int(cv_folds))])
+     if scoring_metric:
+         cmd.extend(['--scoring_metric', scoring_metric])
+     if model_save_path:
+         cmd.extend(['--model_path', model_save_path])
+     if results_save_path:
+         cmd.extend(['--results_path', results_save_path])
+     if visualize:
+         cmd.append('--visualize')
+
+     # Debugging: print the command being executed
+     print(f"Executing command: {' '.join(cmd)}")
+
+     # Execute the command
+     try:
+         result = subprocess.run(cmd, capture_output=True, text=True)
+         output = result.stdout
+         errors = result.stderr
+         if result.returncode != 0:
+             return f"Error during training:\n{errors}", None
+         else:
+             # Strip the useless "Figure(600x400)" text that matplotlib prints
+             output = re.sub(r"Figure\(\d+x\d+\)", "", output).strip()
+             # Try to load the plot image
+             if results_save_path:
+                 plot_image_path = os.path.join(results_save_path, 'actual_vs_predicted.png')
+             else:
+                 # Default path if results_save_path is not provided; guard against a missing marker
+                 match = re.search(r'Visualization saved to (.+)', output)
+                 plot_image_path = match.group(1).strip() if match else ''
+             if plot_image_path and os.path.exists(plot_image_path):
+                 return f"Training completed successfully.\n\n{output}", plot_image_path
+             else:
+                 return f"Training completed successfully.\n\n{output}", None
+     except Exception as e:
+         return f"An error occurred:\n{str(e)}", None
+
+ def get_columns_from_data(data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name):
+     # Determine data_path
+     if data_option == 'Upload Data File':
+         if data_file is None:
+             return []
+         data_path = data_file
+     elif data_option == 'Provide Data Path':
+         if not os.path.exists(data_path):
+             return []
+     elif data_option == 'Download from Kaggle':
+         if kaggle_json_file is None:
+             return []
+         else:
+             # Save the kaggle.json file to ~/.kaggle/kaggle.json
+             import shutil
+             kaggle_config_dir = os.path.expanduser('~/.kaggle')
+             os.makedirs(kaggle_config_dir, exist_ok=True)
+             kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
+             # gr.File with type="filepath" passes a path string, not a file object
+             shutil.copy(kaggle_json_file, kaggle_json_path)
+             os.chmod(kaggle_json_path, 0o600)
+             data_dir = download_kaggle_data(json_path=kaggle_json_path, competition_name=competition_name)
+             if data_dir is None:
+                 return []
+             data_path = os.path.join(data_dir, data_name_kaggle)
+             if not os.path.exists(data_path):
+                 return []
+     else:
+         return []
+
+     try:
+         data = pd.read_csv(data_path)
+         columns = data.columns.tolist()
+         return columns
+     except Exception as e:
+         print(f"Error reading data file: {e}")
+         return []
+
+ # Define Gradio interface components
+
+ def update_columns(data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name):
+     columns = get_columns_from_data(data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name)
+     if not columns:
+         return gr.update(choices=[]), gr.update(choices=[])
+     else:
+         return gr.update(choices=columns), gr.update(choices=columns)
+
+ model_modules = get_model_modules()
+
+ if not model_modules:
+     print("No model modules found. Please check the 'models/supervised/regression' directory.")
+     # You can handle this case appropriately, e.g., show an error message in the interface or exit.
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Train a Regression Model")
+
+     with gr.Row():
+         model_module_input = gr.Dropdown(choices=model_modules, label="Select Model Module")
+         scoring_metric_input = gr.Textbox(value='neg_root_mean_squared_error', label="Scoring Metric")
+
+     with gr.Row():
+         test_size_input = gr.Slider(minimum=0.1, maximum=0.5, step=0.05, value=0.2, label="Test Size")
+         random_state_input = gr.Number(value=42, label="Random State")
+         cv_folds_input = gr.Number(value=5, label="CV Folds", precision=0)
+
+     log_transform_input = gr.Checkbox(label="Log Transform Target Variable", value=False)
+     visualize_input = gr.Checkbox(label="Generate Visualizations", value=True)
+
+     with gr.Row():
+         model_save_path_input = gr.Textbox(value='', label="Model Save Path (optional)")
+         results_save_path_input = gr.Textbox(value='', label="Results Save Path (optional)")
+
+     with gr.Tab("Data Input"):
+         data_option_input = gr.Radio(choices=['Upload Data File', 'Provide Data Path', 'Download from Kaggle'], label="Data Input Option", value='Upload Data File')
+
+         upload_data_col = gr.Column(visible=True)
+         with upload_data_col:
+             data_file_input = gr.File(label="Upload CSV Data File", type="filepath")
+
+         data_path_col = gr.Column(visible=False)
+         with data_path_col:
+             data_path_input = gr.Textbox(value='', label="Data File Path")
+
+         kaggle_data_col = gr.Column(visible=False)
+         with kaggle_data_col:
+             kaggle_json_file_input = gr.File(label="Upload kaggle.json File", type="filepath")
+             competition_name_input = gr.Textbox(value='house-prices-advanced-regression-techniques', label="Kaggle Competition Name")
+             data_name_kaggle_input = gr.Textbox(value='train.csv', label="Data File Name (in Kaggle dataset)")
+
+     def toggle_data_input(option):
+         if option == 'Upload Data File':
+             return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+         elif option == 'Provide Data Path':
+             return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
+         elif option == 'Download from Kaggle':
+             return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
+
+     data_option_input.change(
+         fn=toggle_data_input,
+         inputs=[data_option_input],
+         outputs=[upload_data_col, data_path_col, kaggle_data_col]
+     )
+
+     update_cols_btn = gr.Button("Update Columns")
+
+     target_variable_input = gr.Dropdown(choices=[], label="Select Target Variable")
+     drop_columns_input = gr.CheckboxGroup(choices=[], label="Columns to Drop")
+
+     update_cols_btn.click(
+         fn=update_columns,
+         inputs=[data_option_input, data_file_input, data_path_input, data_name_kaggle_input, kaggle_json_file_input, competition_name_input],
+         outputs=[target_variable_input, drop_columns_input]
+     )
+
+     train_btn = gr.Button("Train Model")
+     output_display = gr.Textbox(label="Output")
+     image_display = gr.Image(label="Visualization", visible=True)
+
+     def run_training(*args):
+         output_text, plot_image_path = train_model(*args)
+         if plot_image_path and os.path.exists(plot_image_path):
+             return output_text, plot_image_path
+         else:
+             return output_text, None
+
+     train_btn.click(
+         fn=run_training,
+         inputs=[
+             model_module_input, data_option_input, data_file_input, data_path_input,
+             data_name_kaggle_input, kaggle_json_file_input, competition_name_input,
+             target_variable_input, drop_columns_input, test_size_input, random_state_input, log_transform_input, cv_folds_input,
+             scoring_metric_input, model_save_path_input, results_save_path_input, visualize_input
+         ],
+         outputs=[output_display, image_display]
+     )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
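Note that `train_model` shells out to the training script via `subprocess` rather than importing it. For reference, a minimal sketch of the equivalent manual invocation the UI assembles (the dataset path, target, and column names below are illustrative, not taken from the commit):

```python
import subprocess
import sys

# Hypothetical example: the command app.py builds for a run with a local CSV,
# a 'SalePrice' target, log transform, and visualizations enabled.
cmd = [
    sys.executable, "scripts/train_regression_model.py",
    "--model_module", "linear_regression",
    "--data_path", "data/raw/train.csv",   # illustrative path
    "--target_variable", "SalePrice",      # illustrative target
    "--drop_columns", "Id",
    "--log_transform",
    "--visualize",
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout if result.returncode == 0 else result.stderr)
```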
data/README.md ADDED
@@ -0,0 +1 @@
+ # data
data/datasets/README.md ADDED
@@ -0,0 +1,21 @@
+ # Datasets Utilities
+
+ This folder contains utility scripts for handling datasets, including downloading data from Kaggle.
+
+ ## 📄 Scripts
+
+ ### `kaggle_data.py`
+
+ - **Description**: A Python script to download Kaggle datasets or competition data seamlessly, supporting Google Colab, local Linux/Mac, and Windows environments.
+ - **Path**: [`data/datasets/kaggle_data.py`](kaggle_data.py)
+ - **Key Function**: `get_kaggle_data(json_path, data_name, is_competition=False, output_dir='data/raw')`
+ - **Example**:
+
+   ```python
+   from kaggle_data import get_kaggle_data
+
+   # Download a standard Kaggle dataset
+   dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
+
+   # Download competition data
+   competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
+   ```
data/datasets/kaggle_data.py ADDED
@@ -0,0 +1,115 @@
+ """
+ This module provides a utility function to download Kaggle datasets or competition data.
+
+ The function automatically detects whether it is running in a Google Colab environment, a local Linux/Mac environment, or a Windows environment, and sets up the Kaggle API accordingly.
+
+ Requirements:
+     - Kaggle API installed (`pip install kaggle`)
+     - Kaggle API key (`kaggle.json`) with appropriate permissions.
+
+ Environment Detection:
+     - Google Colab: Uses `/root/.config/kaggle/kaggle.json`.
+     - Local Linux/Mac: Uses `~/.kaggle/kaggle.json`.
+     - Windows: Uses `C:\\Users\\<Username>\\.kaggle\\kaggle.json`.
+
+ Functions:
+     get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str
+ """
+
+ import os
+ import zipfile
+ import sys
+ import shutil
+ import platform
+
+ def get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str:
+     """
+     Downloads a Kaggle dataset or competition data using the Kaggle API in a Google Colab, local Linux/Mac, or Windows environment.
+
+     Parameters:
+         json_path (str): Path to your 'kaggle.json' file.
+         data_name (str): Kaggle dataset or competition name (e.g., 'paultimothymooney/chest-xray-pneumonia' or 'house-prices-advanced-regression-techniques').
+         is_competition (bool): Set to True if downloading competition data. Default is False (for datasets).
+         output_dir (str): Directory to save and extract the data. Default is 'data/raw'.
+
+     Returns:
+         str: Path to the extracted dataset folder.
+
+     Raises:
+         OSError: If 'kaggle.json' is not found or cannot be copied.
+         Exception: If there is an error during download or extraction.
+
+     Example of Usage:
+         # For downloading a standard dataset
+         dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
+         print(f"Dataset is available at: {dataset_path}")
+
+         # For downloading competition data
+         competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
+         print(f"Competition data is available at: {competition_path}")
+     """
+     # Detect environment (Colab, local Linux/Mac, or Windows)
+     is_colab = "google.colab" in sys.modules
+     is_windows = platform.system() == "Windows"
+
+     # Step 1: Set up Kaggle API credentials
+     try:
+         if is_colab:
+             config_dir = "/root/.config/kaggle"
+             os.makedirs(config_dir, exist_ok=True)
+             print("Setting up Kaggle API credentials for Colab environment.")
+             shutil.copy(json_path, os.path.join(config_dir, "kaggle.json"))
+             os.chmod(os.path.join(config_dir, "kaggle.json"), 0o600)
+         else:
+             # For both local Linux/Mac and Windows, use the home directory
+             config_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
+             os.makedirs(config_dir, exist_ok=True)
+             print("Setting up Kaggle API credentials for local environment.")
+             kaggle_json_dest = os.path.join(config_dir, "kaggle.json")
+             if not os.path.exists(kaggle_json_dest):
+                 shutil.copy(json_path, kaggle_json_dest)
+             if not is_windows:
+                 os.chmod(kaggle_json_dest, 0o600)
+     except Exception as e:
+         raise OSError(f"Could not set up Kaggle API credentials: {e}")
+
+     # Step 2: Create output directory
+     dataset_dir = os.path.join(output_dir, data_name.split('/')[-1])
+     os.makedirs(dataset_dir, exist_ok=True)
+     original_dir = os.getcwd()
+     os.chdir(dataset_dir)
+
+     # Step 3: Download the dataset or competition data
+     try:
+         if is_competition:
+             print(f"Downloading competition data: {data_name}")
+             cmd = f"kaggle competitions download -c {data_name}"
+         else:
+             print(f"Downloading dataset: {data_name}")
+             cmd = f"kaggle datasets download -d {data_name}"
+         os.system(cmd)
+     except Exception as e:
+         print(f"Error during download: {e}")
+         os.chdir(original_dir)
+         return None
+
+     # Step 4: Unzip all downloaded files
+     zip_files = [f for f in os.listdir() if f.endswith(".zip")]
+     if not zip_files:
+         print("No zip files found. Please check the dataset or competition name.")
+         os.chdir(original_dir)
+         return None
+
+     for zip_file in zip_files:
+         try:
+             with zipfile.ZipFile(zip_file, "r") as zip_ref:
+                 zip_ref.extractall()
+             print(f"Extracted: {zip_file}")
+             os.remove(zip_file)
+         except Exception as e:
+             print(f"Error extracting {zip_file}: {e}")
+
+     # Step 5: Navigate back to the original directory
+     os.chdir(original_dir)
+
+     return dataset_dir
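One caveat with the download step above: `os.system` returns an exit code rather than raising on failure, so the `except` branch in Step 3 never fires for Kaggle CLI errors; the missing-zip check afterwards is what actually catches failed downloads. A minimal sketch of a stricter variant using `subprocess` with the same CLI commands, shown here as an alternative rather than what the commit does:

```python
import subprocess

def run_kaggle_download(data_name: str, is_competition: bool = False) -> None:
    """Run the Kaggle CLI and raise CalledProcessError on a non-zero exit."""
    if is_competition:
        cmd = ["kaggle", "competitions", "download", "-c", data_name]
    else:
        cmd = ["kaggle", "datasets", "download", "-d", data_name]
    subprocess.run(cmd, check=True)  # raises subprocess.CalledProcessError on failure
```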
data/preprocessing/README.md ADDED
@@ -0,0 +1 @@
+ # preprocessing
data/raw/README.md ADDED
@@ -0,0 +1 @@
+ # raw
data/utils/README.md ADDED
@@ -0,0 +1 @@
+ # utils
models/README.md ADDED
@@ -0,0 +1 @@
+ # models
models/computer_vision/README.md ADDED
@@ -0,0 +1 @@
+ # computer_vision
models/deep_learning/README.md ADDED
@@ -0,0 +1 @@
+ # deep_learning
models/nlp/README.md ADDED
@@ -0,0 +1 @@
+ # nlp
models/reinforcement_learning/README.md ADDED
@@ -0,0 +1 @@
+ # reinforcement_learning
models/supervised/classification/README.md ADDED
@@ -0,0 +1 @@
+ # classification
models/supervised/regression/README.md ADDED
@@ -0,0 +1 @@
+ # regression
models/supervised/regression/adaboost_regressor.py ADDED
@@ -0,0 +1,30 @@
+
+ """
+ This module sets up an AdaBoost Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `AdaBoostRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for boosting parameters.
+ - Combines weak learners to form a strong predictor.
+
+ Special Considerations:
+ - Sensitive to outliers.
+ - Not sensitive to feature scaling.
+ - Base estimator is a Decision Tree by default.
+ """
+
+ from sklearn.ensemble import AdaBoostRegressor
+
+ # Define the estimator
+ estimator = AdaBoostRegressor(random_state=42)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_estimators': [50, 100],  # Focus on a narrower range of estimators
+     'model__learning_rate': [0.001, 0.01, 0.1, 1.0],  # Keep a good spread for learning rates
+     'model__loss': ['linear'],  # Focus on the most commonly used loss function
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
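Every regression module in this commit exposes the same three attributes: `estimator`, `param_grid`, and `default_scoring`. The `model__...` and `preprocessor__num__imputer__strategy` key names imply a pipeline whose steps are named `preprocessor` (a `ColumnTransformer` with a numeric sub-pipeline called `num` containing an `imputer` step) and `model`. That wiring lives in `utils/supervised_hyperparameter_tuning.py`, whose diff is not shown here, so the following is only a sketch under that assumed layout:

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from models.supervised.regression.adaboost_regressor import default_scoring, estimator, param_grid

# Assumed pipeline layout, inferred from the param_grid key names.
numeric_features = ["LotArea", "YearBuilt"]  # hypothetical numeric columns
preprocessor = ColumnTransformer(
    transformers=[("num", Pipeline([("imputer", SimpleImputer())]), numeric_features)]
)
pipeline = Pipeline([("preprocessor", preprocessor), ("model", estimator)])

search = GridSearchCV(pipeline, param_grid, cv=5, scoring=default_scoring)
# search.fit(X_train, y_train)  # X_train must contain the numeric_features columns
```

Modules whose grids also tune `preprocessor__num__scaler__*` (ElasticNet, KNN, Lasso, Linear, MLP, Ridge, SVR) additionally assume a `scaler` step (a `StandardScaler`) in that same `num` sub-pipeline.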
models/supervised/regression/catboost_regressor.py ADDED
@@ -0,0 +1,32 @@
+
+ """
+ This module sets up a CatBoost Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `CatBoostRegressor` estimator from CatBoost.
+ - Defines a hyperparameter grid for boosting parameters.
+ - Handles categorical features natively.
+
+ Special Considerations:
+ - Requires the `catboost` library (`pip install catboost`).
+ - Adjust the preprocessing pipeline to skip encoding categorical features.
+ - Not sensitive to feature scaling.
+ - Can be slower to train compared to other boosting algorithms.
+ """
+
+ from catboost import CatBoostRegressor
+
+ # Define the estimator
+ estimator = CatBoostRegressor(random_state=42, verbose=0)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__iterations': [500],  # Fixed to a reasonable value for faster tuning
+     'model__learning_rate': [0.05, 0.1],  # Common learning rates
+     'model__depth': [6, 8],  # Typical depths for balance between speed and accuracy
+     'model__l2_leaf_reg': [3],  # Most impactful regularization value
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/decision_tree_regressor.py ADDED
@@ -0,0 +1,31 @@
+
+ """
+ This module sets up a Decision Tree Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `DecisionTreeRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for tree-specific parameters.
+ - Handles non-linear relationships and interactions.
+
+ Special Considerations:
+ - Decision Trees are not affected by feature scaling.
+ - Can easily overfit; control tree depth and splitting criteria.
+ - No need for scaling transformers in the preprocessing pipeline.
+ """
+
+ from sklearn.tree import DecisionTreeRegressor
+
+ # Define the estimator
+ estimator = DecisionTreeRegressor(random_state=42)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__criterion': ['squared_error', 'absolute_error'],  # Only two key criteria
+     'model__max_depth': [5, 10, 20, None],  # Depth variations
+     'model__min_samples_split': [2, 10],  # Commonly used values
+     'model__min_samples_leaf': [1, 4],  # Few values for leaves
+     'preprocessor__num__imputer__strategy': ['mean'],  # Focused on a single strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/elasticnet_regression.py ADDED
@@ -0,0 +1,35 @@
+
+ """
+ This module sets up an ElasticNet Regression model with hyperparameter tuning.
+
+ Features:
+ - Uses `ElasticNet` estimator from scikit-learn.
+ - Combines L1 and L2 regularization.
+ - Increases `max_iter` to address convergence warnings.
+
+ Special Considerations:
+ - May produce convergence warnings if `max_iter` is insufficient.
+ - Adjust `l1_ratio` to balance between Lasso and Ridge penalties.
+ - Applying a log transformation (`log_transform`) to the target variable can be beneficial if it's skewed.
+ - Ensure `OneHotEncoder` outputs dense arrays.
+ """
+
+ from sklearn.linear_model import ElasticNet
+
+ # Define the estimator
+ estimator = ElasticNet()
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__alpha': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
+     'model__l1_ratio': [0.2, 0.5, 0.8],  # Balance between L1 (Lasso) and L2 (Ridge)
+     'model__max_iter': [5000],  # Sufficient to avoid convergence warnings
+     'model__fit_intercept': [True],  # Assume intercept is important
+     'model__selection': ['cyclic'],  # Focus on the default cyclic selection
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+     'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
+     'preprocessor__num__scaler__with_std': [True],  # StandardScaler
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/extra_trees_regressor.py ADDED
@@ -0,0 +1,33 @@
+
+ """
+ This module sets up an Extra Trees Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `ExtraTreesRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for ensemble parameters.
+ - Similar to Random Forest but uses random thresholds for splitting.
+
+ Special Considerations:
+ - Not sensitive to feature scaling.
+ - Can handle large datasets efficiently.
+ - Less prone to overfitting compared to single decision trees.
+ """
+
+ from sklearn.ensemble import ExtraTreesRegressor
+
+ # Define the estimator
+ estimator = ExtraTreesRegressor(random_state=42, n_jobs=-1)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_estimators': [100, 200],  # Common range for estimators
+     'model__criterion': ['squared_error'],  # Focus on the most widely used criterion
+     'model__max_depth': [None, 10, 20],  # Unrestricted depth and reasonable constraints
+     'model__min_samples_split': [2, 5],  # Commonly used values
+     'model__min_samples_leaf': [1, 2],  # Prevent overfitting with larger leaves
+     'model__max_features': ['sqrt', 'log2'],  # Reduce to most common feature sampling strategies
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/gradient_boosting_regressor.py ADDED
@@ -0,0 +1,33 @@
+
+ """
+ This module sets up a Gradient Boosting Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `GradientBoostingRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for boosting parameters.
+ - Builds sequential models to minimize errors.
+
+ Special Considerations:
+ - Sensitive to overfitting; tune `n_estimators` and `learning_rate`.
+ - Not sensitive to feature scaling.
+ - Longer training times compared to other models.
+ """
+
+ from sklearn.ensemble import GradientBoostingRegressor
+
+ # Define the estimator
+ estimator = GradientBoostingRegressor(random_state=42)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_estimators': [100, 200],  # Focused range of estimators
+     'model__learning_rate': [0.001, 0.01, 0.1, 1],  # Commonly used learning rates
+     'model__max_depth': [3, 5],  # Standard depth values
+     'model__subsample': [0.8],  # Single value to focus on speed
+     'model__min_samples_split': [2],  # Default value
+     'model__min_samples_leaf': [1],  # Default value
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/knn_regressor.py ADDED
@@ -0,0 +1,34 @@
+
+ """
+ This module sets up a K-Nearest Neighbors Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `KNeighborsRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for neighbor parameters.
+ - Non-parametric method useful for capturing local patterns.
+
+ Special Considerations:
+ - Feature scaling is crucial for KNN.
+ - Sensitive to the choice of `n_neighbors`.
+ - Training is fast, but prediction can be slow on large datasets.
+ """
+
+ from sklearn.neighbors import KNeighborsRegressor
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
+
+ # Define the estimator
+ estimator = KNeighborsRegressor(n_jobs=-1)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_neighbors': [3, 5, 7],  # Focus on common neighbor values
+     'model__weights': ['uniform', 'distance'],  # Standard options
+     'model__algorithm': ['auto', 'ball_tree'],  # Reduce algorithms to commonly used ones
+     'model__p': [1, 2],  # Manhattan and Euclidean distances
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+     'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
+     'preprocessor__num__scaler__with_std': [True],  # StandardScaler
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/lasso_regression.py ADDED
@@ -0,0 +1,33 @@
+
+ """
+ This module sets up a Lasso Regression model with hyperparameter tuning.
+
+ Features:
+ - Uses `Lasso` estimator from scikit-learn.
+ - Defines a hyperparameter grid for preprocessing and model-specific parameters.
+ - Increases `max_iter` to address convergence warnings.
+
+ Special Considerations:
+ - Lasso Regression may produce convergence warnings if `max_iter` is insufficient.
+ - Applying a log transformation (`log_transform`) to the target variable can be beneficial if it's skewed.
+ - Ensure `OneHotEncoder` outputs dense arrays to avoid compatibility issues.
+ """
+
+ from sklearn.linear_model import Lasso
+
+ # Define the estimator
+ estimator = Lasso()
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__alpha': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
+     'model__max_iter': [5000],  # Single value to ensure convergence
+     'model__fit_intercept': [True],  # Assume the intercept is important
+     'model__selection': ['cyclic'],  # Focus on the default cyclic selection
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+     'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
+     'preprocessor__num__scaler__with_std': [True],  # StandardScaler
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/lightgbm_regressor.py ADDED
@@ -0,0 +1,37 @@
+
+ """
+ This module sets up a LightGBM Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `LGBMRegressor` estimator from LightGBM.
+ - Defines a hyperparameter grid for boosting parameters.
+ - Optimized for speed and performance.
+
+ Special Considerations:
+ - Requires the `lightgbm` library (`pip install lightgbm`).
+ - Can handle categorical features if provided appropriately.
+ - Not sensitive to feature scaling.
+ """
+
+ from lightgbm import LGBMRegressor
+
+ # Define the estimator
+ estimator = LGBMRegressor(
+     random_state=42,
+     n_jobs=-1,
+     verbose=-1
+ )
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_estimators': [100, 200],
+     'model__learning_rate': [0.01, 0.05],
+     'model__num_leaves': [15, 31],
+     'model__max_depth': [10, 20],
+     'model__min_data_in_leaf': [20, 50],
+     'model__colsample_bytree': [0.8],
+     'preprocessor__num__imputer__strategy': ['mean'],
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/linear_regression.py ADDED
@@ -0,0 +1,29 @@
+
+ """
+ This module defines the setup for performing Linear Regression with hyperparameter tuning.
+
+ Features:
+ - Sets up a `LinearRegression` estimator from scikit-learn.
+ - Defines a hyperparameter grid for preprocessing and model-specific parameters.
+ - Specifies an optional default scoring metric for evaluating the model.
+
+ Special Considerations:
+ - Linear Regression doesn't typically require special handling.
+ - Applying a log transformation to the target variable (`log_transform`) can be beneficial if it's skewed.
+ """
+
+ from sklearn.linear_model import LinearRegression
+
+ # Define the estimator
+ estimator = LinearRegression()
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__fit_intercept': [True, False],
+     'preprocessor__num__imputer__strategy': ['mean', 'median'],
+     'preprocessor__num__scaler__with_mean': [True, False],
+     'preprocessor__num__scaler__with_std': [True, False],
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/mlp_regressor.py ADDED
@@ -0,0 +1,35 @@
+
+ """
+ This module sets up a Multilayer Perceptron Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `MLPRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for neural network parameters.
+ - Capable of capturing complex non-linear relationships.
+
+ Special Considerations:
+ - Feature scaling is crucial for MLP.
+ - May produce convergence warnings; increase `max_iter` to address this.
+ - Can be sensitive to hyperparameter settings; tuning is important.
+ """
+
+ from sklearn.neural_network import MLPRegressor
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
+
+ # Define the estimator
+ estimator = MLPRegressor(random_state=42, max_iter=1000)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Simplified layer sizes
+     'model__activation': ['relu'],  # Focused on ReLU, the most commonly effective activation
+     'model__solver': ['adam'],  # Retain 'adam' for efficiency; drop 'lbfgs' (slower for larger datasets)
+     'model__alpha': [0.0001, 0.001],  # Regularization strengths
+     'model__learning_rate': ['constant', 'adaptive'],  # Common learning rate strategies
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+     'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
+     'preprocessor__num__scaler__with_std': [True],  # StandardScaler
+ }
+
+ # Optional: Default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/random_forest_regressor.py ADDED
@@ -0,0 +1,32 @@
+
+ """
+ This module sets up a Random Forest Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `RandomForestRegressor` estimator from scikit-learn.
+ - Defines a hyperparameter grid for ensemble parameters.
+ - Handles non-linear relationships and reduces overfitting through averaging.
+
+ Special Considerations:
+ - Random Forests are robust to outliers and can handle non-linear data.
+ - Not sensitive to feature scaling.
+ - Set `n_jobs=-1` to utilize all available CPU cores.
+ """
+
+ from sklearn.ensemble import RandomForestRegressor
+
+ # Define the estimator
+ estimator = RandomForestRegressor(random_state=42, n_jobs=-1)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_estimators': [100, 200],  # Focus on a small range of estimators
+     'model__max_depth': [10, 20, None],  # Commonly used depth variations
+     'model__min_samples_split': [2, 5],  # Commonly used split values
+     'model__min_samples_leaf': [1, 2],  # Focused leaf size
+     'model__max_features': ['sqrt'],  # "sqrt" is often optimal for Random Forests
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/ridge_regression.py ADDED
@@ -0,0 +1,33 @@
+
+ """
+ This module sets up a Ridge Regression model with hyperparameter tuning.
+
+ Features:
+ - Uses `Ridge` estimator from scikit-learn.
+ - Defines a hyperparameter grid for preprocessing and model-specific parameters.
+ - Addresses potential convergence warnings by increasing `max_iter`.
+ - Considers solvers compatible with dense data after modifying `OneHotEncoder`.
+
+ Special Considerations:
+ - Ridge Regression may produce convergence warnings if `max_iter` is insufficient.
+ - Applying a log transformation (`log_transform`) to the target variable can be beneficial if it's skewed.
+ - Ensure `OneHotEncoder` outputs dense arrays to avoid solver compatibility issues.
+ """
+
+ from sklearn.linear_model import Ridge
+
+ # Define the estimator
+ estimator = Ridge()
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__alpha': [0.1, 1.0, 10.0],
+     'model__solver': ['auto', 'svd', 'cholesky'],
+     'model__max_iter': [1000, 5000],
+     'preprocessor__num__imputer__strategy': ['mean', 'median'],
+     'preprocessor__num__scaler__with_mean': [True, False],
+     'preprocessor__num__scaler__with_std': [True, False],
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/support_vector_regressor.py ADDED
@@ -0,0 +1,34 @@
+
+ """
+ This module sets up a Support Vector Regressor (SVR) with hyperparameter tuning.
+
+ Features:
+ - Uses `SVR` estimator from scikit-learn.
+ - Defines a hyperparameter grid for kernel parameters.
+ - Effective in high-dimensional spaces.
+
+ Special Considerations:
+ - Feature scaling is crucial for SVR.
+ - Training time can be significant for large datasets.
+ - Applying a log transformation (`log_transform`) can be beneficial if the target variable is skewed.
+ """
+
+ from sklearn.svm import SVR
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
+
+ # Define the estimator
+ estimator = SVR()
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__kernel': ['rbf'],  # Stick to the most effective kernel
+     'model__C': [0.1, 1.0, 10.0],  # Focus on a narrower range
+     'model__epsilon': [0.1, 0.2, 0.5],  # Retain small deviations
+     'model__gamma': ['scale', 0.1],  # Simplify gamma
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+     'preprocessor__num__scaler__with_mean': [True],
+     'preprocessor__num__scaler__with_std': [True],
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/supervised/regression/xgboost_regressor.py ADDED
@@ -0,0 +1,35 @@
+
+ """
+ This module sets up an XGBoost Regressor with hyperparameter tuning.
+
+ Features:
+ - Uses `XGBRegressor` estimator from XGBoost.
+ - Defines a hyperparameter grid for boosting parameters.
+ - Efficient and scalable implementation of gradient boosting.
+
+ Special Considerations:
+ - Requires the `xgboost` library (`pip install xgboost`).
+ - Handles missing values internally.
+ - Not sensitive to feature scaling.
+ - For GPU acceleration on XGBoost >= 2.0 (the version pinned in `requirements.txt`), set `device='cuda'`; the older `tree_method='gpu_hist'` setting is deprecated.
+ """
+
+ from xgboost import XGBRegressor
+
+ # Define the estimator
+ estimator = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
+
+ # Define the hyperparameter grid
+ param_grid = {
+     'model__n_estimators': [100, 200],  # Common range for estimators
+     'model__learning_rate': [0.05, 0.1],  # Common learning rates
+     'model__max_depth': [3, 5],  # Typical depth for gradient boosting
+     'model__subsample': [0.8],  # Fixed subsample value to reduce complexity
+     'model__colsample_bytree': [0.8],  # Fixed colsample value to reduce complexity
+     'model__reg_alpha': [0, 0.1],  # Focus on smaller values for L1 regularization
+     'model__reg_lambda': [1],  # Default L2 regularization
+     'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
+ }
+
+ # Optional: Define the default scoring metric
+ default_scoring = 'neg_root_mean_squared_error'
models/unsupervised/README.md ADDED
@@ -0,0 +1 @@
+ # unsupervised
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ pandas==2.2.2
+ numpy==1.26.4
+ matplotlib==3.8.0
+ seaborn==0.13.2
+ kaggle==1.6.17
+ scikit-learn==1.5.2
+ catboost==1.2.7
+ dask[dataframe]==2024.10.0
+ xgboost==2.1.2
+ lightgbm==4.5.0
+ joblib==1.4.2
+ gradio==5.7.1
scripts/README.md ADDED
@@ -0,0 +1,52 @@
+ # Scripts
+
+ This directory contains executable scripts for training, testing, and other tasks related to model development and evaluation.
+
+ ## Contents
+
+ - [`train_regression_model.py`](#train_regression_modelpy)
+
+ ### `train_regression_model.py`
+
+ A script for training supervised learning regression models using scikit-learn. It handles data loading, preprocessing, optional log transformation, hyperparameter tuning, model evaluation, and saving of models, metrics, and visualizations.
+
+ #### Features
+
+ - Supports various regression models defined in the `models/supervised/regression` directory.
+ - Performs hyperparameter tuning using grid search cross-validation.
+ - Saves trained models and evaluation metrics.
+ - Generates visualizations if specified.
+
+ #### Usage
+
+ ```bash
+ python train_regression_model.py --model_module MODEL_MODULE \
+     --data_path DATA_PATH/DATA_NAME.csv \
+     --target_variable TARGET_VARIABLE [OPTIONS]
+ ```
+
+ - **Required Arguments:**
+   - `model_module`: Name of the model module to import (e.g., `linear_regression`).
+   - `data_path`: Path to the dataset file (directory plus file name).
+   - `target_variable`: Name of the target variable.
+
+ - **Optional Arguments:**
+   - `test_size`: Proportion of the dataset to include in the test split (default: 0.2).
+   - `random_state`: Random seed for reproducibility (default: 42).
+   - `log_transform`: Apply log transformation to the target variable (regression only).
+   - `cv_folds`: Number of cross-validation folds (default: 5).
+   - `scoring_metric`: Scoring metric for model evaluation.
+   - `model_path`: Path to save the trained model.
+   - `results_path`: Path to save results and metrics.
+   - `visualize`: Generate and save visualizations.
+   - `drop_columns`: Comma-separated column names to drop from the dataset.
+
+ #### Usage Example
+
+ ```bash
+ python train_regression_model.py --model_module linear_regression \
+     --data_path data/house_prices/train.csv \
+     --target_variable SalePrice --drop_columns Id \
+     --log_transform --visualize
+ ```
scripts/train_classification_model.py ADDED
@@ -0,0 +1,185 @@
+ """
+ This script trains classification models using scikit-learn.
+ It includes data loading, preprocessing, encoding of the target variable,
+ hyperparameter tuning, model evaluation, and saving of models, metrics,
+ and visualizations.
+
+ Usage:
+     python train_classification_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
+         --target_variable TARGET_VARIABLE
+
+ Optional arguments:
+     --test_size TEST_SIZE
+     --random_state RANDOM_STATE
+     --cv_folds CV_FOLDS
+     --scoring_metric SCORING_METRIC
+     --model_path MODEL_PATH
+     --results_path RESULTS_PATH
+     --visualize
+     --drop_columns COLUMN_NAMES
+
+ Example:
+     python train_classification_model.py --model_module logistic_regression
+         --data_path data/titanic/train.csv
+         --target_variable Survived --drop_columns PassengerId
+         --visualize
+ """
+
+ import os
+ import sys
+ import argparse
+ import importlib
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
+                              confusion_matrix, ConfusionMatrixDisplay)
+ import joblib
+
+ def main(args):
+     # Change to the root directory of the project
+     project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+     os.chdir(project_root)
+     sys.path.insert(0, project_root)
+
+     # Import the hyperparameter tuning and the model modules
+     from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
+     model_module_path = f"models.supervised.classification.{args.model_module}"
+     model_module = importlib.import_module(model_module_path)
+
+     # Get the model estimator, parameter grid, and the scoring metric
+     estimator = model_module.estimator
+     param_grid = model_module.param_grid
+     scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'accuracy')
+     model_name = estimator.__class__.__name__
+
+     # Set default paths if not provided
+     args.model_path = args.model_path or os.path.join('saved_models', model_name)
+     args.results_path = args.results_path or os.path.join('results', model_name)
+     os.makedirs(args.results_path, exist_ok=True)
+
+     # Load the dataset
+     df = pd.read_csv(os.path.join(args.data_path))
+
+     # Drop specified columns
+     if args.drop_columns:
+         columns_to_drop = args.drop_columns.split(',')
+         df = df.drop(columns=columns_to_drop)
+
+     # Define target variable and features
+     target_variable = args.target_variable
+     X = df.drop(columns=[target_variable])
+     y = df[target_variable]
+
+     # Ensure the target variable is categorical
+     if np.issubdtype(y.dtype, np.number) and len(np.unique(y)) > 20:
+         raise ValueError(f"The target variable '{target_variable}' seems to be continuous. Please ensure it's categorical for classification tasks.")
+
+     # Encode target variable if not numeric
+     if y.dtype == 'object' or not np.issubdtype(y.dtype, np.number):
+         from sklearn.preprocessing import LabelEncoder
+         le = LabelEncoder()
+         y = le.fit_transform(y)
+         # Ensure the model directory exists before saving the encoder
+         os.makedirs(args.model_path, exist_ok=True)
+         # Save label encoder for inverse transformation
+         joblib.dump(le, os.path.join(args.model_path, 'label_encoder.pkl'))
+
+     # Split the data
+     X_train, X_test, y_train, y_test = train_test_split(
+         X, y, test_size=args.test_size, random_state=args.random_state, stratify=y)
+
+     # Perform hyperparameter tuning
+     best_model, best_params = classification_hyperparameter_tuning(
+         X_train, y_train, estimator, param_grid,
+         cv=args.cv_folds, scoring=scoring_metric)
+
+     # Evaluate the best model on the test set
+     y_pred = best_model.predict(X_test)
+     y_test_actual = y_test
+
+     # Save the trained model
+     model_output_path = os.path.join(args.model_path, 'best_model.pkl')
+     os.makedirs(args.model_path, exist_ok=True)
+     joblib.dump(best_model, model_output_path)
+     print(f"Trained model saved to {model_output_path}")
+
+     # Calculate metrics
+     accuracy = accuracy_score(y_test_actual, y_pred)
+     precision = precision_score(y_test_actual, y_pred, average='weighted', zero_division=0)
+     recall = recall_score(y_test_actual, y_pred, average='weighted', zero_division=0)
+     f1 = f1_score(y_test_actual, y_pred, average='weighted', zero_division=0)
+     print(f"\n{model_name} Classification Metrics on Test Set:")
+     print(f"- Accuracy: {accuracy:.4f}")
+     print(f"- Precision: {precision:.4f}")
+     print(f"- Recall: {recall:.4f}")
+     print(f"- F1 Score: {f1:.4f}")
+     # Save metrics
+     metrics = {'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1 Score': [f1]}
+
+     # Save metrics to CSV
+     results_df = pd.DataFrame(metrics)
+     results_df.to_csv(os.path.join(args.results_path, 'metrics.csv'), index=False)
+     print(f"\nMetrics saved to {os.path.join(args.results_path, 'metrics.csv')}")
+
+     if args.visualize:
+         # Plot classification metrics
+         plt.figure(figsize=(8, 6))
+         # Extract metric names and values
+         metric_names = list(metrics.keys())
+         metric_values = [value[0] for value in metrics.values()]  # Extract the single value from each list
+
+         # Create bar chart
+         plt.bar(metric_names, metric_values, color='skyblue', alpha=0.8)
+         plt.ylim(0, 1)  # Metrics like accuracy, precision, etc., are between 0 and 1
+         plt.xlabel('Metrics')
+         plt.ylabel('Scores')
+         plt.title('Classification Metrics')
+
+         # Save and display the plot
+         plt.savefig(os.path.join(args.results_path, 'classification_metrics.png'))
+         plt.show()
+         print(f"Visualization saved to {os.path.join(args.results_path, 'classification_metrics.png')}")
+
+         # Display and save the confusion matrix
+         conf_matrix = confusion_matrix(y_test_actual, y_pred)
+         disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
+         disp.plot(cmap=plt.cm.Blues, values_format='d')  # Format as integers for counts
+         plt.title(f'{model_name} Confusion Matrix')
+
+         # Save the confusion matrix plot
+         conf_matrix_path = os.path.join(args.results_path, 'confusion_matrix.png')
+         plt.savefig(conf_matrix_path)
+         plt.show()
+         print(f"Confusion matrix saved to {conf_matrix_path}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Train a classification model.")
+     # Model module argument
+     parser.add_argument('--model_module', type=str, required=True,
+                         help='Name of the classification model module to import.')
+     # Data arguments
+     parser.add_argument('--data_path', type=str, required=True,
+                         help='Path to the dataset file, including the data name.')
+     parser.add_argument('--target_variable', type=str, required=True,
+                         help='Name of the target variable.')
+     parser.add_argument('--drop_columns', type=str, default='',
+                         help='Comma-separated columns to drop from the dataset.')
+     # Model arguments
+     parser.add_argument('--test_size', type=float, default=0.2,
+                         help='Proportion for the test split.')
+     parser.add_argument('--random_state', type=int, default=42,
+                         help='Random seed.')
+     parser.add_argument('--cv_folds', type=int, default=5,
+                         help='Number of cross-validation folds.')
+     parser.add_argument('--scoring_metric', type=str, default=None,
+                         help='Scoring metric for model evaluation.')
+     # Output arguments
+     parser.add_argument('--model_path', type=str, default=None,
+                         help='Path to save the trained model.')
+     parser.add_argument('--results_path', type=str, default=None,
+                         help='Path to save results and metrics.')
+     parser.add_argument('--visualize', action='store_true',
+                         help='Generate and save visualizations.')
+
+     args = parser.parse_args()
+     main(args)
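The artifacts this script saves can be reloaded for inference afterwards. A minimal sketch, where the `saved_models/...` paths follow the default pattern above (`saved_models/<EstimatorClassName>`) and the input CSV is illustrative:

```python
import os
import joblib
import pandas as pd

model_dir = "saved_models/LogisticRegression"  # hypothetical, matches the default path pattern
model = joblib.load(os.path.join(model_dir, "best_model.pkl"))

X_new = pd.read_csv("data/titanic/test.csv")  # illustrative; columns must match the training features
y_pred = model.predict(X_new)

# label_encoder.pkl is only written when the target needed encoding (non-numeric labels)
encoder_path = os.path.join(model_dir, "label_encoder.pkl")
if os.path.exists(encoder_path):
    y_pred = joblib.load(encoder_path).inverse_transform(y_pred)
```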
scripts/train_regression_model.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
This script trains regression models using scikit-learn.
It includes data loading, preprocessing, optional log transformation,
hyperparameter tuning, model evaluation, and saving of models, metrics,
and visualizations.

Usage:
    python train_regression_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
                                     --target_variable TARGET_VARIABLE

Optional arguments:
    --test_size TEST_SIZE
    --random_state RANDOM_STATE
    --log_transform
    --cv_folds CV_FOLDS
    --scoring_metric SCORING_METRIC
    --model_path MODEL_PATH
    --results_path RESULTS_PATH
    --visualize
    --drop_columns COLUMN_NAMES

Example:
    python train_regression_model.py --model_module linear_regression
                                     --data_path data/house_prices/train.csv
                                     --target_variable SalePrice --drop_columns Id
                                     --log_transform --visualize
"""

import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score, mean_absolute_error
import joblib
from timeit import default_timer as timer

def main(args):
    # Change to the root directory of the project
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning and the model modules
    from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
    model_module_path = f"models.supervised.regression.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Get the model estimator, parameter grid, and scoring metric
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'neg_root_mean_squared_error')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns
    if args.drop_columns:
        columns_to_drop = args.drop_columns.split(',')
        df = df.drop(columns=columns_to_drop)

    # Define target variable and features
    target_variable = args.target_variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Ensure the target variable is numeric
    if not np.issubdtype(y.dtype, np.number):
        raise ValueError(f"The target variable '{target_variable}' must be numeric for regression tasks.")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Visualize the target variable distribution
    if args.visualize:
        plt.figure(figsize=(6, 4))
        sns.histplot(y_train, kde=True)
        plt.title(f'{target_variable} Distribution Before Transformation')
        plt.savefig(os.path.join(args.results_path, 'target_distribution_before.png'))
        plt.show()

    # Optional: Apply log transformation
    if args.log_transform:
        y_train_transformed = np.log1p(y_train)
        y_test_transformed = np.log1p(y_test)
        if args.visualize:
            plt.figure(figsize=(6, 4))
            sns.histplot(y_train_transformed, kde=True, color='green')
            plt.title(f'{target_variable} Distribution After Log Transform')
            plt.savefig(os.path.join(args.results_path, 'target_distribution_after.png'))
            plt.show()
    else:
        y_train_transformed = y_train
        y_test_transformed = y_test

    # Start the timer
    start_time = timer()

    # Perform hyperparameter tuning
    best_model, best_params = regression_hyperparameter_tuning(
        X_train, y_train_transformed, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)

    # End the timer and compute the training time
    end_time = timer()
    train_time = end_time - start_time

    # Evaluate the best model on the test set
    y_pred_transformed = best_model.predict(X_test)

    # Reverse the transformation if applied
    if args.log_transform:
        y_pred = np.expm1(y_pred_transformed)
        y_test_actual = np.expm1(y_test_transformed)
    else:
        y_pred = y_pred_transformed
        y_test_actual = y_test_transformed

    # Save the trained model
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    os.makedirs(args.model_path, exist_ok=True)
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Calculate metrics
    rmse = root_mean_squared_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    mae = mean_absolute_error(y_test_actual, y_pred)
    mse = mean_squared_error(y_test_actual, y_pred)
    print(f"\n{model_name} Regression Metrics on Test Set:")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- R² Score: {r2:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- MSE: {mse:.4f}")
    print(f"- Training time: {train_time:.4f} seconds")

    # Collect metrics
    metrics = {'RMSE': [rmse], 'R2': [r2], 'MAE': [mae], 'MSE': [mse], 'train_time': [train_time]}

    # Save metrics to CSV
    results_df = pd.DataFrame(metrics)
    results_df.to_csv(os.path.join(args.results_path, 'metrics.csv'), index=False)
    print(f"\nMetrics saved to {os.path.join(args.results_path, 'metrics.csv')}")

    if args.visualize:
        # Plot Actual vs. Predicted
        plt.figure(figsize=(8, 6))
        plt.scatter(y_test_actual, y_pred, alpha=0.6, color='blue')
        plt.plot([y_test_actual.min(), y_test_actual.max()],
                 [y_test_actual.min(), y_test_actual.max()], 'r--')
        plt.xlabel(f'Actual {target_variable}')
        plt.ylabel(f'Predicted {target_variable}')
        plt.title(f'Actual vs. Predicted {target_variable}')
        plt.savefig(os.path.join(args.results_path, 'actual_vs_predicted.png'))
        plt.show()
        print(f"Visualization saved to {os.path.join(args.results_path, 'actual_vs_predicted.png')}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a regression model.")
    # Model module argument
    parser.add_argument('--model_module', type=str, required=True,
                        help='Name of the regression model module to import.')
    # Data arguments
    parser.add_argument('--data_path', type=str, required=True,
                        help='Path to the dataset file including data name.')
    parser.add_argument('--target_variable', type=str, required=True,
                        help='Name of the target variable.')
    parser.add_argument('--drop_columns', type=str, default='',
                        help='Columns to drop from the dataset.')
    # Model arguments
    parser.add_argument('--test_size', type=float, default=0.2,
                        help='Proportion for test split.')
    parser.add_argument('--random_state', type=int, default=42,
                        help='Random seed.')
    parser.add_argument('--log_transform', action='store_true',
                        help='Apply log transformation to the target variable.')
    parser.add_argument('--cv_folds', type=int, default=5,
                        help='Number of cross-validation folds.')
    parser.add_argument('--scoring_metric', type=str, default=None,
                        help='Scoring metric for model evaluation.')
    # Output arguments
    parser.add_argument('--model_path', type=str, default=None,
                        help='Path to save the trained model.')
    parser.add_argument('--results_path', type=str, default=None,
                        help='Path to save results and metrics.')
    parser.add_argument('--visualize', action='store_true',
                        help='Generate and save visualizations.')

    args = parser.parse_args()
    main(args)
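A quick note on the `--log_transform` path above: the script fits on `np.log1p(y)` and maps predictions back with `np.expm1`, which are exact inverses (up to floating-point error) and remain well-defined at `y == 0`. A minimal self-contained check:

```python
import numpy as np

y = np.array([0.0, 9.0, 99.0, 199999.0])  # e.g., sale prices
y_log = np.log1p(y)                       # log(1 + y), safe at y == 0
y_back = np.expm1(y_log)                  # exp(y_log) - 1 undoes the transform

assert np.allclose(y, y_back)
print(y_log)  # approximately [ 0.      2.3026  4.6052 12.2061]
```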
utils/README.md ADDED
@@ -0,0 +1,45 @@
# Utils

This directory contains utility scripts and helper functions that are used throughout the project. These scripts provide common functionalities such as data preprocessing, hyperparameter tuning, and other support functions that assist in model training and evaluation.

## Contents

- [`supervised_hyperparameter_tuning.py`](#supervised_hyperparameter_tuningpy)

### `supervised_hyperparameter_tuning.py`

This script contains functions for performing hyperparameter tuning on supervised learning models using scikit-learn's `Pipeline` and `GridSearchCV`.

#### Functions

- **`regression_hyperparameter_tuning(X_train, y_train, estimator, param_grid, cv=5, scoring=None)`**

  Performs hyperparameter tuning using grid search cross-validation.

  - **Parameters:**
    - `X_train`: Training features.
    - `y_train`: Training target variable.
    - `estimator`: A scikit-learn estimator (e.g., `LinearRegression()`).
    - `param_grid`: Dictionary with parameter names (`str`) as keys and lists of parameter settings to try as values.
    - `cv`: Number of cross-validation folds. Default is 5.
    - `scoring`: Scoring metric to use. Default depends on the estimator.

  - **Returns:**
    - `best_model`: The estimator with the best found parameters.
    - `best_params`: Dictionary of the best parameters.

#### Usage Example

```python
from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
from sklearn.linear_model import LinearRegression

# Define the estimator and the parameter grid
estimator = LinearRegression()
param_grid = {
    'model__fit_intercept': [True, False],
    # Add other parameters
}

# Perform hyperparameter tuning
best_model, best_params = regression_hyperparameter_tuning(X_train, y_train, estimator, param_grid)
```
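Because `regression_hyperparameter_tuning` wraps the estimator in a `Pipeline` together with imputation, scaling, and encoding, the returned `best_model` can be applied directly to raw feature frames, and the keys in `param_grid` must carry the `model__` prefix of the pipeline step that holds the estimator. Continuing the example above (assuming an `X_test` with the same columns as `X_train`):

```python
# best_model bundles preprocessing with the estimator, so X_test needs no
# manual imputation, scaling, or encoding before prediction.
y_pred = best_model.predict(X_test)

# best_params keys keep the 'model__' prefix, e.g. {'model__fit_intercept': True}.
print(best_params)
```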
utils/supervised_hyperparameter_tuning.py ADDED
@@ -0,0 +1,115 @@
"""
This module provides a function for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV, specifically for regression models.

Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn regressor.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary.

Functions:
- regression_hyperparameter_tuning: Performs hyperparameter tuning on a given dataset and estimator.

Example Usage:
    from sklearn.ensemble import RandomForestRegressor
    from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning

    X = ...  # Your feature DataFrame
    y = ...  # Your target variable
    param_grid = {
        'model__n_estimators': [100, 200, 500],
        'model__max_depth': [None, 10, 20]
    }
    best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
"""

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold

def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given regression model using GridSearchCV with preprocessing.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        estimator: The scikit-learn regressor to use (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid for GridSearchCV.
        cv (int or cross-validation generator): Number of cross-validation folds or a cross-validation generator.
        scoring (str or None): Scoring metric to use.

    Returns:
        best_model (Pipeline): Best model within a pipeline from GridSearch.
        best_params (dict): Best hyperparameters.
    """
    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Define preprocessing for numerical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Conditional preprocessing for categorical data
    estimator_name = estimator.__class__.__name__

    if estimator_name in [
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'
    ]:
        # Use ordinal encoding for tree-based models
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Use one-hot encoding for other models
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Create the preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Create a pipeline that combines preprocessing and the estimator
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Define the cross-validation strategy
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )

    # Perform the grid search
    grid_search.fit(X, y)

    # Get the best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params
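Since `cv` accepts either an integer or a cross-validation generator, callers are not tied to the default shuffled 5-fold `KFold`. A minimal sketch on synthetic data (the column names and grid values here are illustrative only):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning

# Tiny synthetic dataset with one numeric and one categorical feature.
rng = np.random.default_rng(0)
X = pd.DataFrame({
    'sqft': rng.normal(1500, 300, 200),
    'neighborhood': rng.choice(['A', 'B', 'C'], 200),
})
y = pd.Series(X['sqft'] * 100 + rng.normal(0, 5000, 200))

# Pass an explicit splitter instead of an integer fold count.
custom_cv = KFold(n_splits=10, shuffle=True, random_state=0)
param_grid = {'model__alpha': [0.1, 1.0, 10.0]}

best_model, best_params = regression_hyperparameter_tuning(
    X, y, Ridge(), param_grid, cv=custom_cv,
    scoring='neg_root_mean_squared_error')
```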