Commit 829e3ac · 1 parent: 3fee6e0
Add application file

Files changed:
- app.py +296 -0
- data/README.md +1 -0
- data/datasets/README.md +21 -0
- data/datasets/kaggle_data.py +115 -0
- data/preprocessing/README.md +1 -0
- data/raw/README.md +1 -0
- data/utils/README.md +1 -0
- models/README.md +1 -0
- models/computer_vision/README.md +1 -0
- models/deep_learning/README.md +1 -0
- models/nlp/README.md +1 -0
- models/reinforcement_learning/README.md +1 -0
- models/supervised/classification/README.md +1 -0
- models/supervised/regression/README.md +1 -0
- models/supervised/regression/adaboost_regressor.py +30 -0
- models/supervised/regression/catboost_regressor.py +32 -0
- models/supervised/regression/decision_tree_regressor.py +31 -0
- models/supervised/regression/elasticnet_regression.py +35 -0
- models/supervised/regression/extra_trees_regressor.py +33 -0
- models/supervised/regression/gradient_boosting_regressor.py +33 -0
- models/supervised/regression/knn_regressor.py +34 -0
- models/supervised/regression/lasso_regression.py +33 -0
- models/supervised/regression/lightgbm_regressor.py +37 -0
- models/supervised/regression/linear_regression.py +29 -0
- models/supervised/regression/mlp_regressor.py +35 -0
- models/supervised/regression/random_forest_regressor.py +32 -0
- models/supervised/regression/ridge_regression.py +33 -0
- models/supervised/regression/support_vector_regressor.py +34 -0
- models/supervised/regression/xgboost_regressor.py +35 -0
- models/unsupervised/README.md +1 -0
- requirements.txt +12 -0
- scripts/README.md +52 -0
- scripts/train_classification_model.py +185 -0
- scripts/train_regression_model.py +200 -0
- utils/README.md +45 -0
- utils/supervised_hyperparameter_tuning.py +115 -0
app.py
ADDED
@@ -0,0 +1,296 @@
```python
"""
Gradio Interface for Training Regression Models

This script provides a Gradio-based user interface to train regression models using various datasets
and algorithms. It enables seamless interaction by allowing users to select models, preprocess data,
and specify hyperparameters through an intuitive UI.

Features:
- **Model Selection**: Choose from a list of available regression algorithms located in `models/supervised/regression`.
- **Dataset Input Options**:
  - Upload a local CSV file.
  - Specify a path to a dataset.
  - Download datasets from Kaggle using the `kaggle.json` API credentials.
- **Hyperparameter Customization**: Modify model parameters like test size, random state, cross-validation folds,
  and more directly in the interface.
- **Visualizations**: Generate plots like actual vs. predicted graphs after training.
- **Live Feedback**: Outputs training metrics, best hyperparameters, and paths to saved models.

Structure:
1. **Helper Functions**:
   - `get_model_modules`: Dynamically fetches available regression models.
   - `download_kaggle_data`: Handles Kaggle dataset downloads.
   - `train_model`: Constructs and executes the command for training models.
   - `get_columns_from_data`: Extracts column names from the dataset for UI selection.

2. **Gradio UI Components**:
   - Allow users to toggle between different dataset input methods.
   - Update column dropdowns dynamically based on the dataset.
   - Execute the training script and display results and visualizations.

Usage:
- Place this script in the `interfaces/gradio/` directory of the project.
- Ensure that the project structure adheres to the specified layout.
- Run the script, and a Gradio interface will be launched for training models interactively.

Requirements:
- Python 3.7 or higher
- Required Python libraries specified in `requirements.txt`
- Properly structured project with `train_regression_model.py` and model modules.
"""

import gradio as gr
import pandas as pd
import os
import subprocess
import sys
import glob
import re

# Add the project root directory to the Python path
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(current_dir, '../../'))
sys.path.append(project_root)


def get_model_modules():
    # Get the list of available model modules
    models_dir = os.path.join(project_root, 'models', 'supervised', 'regression')
    model_files = glob.glob(os.path.join(models_dir, '*.py'))

    # Debugging: Print the models directory and found files
    print(f"Looking for model files in: {models_dir}")
    print(f"Found model files: {model_files}")

    models = [os.path.splitext(os.path.basename(f))[0] for f in model_files if not f.endswith('__init__.py')]
    model_modules = [f"{model}" for model in models]
    return model_modules


def download_kaggle_data(json_path, competition_name):
    # Import the get_kaggle_data function
    from data.datasets.kaggle_data import get_kaggle_data

    data_path = get_kaggle_data(json_path=json_path, data_name=competition_name, is_competition=True)
    return data_path


def train_model(model_module, data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name,
                target_variable, drop_columns, test_size, random_state, log_transform, cv_folds,
                scoring_metric, model_save_path, results_save_path, visualize):

    # Determine data_path
    if data_option == 'Upload Data File':
        if data_file is None:
            return "Please upload a data file.", None
        data_path = data_file  # data_file is the path to the uploaded file
    elif data_option == 'Provide Data Path':
        if not os.path.exists(data_path):
            return "Provided data path does not exist.", None
    elif data_option == 'Download from Kaggle':
        if kaggle_json_file is None:
            return "Please upload your kaggle.json file.", None
        else:
            # Save the kaggle.json file to ~/.kaggle/kaggle.json
            import shutil
            kaggle_config_dir = os.path.expanduser('~/.kaggle')
            os.makedirs(kaggle_config_dir, exist_ok=True)
            kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
            # gr.File(type="filepath") passes a plain path string in recent Gradio
            # versions; older versions passed a tempfile object with a .name
            # attribute, so handle both.
            src = kaggle_json_file.name if hasattr(kaggle_json_file, 'name') else kaggle_json_file
            shutil.copy(src, kaggle_json_path)
            os.chmod(kaggle_json_path, 0o600)
            data_dir = download_kaggle_data(json_path=kaggle_json_path, competition_name=competition_name)
            if data_dir is None:
                return "Failed to download data from Kaggle.", None
            # Use the specified data_name_kaggle
            data_path = os.path.join(data_dir, data_name_kaggle)
            if not os.path.exists(data_path):
                return f"{data_name_kaggle} not found in the downloaded Kaggle data.", None
    else:
        return "Invalid data option selected.", None

    # Prepare command-line arguments
    cmd = [sys.executable, os.path.join(project_root, 'scripts', 'train_regression_model.py')]
    cmd.extend(['--model_module', model_module])
    cmd.extend(['--data_path', data_path])
    cmd.extend(['--target_variable', target_variable])

    if drop_columns:
        cmd.extend(['--drop_columns', ','.join(drop_columns)])
    if test_size != 0.2:
        cmd.extend(['--test_size', str(test_size)])
    if random_state != 42:
        cmd.extend(['--random_state', str(int(random_state))])
    if log_transform:
        cmd.append('--log_transform')
    if cv_folds != 5:
        cmd.extend(['--cv_folds', str(int(cv_folds))])
    if scoring_metric:
        cmd.extend(['--scoring_metric', scoring_metric])
    if model_save_path:
        cmd.extend(['--model_path', model_save_path])
    if results_save_path:
        cmd.extend(['--results_path', results_save_path])
    if visualize:
        cmd.append('--visualize')

    # Debugging: Print the command being executed
    print(f"Executing command: {' '.join(cmd)}")

    # Execute the command
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        output = result.stdout
        errors = result.stderr
        if result.returncode != 0:
            return f"Error during training:\n{errors}", None
        else:
            # Delete useless "Figure(600x400)" text
            output = re.sub(r"Figure\(\d+x\d+\)", "", output).strip()
            # Try to load the plot image
            if results_save_path:
                plot_image_path = os.path.join(results_save_path, 'actual_vs_predicted.png')
            else:
                # Default path if results_save_path is not provided; guard the
                # split so a run without a visualization doesn't raise.
                if 'Visualization saved to ' in output:
                    plot_image_path = output.split('Visualization saved to ')[1].strip()
                else:
                    plot_image_path = ''
            if plot_image_path and os.path.exists(plot_image_path):
                return f"Training completed successfully.\n\n{output}", plot_image_path
            else:
                return f"Training completed successfully.\n\n{output}", None
    except Exception as e:
        return f"An error occurred:\n{str(e)}", None


def get_columns_from_data(data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name):
    # Determine data_path
    if data_option == 'Upload Data File':
        if data_file is None:
            return []
        data_path = data_file
    elif data_option == 'Provide Data Path':
        if not os.path.exists(data_path):
            return []
    elif data_option == 'Download from Kaggle':
        if kaggle_json_file is None:
            return []
        else:
            # Save the kaggle.json file to ~/.kaggle/kaggle.json
            import shutil
            kaggle_config_dir = os.path.expanduser('~/.kaggle')
            os.makedirs(kaggle_config_dir, exist_ok=True)
            kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
            # Handle both str paths and tempfile objects (see note in train_model)
            src = kaggle_json_file.name if hasattr(kaggle_json_file, 'name') else kaggle_json_file
            shutil.copy(src, kaggle_json_path)
            os.chmod(kaggle_json_path, 0o600)
            data_dir = download_kaggle_data(json_path=kaggle_json_path, competition_name=competition_name)
            if data_dir is None:
                return []
            data_path = os.path.join(data_dir, data_name_kaggle)
            if not os.path.exists(data_path):
                return []
    else:
        return []

    try:
        data = pd.read_csv(data_path)
        columns = data.columns.tolist()
        return columns
    except Exception as e:
        print(f"Error reading data file: {e}")
        return []


# Define Gradio interface components

def update_columns(data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name):
    columns = get_columns_from_data(data_option, data_file, data_path, data_name_kaggle, kaggle_json_file, competition_name)
    if not columns:
        return gr.update(choices=[]), gr.update(choices=[])
    else:
        return gr.update(choices=columns), gr.update(choices=columns)


model_modules = get_model_modules()

if not model_modules:
    print("No model modules found. Please check the 'models/supervised/regression' directory.")
    # You can handle this case appropriately, e.g., show an error message in the interface or exit.

with gr.Blocks() as demo:
    gr.Markdown("# Train a Regression Model")

    with gr.Row():
        model_module_input = gr.Dropdown(choices=model_modules, label="Select Model Module")
        scoring_metric_input = gr.Textbox(value='neg_root_mean_squared_error', label="Scoring Metric")

    with gr.Row():
        test_size_input = gr.Slider(minimum=0.1, maximum=0.5, step=0.05, value=0.2, label="Test Size")
        random_state_input = gr.Number(value=42, label="Random State")
        cv_folds_input = gr.Number(value=5, label="CV Folds", precision=0)

    log_transform_input = gr.Checkbox(label="Log Transform Target Variable", value=False)
    visualize_input = gr.Checkbox(label="Generate Visualizations", value=True)

    with gr.Row():
        model_save_path_input = gr.Textbox(value='', label="Model Save Path (optional)")
        results_save_path_input = gr.Textbox(value='', label="Results Save Path (optional)")

    with gr.Tab("Data Input"):
        data_option_input = gr.Radio(choices=['Upload Data File', 'Provide Data Path', 'Download from Kaggle'], label="Data Input Option", value='Upload Data File')

        upload_data_col = gr.Column(visible=True)
        with upload_data_col:
            data_file_input = gr.File(label="Upload CSV Data File", type="filepath")

        data_path_col = gr.Column(visible=False)
        with data_path_col:
            data_path_input = gr.Textbox(value='', label="Data File Path")

        kaggle_data_col = gr.Column(visible=False)
        with kaggle_data_col:
            kaggle_json_file_input = gr.File(label="Upload kaggle.json File", type="filepath")
            competition_name_input = gr.Textbox(value='house-prices-advanced-regression-techniques', label="Kaggle Competition Name")
            data_name_kaggle_input = gr.Textbox(value='train.csv', label="Data File Name (in Kaggle dataset)")

        def toggle_data_input(option):
            if option == 'Upload Data File':
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif option == 'Provide Data Path':
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
            elif option == 'Download from Kaggle':
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)

        data_option_input.change(
            fn=toggle_data_input,
            inputs=[data_option_input],
            outputs=[upload_data_col, data_path_col, kaggle_data_col]
        )

    update_cols_btn = gr.Button("Update Columns")

    target_variable_input = gr.Dropdown(choices=[], label="Select Target Variable")
    drop_columns_input = gr.CheckboxGroup(choices=[], label="Columns to Drop")

    update_cols_btn.click(
        fn=update_columns,
        inputs=[data_option_input, data_file_input, data_path_input, data_name_kaggle_input, kaggle_json_file_input, competition_name_input],
        outputs=[target_variable_input, drop_columns_input]
    )

    train_btn = gr.Button("Train Model")
    output_display = gr.Textbox(label="Output")
    image_display = gr.Image(label="Visualization", visible=True)

    def run_training(*args):
        output_text, plot_image_path = train_model(*args)
        if plot_image_path and os.path.exists(plot_image_path):
            return output_text, plot_image_path
        else:
            return output_text, None

    train_btn.click(
        fn=run_training,
        inputs=[
            model_module_input, data_option_input, data_file_input, data_path_input,
            data_name_kaggle_input, kaggle_json_file_input, competition_name_input,
            target_variable_input, drop_columns_input, test_size_input, random_state_input, log_transform_input, cv_folds_input,
            scoring_metric_input, model_save_path_input, results_save_path_input, visualize_input
        ],
        outputs=[output_display, image_display]
    )

if __name__ == "__main__":
    demo.launch(share=True)
```
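For reference, `train_model` above shells out to the training script rather than importing it; with typical UI selections the assembled command looks roughly like the following sketch (all argument values are hypothetical placeholders, not output from a real session):

```python
import sys

# Illustrative only: the argv list train_model() passes to subprocess.run(),
# with made-up example values for the dataset, target, and dropped columns.
cmd = [
    sys.executable, "scripts/train_regression_model.py",
    "--model_module", "linear_regression",
    "--data_path", "data/raw/train.csv",
    "--target_variable", "SalePrice",
    "--drop_columns", "Id",
    "--visualize",
]
```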
data/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# data
```
data/datasets/README.md
ADDED
@@ -0,0 +1,21 @@
````markdown
# Datasets Utilities

This folder contains utility scripts for handling datasets, including downloading data from Kaggle.

## 📄 Scripts

### `kaggle_data.py`

- **Description**: A Python script to download Kaggle datasets or competition data seamlessly, supporting Google Colab, local Linux/Mac, and Windows environments.
- **Path**: [`data/datasets/kaggle_data.py`](kaggle_data.py)
- **Key Function**: `get_kaggle_data(json_path, data_name, is_competition=False, output_dir='data/raw')`
- **Example**:

```python
from kaggle_data import get_kaggle_data

# Download a standard Kaggle dataset
dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")

# Download competition data
competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
```
````
data/datasets/kaggle_data.py
ADDED
@@ -0,0 +1,115 @@
```python
"""
This module provides a utility function to download Kaggle datasets or competition data.

The function automatically detects whether it is running in a Google Colab environment, a local Linux/Mac environment, or a Windows environment, and sets up the Kaggle API accordingly.

Requirements:
- Kaggle API installed (`pip install kaggle`)
- Kaggle API key (`kaggle.json`) with appropriate permissions.

Environment Detection:
- Google Colab: Uses `/root/.config/kaggle/kaggle.json`.
- Local Linux/Mac: Uses `~/.kaggle/kaggle.json`.
- Windows: Uses `C:\\Users\\<Username>\\.kaggle\\kaggle.json`.

Functions:
    get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str
"""

import os
import zipfile
import sys
import shutil
import platform


def get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str:
    """
    Downloads a Kaggle dataset or competition data using the Kaggle API in Google Colab, local Linux/Mac, or Windows environment.

    Parameters:
        json_path (str): Path to your 'kaggle.json' file.
        data_name (str): Kaggle dataset or competition name (e.g., 'paultimothymooney/chest-xray-pneumonia' or 'house-prices-advanced-regression-techniques').
        is_competition (bool): Set to True if downloading competition data. Default is False (for datasets).
        output_dir (str): Directory to save and extract the data. Default is 'data/raw'.

    Returns:
        str: Path to the extracted dataset folder.

    Raises:
        OSError: If 'kaggle.json' is not found or cannot be copied.
        Exception: If there is an error during download or extraction.

    Example of Usage:
        # For downloading a standard dataset
        dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
        print(f"Dataset is available at: {dataset_path}")

        # For downloading competition data
        competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
        print(f"Competition data is available at: {competition_path}")
    """
    # Detect environment (Colab, local Linux/Mac, or Windows)
    is_colab = "google.colab" in sys.modules
    is_windows = platform.system() == "Windows"

    # Step 1: Setup Kaggle API credentials
    try:
        if is_colab:
            config_dir = "/root/.config/kaggle"
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for Colab environment.")
            shutil.copy(json_path, os.path.join(config_dir, "kaggle.json"))
            os.chmod(os.path.join(config_dir, "kaggle.json"), 0o600)
        else:
            # For both local Linux/Mac and Windows, use the home directory
            config_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for local environment.")
            kaggle_json_dest = os.path.join(config_dir, "kaggle.json")
            if not os.path.exists(kaggle_json_dest):
                shutil.copy(json_path, kaggle_json_dest)
            if not is_windows:
                os.chmod(kaggle_json_dest, 0o600)
    except Exception as e:
        raise OSError(f"Could not set up Kaggle API credentials: {e}")

    # Step 2: Create output directory
    dataset_dir = os.path.join(output_dir, data_name.split('/')[-1])
    os.makedirs(dataset_dir, exist_ok=True)
    original_dir = os.getcwd()
    os.chdir(dataset_dir)

    # Step 3: Download the dataset or competition data
    try:
        if is_competition:
            print(f"Downloading competition data: {data_name}")
            cmd = f"kaggle competitions download -c {data_name}"
        else:
            print(f"Downloading dataset: {data_name}")
            cmd = f"kaggle datasets download -d {data_name}"
        os.system(cmd)
    except Exception as e:
        print(f"Error during download: {e}")
        os.chdir(original_dir)
        return None

    # Step 4: Unzip all downloaded files
    zip_files = [f for f in os.listdir() if f.endswith(".zip")]
    if not zip_files:
        print("No zip files found. Please check the dataset or competition name.")
        os.chdir(original_dir)
        return None

    for zip_file in zip_files:
        try:
            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                zip_ref.extractall()
            print(f"Extracted: {zip_file}")
            os.remove(zip_file)
        except Exception as e:
            print(f"Error extracting {zip_file}: {e}")

    # Step 5: Navigate back to the original directory
    os.chdir(original_dir)

    return dataset_dir
```
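One design note on the function above: it changes the process working directory for the download and extraction steps and has to remember to restore it on every exit path. A `try`/`finally` helper would centralize that guarantee; a minimal sketch under that assumption (the name `run_in_dir` is hypothetical, not part of this commit):

```python
import os

def run_in_dir(directory, work):
    """Run work() with `directory` as the working directory, then restore the old one."""
    original_dir = os.getcwd()
    os.chdir(directory)
    try:
        return work()
    finally:
        # Runs on every exit path, including exceptions raised by work().
        os.chdir(original_dir)

# Usage sketch: run_in_dir(dataset_dir, lambda: os.system(cmd))
```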
data/preprocessing/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# preprocessing
```
data/raw/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# raw
```
data/utils/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# utils
```
models/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# models
```
models/computer_vision/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# computer_vision
```
models/deep_learning/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# deep_learning
```
models/nlp/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# nlp
```
models/reinforcement_learning/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# reinforcement_learning
```
models/supervised/classification/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# classification
```
models/supervised/regression/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# regression
```
models/supervised/regression/adaboost_regressor.py
ADDED
@@ -0,0 +1,30 @@
```python
"""
This module sets up an AdaBoost Regressor with hyperparameter tuning.

Features:
- Uses `AdaBoostRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for boosting parameters.
- Combines weak learners to form a strong predictor.

Special Considerations:
- Sensitive to outliers.
- Not sensitive to feature scaling.
- Base estimator is a Decision Tree by default.
"""

from sklearn.ensemble import AdaBoostRegressor

# Define the estimator
estimator = AdaBoostRegressor(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'model__n_estimators': [50, 100],  # Focus on a narrower range of estimators
    'model__learning_rate': [0.001, 0.01, 0.1, 1.0],  # Keep a good spread for learning rates
    'model__loss': ['linear'],  # Focus on the most commonly used loss function
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
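All of the regression modules added in this commit share the same contract: each exposes `estimator`, `param_grid`, and `default_scoring`, and the `model__`/`preprocessor__num__...` prefixes in the grids imply a scikit-learn `Pipeline` wrapped around a `ColumnTransformer`. The actual wiring lives in `utils/supervised_hyperparameter_tuning.py` and `scripts/train_regression_model.py` (their bodies are not shown in this diff), so the following is only a sketch inferred from the parameter names:

```python
# Minimal sketch of the pipeline structure implied by the grid keys above;
# step names ('preprocessor', 'num', 'imputer', 'scaler', 'model') are
# inferred from the prefixes, not taken from the project's actual code.
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numeric_columns = ["feature_a", "feature_b"]  # hypothetical feature names

preprocessor = ColumnTransformer(transformers=[
    ("num", Pipeline([
        ("imputer", SimpleImputer()),   # tuned via preprocessor__num__imputer__strategy
        ("scaler", StandardScaler()),   # tuned by the scale-sensitive modules' grids
    ]), numeric_columns),
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", AdaBoostRegressor(random_state=42)),  # tuned via the model__* keys
])

param_grid = {
    "model__n_estimators": [50, 100],
    "model__learning_rate": [0.001, 0.01, 0.1, 1.0],
    "model__loss": ["linear"],
    "preprocessor__num__imputer__strategy": ["mean"],
}

search = GridSearchCV(pipeline, param_grid, cv=5,
                      scoring="neg_root_mean_squared_error")
# search.fit(X_train, y_train) would then evaluate every grid combination.
```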
models/supervised/regression/catboost_regressor.py
ADDED
@@ -0,0 +1,32 @@
```python
"""
This module sets up a CatBoost Regressor with hyperparameter tuning.

Features:
- Uses `CatBoostRegressor` estimator from CatBoost.
- Defines a hyperparameter grid for boosting parameters.
- Handles categorical features natively.

Special Considerations:
- Requires the `catboost` library (`pip install catboost`).
- Adjust the preprocessing pipeline to skip encoding categorical features.
- Not sensitive to feature scaling.
- Can be slower to train compared to other boosting algorithms.
"""

from catboost import CatBoostRegressor

# Define the estimator
estimator = CatBoostRegressor(random_state=42, verbose=0)

# Define the hyperparameter grid
param_grid = {
    'model__iterations': [500],  # Fixed to a reasonable value for faster tuning
    'model__learning_rate': [0.05, 0.1],  # Common learning rates
    'model__depth': [6, 8],  # Typical depths for balance between speed and accuracy
    'model__l2_leaf_reg': [3],  # Most impactful regularization value
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/decision_tree_regressor.py
ADDED
@@ -0,0 +1,31 @@
```python
"""
This module sets up a Decision Tree Regressor with hyperparameter tuning.

Features:
- Uses `DecisionTreeRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for tree-specific parameters.
- Handles non-linear relationships and interactions.

Special Considerations:
- Decision Trees are not affected by feature scaling.
- Can easily overfit; control tree depth and splitting criteria.
- No need for scaling transformers in the preprocessing pipeline.
"""

from sklearn.tree import DecisionTreeRegressor

# Define the estimator
estimator = DecisionTreeRegressor(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'model__criterion': ['squared_error', 'absolute_error'],  # Only two key criteria
    'model__max_depth': [5, 10, 20, None],  # Depth variations
    'model__min_samples_split': [2, 10],  # Commonly used values
    'model__min_samples_leaf': [1, 4],  # Few values for leaves
    'preprocessor__num__imputer__strategy': ['mean'],  # Focused on a single strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/elasticnet_regression.py
ADDED
@@ -0,0 +1,35 @@
```python
"""
This module sets up an ElasticNet Regression model with hyperparameter tuning.

Features:
- Uses `ElasticNet` estimator from scikit-learn.
- Combines L1 and L2 regularization.
- Increases `max_iter` to address convergence warnings.

Special Considerations:
- May produce convergence warnings if `max_iter` is insufficient.
- Adjust `l1_ratio` to balance between Lasso and Ridge penalties.
- Applying a log transformation (`log_transform`) to the target variable can be beneficial if it's skewed.
- Ensure `OneHotEncoder` outputs dense arrays.
"""

from sklearn.linear_model import ElasticNet

# Define the estimator
estimator = ElasticNet()

# Define the hyperparameter grid
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
    'model__l1_ratio': [0.2, 0.5, 0.8],  # Balance between L1 (Lasso) and L2 (Ridge)
    'model__max_iter': [5000],  # Sufficient to avoid convergence warnings
    'model__fit_intercept': [True],  # Assume intercept is important
    'model__selection': ['cyclic'],  # Focus on the default cyclic selection
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
    'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
    'preprocessor__num__scaler__with_std': [True],  # StandardScaler
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/extra_trees_regressor.py
ADDED
@@ -0,0 +1,33 @@
```python
"""
This module sets up an Extra Trees Regressor with hyperparameter tuning.

Features:
- Uses `ExtraTreesRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for ensemble parameters.
- Similar to Random Forest but uses random thresholds for splitting.

Special Considerations:
- Not sensitive to feature scaling.
- Can handle large datasets efficiently.
- Less prone to overfitting compared to single decision trees.
"""

from sklearn.ensemble import ExtraTreesRegressor

# Define the estimator
estimator = ExtraTreesRegressor(random_state=42, n_jobs=-1)

# Define the hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200],  # Common range for estimators
    'model__criterion': ['squared_error'],  # Focus on the most widely used criterion
    'model__max_depth': [None, 10, 20],  # Unrestricted depth and reasonable constraints
    'model__min_samples_split': [2, 5],  # Commonly used values
    'model__min_samples_leaf': [1, 2],  # Prevent overfitting with larger leaves
    'model__max_features': ['sqrt', 'log2'],  # Reduce to most common feature sampling strategies
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/gradient_boosting_regressor.py
ADDED
@@ -0,0 +1,33 @@
```python
"""
This module sets up a Gradient Boosting Regressor with hyperparameter tuning.

Features:
- Uses `GradientBoostingRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for boosting parameters.
- Builds sequential models to minimize errors.

Special Considerations:
- Sensitive to overfitting; tune `n_estimators` and `learning_rate`.
- Not sensitive to feature scaling.
- Longer training times compared to other models.
"""

from sklearn.ensemble import GradientBoostingRegressor

# Define the estimator
estimator = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200],  # Focused range of estimators
    'model__learning_rate': [0.001, 0.01, 0.1, 1],  # Commonly used learning rates
    'model__max_depth': [3, 5],  # Standard depth values
    'model__subsample': [0.8],  # Single value to focus on speed
    'model__min_samples_split': [2],  # Default value
    'model__min_samples_leaf': [1],  # Default value
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/knn_regressor.py
ADDED
@@ -0,0 +1,34 @@
```python
"""
This module sets up a K-Nearest Neighbors Regressor with hyperparameter tuning.

Features:
- Uses `KNeighborsRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for neighbor parameters.
- Non-parametric method useful for capturing local patterns.

Special Considerations:
- Feature scaling is crucial for KNN.
- Sensitive to the choice of `n_neighbors`.
- Training is fast, but prediction can be slow on large datasets.
"""

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Define the estimator
estimator = KNeighborsRegressor(n_jobs=-1)

# Define the hyperparameter grid
param_grid = {
    'model__n_neighbors': [3, 5, 7],  # Focus on common neighbor values
    'model__weights': ['uniform', 'distance'],  # Standard options
    'model__algorithm': ['auto', 'ball_tree'],  # Reduce algorithms to commonly used ones
    'model__p': [1, 2],  # Manhattan and Euclidean distances
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
    'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
    'preprocessor__num__scaler__with_std': [True],  # StandardScaler
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/lasso_regression.py
ADDED
@@ -0,0 +1,33 @@
```python
"""
This module sets up a Lasso Regression model with hyperparameter tuning.

Features:
- Uses `Lasso` estimator from scikit-learn.
- Defines a hyperparameter grid for preprocessing and model-specific parameters.
- Increases `max_iter` to address convergence warnings.

Special Considerations:
- Lasso Regression may produce convergence warnings if `max_iter` is insufficient.
- Applying a log transformation (`log_transform`) to the target variable can be beneficial if it's skewed.
- Ensure `OneHotEncoder` outputs dense arrays to avoid compatibility issues.
"""

from sklearn.linear_model import Lasso

# Define the estimator
estimator = Lasso()

# Define the hyperparameter grid
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
    'model__max_iter': [5000],  # Single value to ensure convergence
    'model__fit_intercept': [True],  # Assume the intercept is important
    'model__selection': ['cyclic'],  # Focus on the default cyclic selection
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
    'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
    'preprocessor__num__scaler__with_std': [True],  # StandardScaler
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/lightgbm_regressor.py
ADDED
@@ -0,0 +1,37 @@
```python
"""
This module sets up a LightGBM Regressor with hyperparameter tuning.

Features:
- Uses `LGBMRegressor` estimator from LightGBM.
- Defines a hyperparameter grid for boosting parameters.
- Optimized for speed and performance.

Special Considerations:
- Requires the `lightgbm` library (`pip install lightgbm`).
- Can handle categorical features if provided appropriately.
- Not sensitive to feature scaling.
"""

from lightgbm import LGBMRegressor

# Define the estimator
estimator = LGBMRegressor(
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Define hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.05],
    'model__num_leaves': [15, 31],
    'model__max_depth': [10, 20],
    'model__min_data_in_leaf': [20, 50],
    'model__colsample_bytree': [0.8],
    'preprocessor__num__imputer__strategy': ['mean'],
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/linear_regression.py
ADDED
@@ -0,0 +1,29 @@
```python
"""
This module defines the setup for performing Linear Regression with hyperparameter tuning.

Features:
- Sets up a `LinearRegression` estimator from scikit-learn.
- Defines a hyperparameter grid for preprocessing and model-specific parameters.
- Specifies an optional default scoring metric for evaluating the model.

Special Considerations:
- Linear Regression doesn't typically require special handling.
- Applying a log transformation to the target variable (`log_transform`) can be beneficial if it's skewed.
"""

from sklearn.linear_model import LinearRegression

# Define the estimator
estimator = LinearRegression()

# Define the hyperparameter grid
param_grid = {
    'model__fit_intercept': [True, False],
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__num__scaler__with_mean': [True, False],
    'preprocessor__num__scaler__with_std': [True, False],
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/mlp_regressor.py
ADDED
@@ -0,0 +1,35 @@
```python
"""
This module sets up a Multilayer Perceptron Regressor with hyperparameter tuning.

Features:
- Uses `MLPRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for neural network parameters.
- Capable of capturing complex non-linear relationships.

Special Considerations:
- Feature scaling is crucial for MLP.
- May produce convergence warnings; increase `max_iter` to address this.
- Can be sensitive to hyperparameter settings; tuning is important.
"""

from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Define the estimator
estimator = MLPRegressor(random_state=42, max_iter=1000)

# Define the hyperparameter grid
param_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],  # Simplified layer sizes
    'model__activation': ['relu'],  # Focused on ReLU, the most commonly effective activation
    'model__solver': ['adam'],  # Retain 'adam' for efficiency; drop 'lbfgs' (slower for larger datasets)
    'model__alpha': [0.0001, 0.001],  # Regularization strengths
    'model__learning_rate': ['constant', 'adaptive'],  # Common learning rate strategies
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
    'preprocessor__num__scaler__with_mean': [True],  # StandardScaler
    'preprocessor__num__scaler__with_std': [True],  # StandardScaler
}

# Optional: Default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/random_forest_regressor.py
ADDED
@@ -0,0 +1,32 @@
```python
"""
This module sets up a Random Forest Regressor with hyperparameter tuning.

Features:
- Uses `RandomForestRegressor` estimator from scikit-learn.
- Defines a hyperparameter grid for ensemble parameters.
- Handles non-linear relationships and reduces overfitting through averaging.

Special Considerations:
- Random Forests are robust to outliers and can handle non-linear data.
- Not sensitive to feature scaling.
- Set `n_jobs=-1` to utilize all available CPU cores.
"""

from sklearn.ensemble import RandomForestRegressor

# Define the estimator
estimator = RandomForestRegressor(random_state=42, n_jobs=-1)

# Define the hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200],  # Focus on a small range of estimators
    'model__max_depth': [10, 20, None],  # Commonly used depth variations
    'model__min_samples_split': [2, 5],  # Commonly used split values
    'model__min_samples_leaf': [1, 2],  # Focused leaf size
    'model__max_features': ['sqrt'],  # "sqrt" is often optimal for Random Forests
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/ridge_regression.py
ADDED
@@ -0,0 +1,33 @@
```python
"""
This module sets up a Ridge Regression model with hyperparameter tuning.

Features:
- Uses `Ridge` estimator from scikit-learn.
- Defines a hyperparameter grid for preprocessing and model-specific parameters.
- Addresses potential convergence warnings by increasing `max_iter`.
- Considers solvers compatible with dense data after modifying `OneHotEncoder`.

Special Considerations:
- Ridge Regression may produce convergence warnings if `max_iter` is insufficient.
- Applying a log transformation (`log_transform`) to the target variable can be beneficial if it's skewed.
- Ensure `OneHotEncoder` outputs dense arrays to avoid solver compatibility issues.
"""

from sklearn.linear_model import Ridge

# Define the estimator
estimator = Ridge()

# Define the hyperparameter grid
param_grid = {
    'model__alpha': [0.1, 1.0, 10.0],
    'model__solver': ['auto', 'svd', 'cholesky'],
    'model__max_iter': [1000, 5000],
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__num__scaler__with_mean': [True, False],
    'preprocessor__num__scaler__with_std': [True, False],
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/support_vector_regressor.py
ADDED
@@ -0,0 +1,34 @@
```python
"""
This module sets up a Support Vector Regressor (SVR) with hyperparameter tuning.

Features:
- Uses `SVR` estimator from scikit-learn.
- Defines a hyperparameter grid for kernel parameters.
- Effective in high-dimensional spaces.

Special Considerations:
- Feature scaling is crucial for SVR.
- Training time can be significant for large datasets.
- Applying a log transformation (`log_transform`) can be beneficial if the target variable is skewed.
"""

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Define the estimator
estimator = SVR()

# Define the hyperparameter grid
param_grid = {
    'model__kernel': ['rbf'],  # Stick to the most effective kernel
    'model__C': [0.1, 1.0, 10.0],  # Focus on a narrower range
    'model__epsilon': [0.1, 0.2, 0.5],  # Retain small deviations
    'model__gamma': ['scale', 0.1],  # Simplify gamma
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
    'preprocessor__num__scaler__with_mean': [True],
    'preprocessor__num__scaler__with_std': [True],
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/supervised/regression/xgboost_regressor.py
ADDED
@@ -0,0 +1,35 @@
```python
"""
This module sets up an XGBoost Regressor with hyperparameter tuning.

Features:
- Uses `XGBRegressor` estimator from XGBoost.
- Defines a hyperparameter grid for boosting parameters.
- Efficient and scalable implementation of gradient boosting.

Special Considerations:
- Requires the `xgboost` library (`pip install xgboost`).
- Handles missing values internally.
- Not sensitive to feature scaling.
- May require setting `tree_method` to 'gpu_hist' for GPU acceleration if available.
"""

from xgboost import XGBRegressor

# Define the estimator
estimator = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)

# Define the hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200],  # Common range for estimators
    'model__learning_rate': [0.05, 0.1],  # Common learning rates
    'model__max_depth': [3, 5],  # Typical depth for gradient boosting
    'model__subsample': [0.8],  # Fixed subsample value to reduce complexity
    'model__colsample_bytree': [0.8],  # Fixed colsample value to reduce complexity
    'model__reg_alpha': [0, 0.1],  # Focus on smaller values for L1 regularization
    'model__reg_lambda': [1],  # Default L2 regularization
    'preprocessor__num__imputer__strategy': ['mean'],  # Single imputation strategy
}

# Optional: Define the default scoring metric
default_scoring = 'neg_root_mean_squared_error'
```
models/unsupervised/README.md
ADDED
@@ -0,0 +1 @@
```markdown
# unsupervised
```
requirements.txt
ADDED
@@ -0,0 +1,12 @@
```text
pandas==2.2.2
numpy==1.26.4
matplotlib==3.8.0
seaborn==0.13.2
kaggle==1.6.17
scikit-learn==1.5.2
catboost==1.2.7
dask[dataframe]==2024.10.0
xgboost==2.1.2
lightgbm==4.5.0
joblib==1.4.2
gradio==5.7.1
```
scripts/README.md
ADDED
@@ -0,0 +1,52 @@
````markdown
# Scripts

This directory contains executable scripts for training, testing, and other tasks related to model development and evaluation.

## Contents

- [`train_regression_model.py`](#train_regression_modelpy)

### `train_regression_model.py`

A script for training supervised learning regression models using scikit-learn. It handles data loading, preprocessing, optional log transformation, hyperparameter tuning, model evaluation, and saving of models, metrics, and visualizations.

#### Features

- Supports various regression models defined in the `models/supervised/regression` directory.
- Performs hyperparameter tuning using grid search cross-validation.
- Saves trained models and evaluation metrics.
- Generates visualizations if specified.

#### Usage

```bash
python train_regression_model.py --model_module MODEL_MODULE \
    --data_path DATA_PATH/DATA_NAME.csv \
    --target_variable TARGET_VARIABLE [OPTIONS]
```

- **Required Arguments:**
  - `model_module`: Name of the model module to import (e.g., `linear_regression`).
  - `data_path`: Path to the dataset directory, including the data file name.
  - `target_variable`: Name of the target variable.

- **Optional Arguments:**
  - `test_size`: Proportion of the dataset to include in the test split (default: 0.2).
  - `random_state`: Random seed for reproducibility (default: 42).
  - `log_transform`: Apply log transformation to the target variable (regression only).
  - `cv_folds`: Number of cross-validation folds (default: 5).
  - `scoring_metric`: Scoring metric for model evaluation.
  - `model_path`: Path to save the trained model.
  - `results_path`: Path to save results and metrics.
  - `visualize`: Generate and save visualizations.
  - `drop_columns`: Comma-separated column names to drop from the dataset.

#### Usage Example

```bash
python train_regression_model.py --model_module linear_regression \
    --data_path data/house_prices/train.csv \
    --target_variable SalePrice --drop_columns Id \
    --log_transform --visualize
```
````
scripts/train_classification_model.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
This script trains classification models using scikit-learn.
It includes data loading, preprocessing, encoding of the target variable,
hyperparameter tuning, model evaluation, and saving of models, metrics,
and visualizations.

Usage:
    python train_classification_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
                                         --target_variable TARGET_VARIABLE

Optional arguments:
    --test_size TEST_SIZE
    --random_state RANDOM_STATE
    --cv_folds CV_FOLDS
    --scoring_metric SCORING_METRIC
    --model_path MODEL_PATH
    --results_path RESULTS_PATH
    --visualize
    --drop_columns COLUMN_NAMES

Example:
    python train_classification_model.py --model_module logistic_regression
                                         --data_path data/titanic/train.csv
                                         --target_variable Survived --drop_columns PassengerId
                                         --visualize
"""

import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, ConfusionMatrixDisplay)
import joblib

def main(args):
    # Change to the root directory of the project
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning and the model modules
    from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
    model_module_path = f"models.supervised.classification.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Get the model estimator, parameter grid, and scoring metric
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'accuracy')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided; create both directories up front
    # (the label encoder below is saved before the model, so model_path must exist)
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.model_path, exist_ok=True)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns
    if args.drop_columns:
        columns_to_drop = args.drop_columns.split(',')
        df = df.drop(columns=columns_to_drop)

    # Define target variable and features
    target_variable = args.target_variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Ensure the target variable is categorical
    if np.issubdtype(y.dtype, np.number) and len(np.unique(y)) > 20:
        raise ValueError(f"The target variable '{target_variable}' seems to be continuous. "
                         "Please ensure it's categorical for classification tasks.")

    # Encode the target variable if it is not numeric
    if not np.issubdtype(y.dtype, np.number):
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        y = le.fit_transform(y)
        # Save the label encoder for inverse transformation at inference time
        joblib.dump(le, os.path.join(args.model_path, 'label_encoder.pkl'))

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state, stratify=y)

    # Perform hyperparameter tuning
    best_model, best_params = classification_hyperparameter_tuning(
        X_train, y_train, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)

    # Evaluate the best model on the test set
    y_pred = best_model.predict(X_test)
    y_test_actual = y_test

    # Save the trained model
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Calculate metrics
    accuracy = accuracy_score(y_test_actual, y_pred)
    precision = precision_score(y_test_actual, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test_actual, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test_actual, y_pred, average='weighted', zero_division=0)
    print(f"\n{model_name} Classification Metrics on Test Set:")
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- Precision: {precision:.4f}")
    print(f"- Recall: {recall:.4f}")
    print(f"- F1 Score: {f1:.4f}")

    # Save metrics to CSV
    metrics = {'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1 Score': [f1]}
    results_df = pd.DataFrame(metrics)
    results_df.to_csv(os.path.join(args.results_path, 'metrics.csv'), index=False)
    print(f"\nMetrics saved to {os.path.join(args.results_path, 'metrics.csv')}")

    if args.visualize:
        # Plot classification metrics as a bar chart
        plt.figure(figsize=(8, 6))
        metric_names = list(metrics.keys())
        metric_values = [value[0] for value in metrics.values()]  # Extract the single value from each list

        plt.bar(metric_names, metric_values, color='skyblue', alpha=0.8)
        plt.ylim(0, 1)  # Metrics like accuracy and precision lie between 0 and 1
        plt.xlabel('Metrics')
        plt.ylabel('Scores')
        plt.title('Classification Metrics')

        # Save and display the plot
        plt.savefig(os.path.join(args.results_path, 'classification_metrics.png'))
        plt.show()
        print(f"Visualization saved to {os.path.join(args.results_path, 'classification_metrics.png')}")

        # Display and save the confusion matrix
        conf_matrix = confusion_matrix(y_test_actual, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
        disp.plot(cmap=plt.cm.Blues, values_format='d')  # Format cell values as integer counts
        plt.title(f'{model_name} Confusion Matrix')

        conf_matrix_path = os.path.join(args.results_path, 'confusion_matrix.png')
        plt.savefig(conf_matrix_path)
        plt.show()
        print(f"Confusion matrix saved to {conf_matrix_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a classification model.")
    # Model module argument
    parser.add_argument('--model_module', type=str, required=True,
                        help='Name of the classification model module to import.')
    # Data arguments
    parser.add_argument('--data_path', type=str, required=True,
                        help='Path to the dataset file, including the file name.')
    parser.add_argument('--target_variable', type=str, required=True,
                        help='Name of the target variable.')
    parser.add_argument('--drop_columns', type=str, default='',
                        help='Comma-separated columns to drop from the dataset.')
    # Model arguments
    parser.add_argument('--test_size', type=float, default=0.2,
                        help='Proportion for the test split.')
    parser.add_argument('--random_state', type=int, default=42,
                        help='Random seed.')
    parser.add_argument('--cv_folds', type=int, default=5,
                        help='Number of cross-validation folds.')
    parser.add_argument('--scoring_metric', type=str, default=None,
                        help='Scoring metric for model evaluation.')
    # Output arguments
    parser.add_argument('--model_path', type=str, default=None,
                        help='Path to save the trained model.')
    parser.add_argument('--results_path', type=str, default=None,
                        help='Path to save results and metrics.')
    parser.add_argument('--visualize', action='store_true',
                        help='Generate and save visualizations.')

    args = parser.parse_args()
    main(args)
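A minimal inference sketch for the classification script above (an illustration, not part of this commit; the paths assume the default `--model_path` layout of `saved_models/<ModelName>`, and `label_encoder.pkl` exists only when the target was non-numeric):

```python
import joblib
import pandas as pd

# Load the pipeline saved by train_classification_model.py (path is an assumption)
model = joblib.load('saved_models/LogisticRegression/best_model.pkl')

# New data must have the same feature columns used in training
new_data = pd.read_csv('data/titanic/test.csv').drop(columns=['PassengerId'])
preds = model.predict(new_data)  # encoded class labels

# Only if the target was label-encoded during training: decode the predictions
le = joblib.load('saved_models/LogisticRegression/label_encoder.pkl')
labels = le.inverse_transform(preds)
```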
scripts/train_regression_model.py
ADDED
@@ -0,0 +1,200 @@
"""
This script trains regression models using scikit-learn.
It includes data loading, preprocessing, optional log transformation,
hyperparameter tuning, model evaluation, and saving of models, metrics,
and visualizations.

Usage:
    python train_regression_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv
                                     --target_variable TARGET_VARIABLE

Optional arguments:
    --test_size TEST_SIZE
    --random_state RANDOM_STATE
    --log_transform
    --cv_folds CV_FOLDS
    --scoring_metric SCORING_METRIC
    --model_path MODEL_PATH
    --results_path RESULTS_PATH
    --visualize
    --drop_columns COLUMN_NAMES

Example:
    python train_regression_model.py --model_module linear_regression
                                     --data_path data/house_prices/train.csv
                                     --target_variable SalePrice --drop_columns Id
                                     --log_transform --visualize
"""

import os
import sys
import argparse
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score, mean_absolute_error
import joblib
from timeit import default_timer as timer

def main(args):
    # Change to the root directory of the project
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Import the hyperparameter tuning and the model modules
    from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
    model_module_path = f"models.supervised.regression.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Get the model estimator, parameter grid, and scoring metric
    estimator = model_module.estimator
    param_grid = model_module.param_grid
    scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'neg_root_mean_squared_error')
    model_name = estimator.__class__.__name__

    # Set default paths if not provided
    args.model_path = args.model_path or os.path.join('saved_models', model_name)
    args.results_path = args.results_path or os.path.join('results', model_name)
    os.makedirs(args.results_path, exist_ok=True)

    # Load the dataset
    df = pd.read_csv(args.data_path)

    # Drop specified columns
    if args.drop_columns:
        columns_to_drop = args.drop_columns.split(',')
        df = df.drop(columns=columns_to_drop)

    # Define target variable and features
    target_variable = args.target_variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]

    # Ensure the target variable is numeric
    if not np.issubdtype(y.dtype, np.number):
        raise ValueError(f"The target variable '{target_variable}' must be numeric for regression tasks.")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.random_state)

    # Visualize the target variable distribution
    if args.visualize:
        plt.figure(figsize=(6, 4))
        sns.histplot(y_train, kde=True)
        plt.title(f'{target_variable} Distribution Before Transformation')
        plt.savefig(os.path.join(args.results_path, 'target_distribution_before.png'))
        plt.show()

    # Optional: apply a log transformation
    if args.log_transform:
        y_train_transformed = np.log1p(y_train)
        y_test_transformed = np.log1p(y_test)
        if args.visualize:
            plt.figure(figsize=(6, 4))
            sns.histplot(y_train_transformed, kde=True, color='green')
            plt.title(f'{target_variable} Distribution After Log Transform')
            plt.savefig(os.path.join(args.results_path, 'target_distribution_after.png'))
            plt.show()
    else:
        y_train_transformed = y_train
        y_test_transformed = y_test

    # Start the timer
    start_time = timer()

    # Perform hyperparameter tuning
    best_model, best_params = regression_hyperparameter_tuning(
        X_train, y_train_transformed, estimator, param_grid,
        cv=args.cv_folds, scoring=scoring_metric)

    # End the timer and calculate how long training took
    end_time = timer()
    train_time = end_time - start_time

    # Evaluate the best model on the test set
    y_pred_transformed = best_model.predict(X_test)

    # Reverse the transformation if it was applied
    if args.log_transform:
        y_pred = np.expm1(y_pred_transformed)
        y_test_actual = np.expm1(y_test_transformed)
    else:
        y_pred = y_pred_transformed
        y_test_actual = y_test_transformed

    # Save the trained model
    model_output_path = os.path.join(args.model_path, 'best_model.pkl')
    os.makedirs(args.model_path, exist_ok=True)
    joblib.dump(best_model, model_output_path)
    print(f"Trained model saved to {model_output_path}")

    # Calculate metrics
    rmse = root_mean_squared_error(y_test_actual, y_pred)
    r2 = r2_score(y_test_actual, y_pred)
    mae = mean_absolute_error(y_test_actual, y_pred)
    mse = mean_squared_error(y_test_actual, y_pred)
    print(f"\n{model_name} Regression Metrics on Test Set:")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- R² Score: {r2:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- MSE: {mse:.4f}")
    print(f"- Training time: {train_time:.4f} seconds")

    # Save metrics to CSV
    metrics = {'RMSE': [rmse], 'R2': [r2], 'MAE': [mae], 'MSE': [mse], 'train_time': [train_time]}
    results_df = pd.DataFrame(metrics)
    results_df.to_csv(os.path.join(args.results_path, 'metrics.csv'), index=False)
    print(f"\nMetrics saved to {os.path.join(args.results_path, 'metrics.csv')}")

    if args.visualize:
        # Plot actual vs. predicted values
        plt.figure(figsize=(8, 6))
        plt.scatter(y_test_actual, y_pred, alpha=0.6, color='blue')
        plt.plot([y_test_actual.min(), y_test_actual.max()],
                 [y_test_actual.min(), y_test_actual.max()], 'r--')
        plt.xlabel(f'Actual {target_variable}')
        plt.ylabel(f'Predicted {target_variable}')
        plt.title(f'Actual vs. Predicted {target_variable}')
        plt.savefig(os.path.join(args.results_path, 'actual_vs_predicted.png'))
        plt.show()
        print(f"Visualization saved to {os.path.join(args.results_path, 'actual_vs_predicted.png')}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a regression model.")
    # Model module argument
    parser.add_argument('--model_module', type=str, required=True,
                        help='Name of the regression model module to import.')
    # Data arguments
    parser.add_argument('--data_path', type=str, required=True,
                        help='Path to the dataset file, including the file name.')
    parser.add_argument('--target_variable', type=str, required=True,
                        help='Name of the target variable.')
    parser.add_argument('--drop_columns', type=str, default='',
                        help='Comma-separated columns to drop from the dataset.')
    # Model arguments
    parser.add_argument('--test_size', type=float, default=0.2,
                        help='Proportion for the test split.')
    parser.add_argument('--random_state', type=int, default=42,
                        help='Random seed.')
    parser.add_argument('--log_transform', action='store_true',
                        help='Apply a log transformation to the target variable.')
    parser.add_argument('--cv_folds', type=int, default=5,
                        help='Number of cross-validation folds.')
    parser.add_argument('--scoring_metric', type=str, default=None,
                        help='Scoring metric for model evaluation.')
    # Output arguments
    parser.add_argument('--model_path', type=str, default=None,
                        help='Path to save the trained model.')
    parser.add_argument('--results_path', type=str, default=None,
                        help='Path to save results and metrics.')
    parser.add_argument('--visualize', action='store_true',
                        help='Generate and save visualizations.')

    args = parser.parse_args()
    main(args)
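A matching inference sketch for the regression script (illustrative, not part of this commit; the path assumes the default `--model_path` layout, and the `np.expm1` step applies only when the model was trained with `--log_transform`):

```python
import joblib
import numpy as np
import pandas as pd

# Load the pipeline saved by train_regression_model.py (path is an assumption)
model = joblib.load('saved_models/LinearRegression/best_model.pkl')

new_data = pd.read_csv('data/house_prices/test.csv').drop(columns=['Id'])
preds = model.predict(new_data)

# Undo the optional log1p transform applied during training
preds = np.expm1(preds)  # only if --log_transform was used
```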
utils/README.md
ADDED
@@ -0,0 +1,45 @@
# Utils

This directory contains utility scripts and helper functions that are used throughout the project. These scripts provide common functionality such as data preprocessing, hyperparameter tuning, and other support functions that assist in model training and evaluation.

## Contents

- [`supervised_hyperparameter_tuning.py`](#supervised_hyperparameter_tuningpy)

### `supervised_hyperparameter_tuning.py`

This script contains functions for performing hyperparameter tuning on supervised learning models using scikit-learn's `Pipeline` and `GridSearchCV`.

#### Functions

- **`regression_hyperparameter_tuning(X_train, y_train, estimator, param_grid, cv=5, scoring=None)`**

  Performs hyperparameter tuning using grid search cross-validation.

  - **Parameters:**
    - `X_train`: Training features.
    - `y_train`: Training target variable.
    - `estimator`: A scikit-learn estimator (e.g., `LinearRegression()`).
    - `param_grid`: Dictionary with parameter names (`str`) as keys and lists of parameter settings to try as values.
    - `cv`: Number of cross-validation folds. Default is 5.
    - `scoring`: Scoring metric to use. The default depends on the estimator.

  - **Returns:**
    - `best_model`: The estimator with the best found parameters.
    - `best_params`: Dictionary of the best parameters.

#### Usage Example

```python
from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
from sklearn.linear_model import LinearRegression

# Define the estimator and parameter grid
estimator = LinearRegression()
param_grid = {
    'model__fit_intercept': [True, False],
    # Add other parameters
}

# Perform hyperparameter tuning
best_model, best_params = regression_hyperparameter_tuning(X_train, y_train, estimator, param_grid)
```
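Note that the keys in `param_grid` are prefixed with `model__` because the estimator is wrapped in a `Pipeline` under the step name `model`; scikit-learn routes grid-search parameters using the `<step>__<parameter>` convention, so a bare `fit_intercept` key would raise an error during the search.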
utils/supervised_hyperparameter_tuning.py
ADDED
@@ -0,0 +1,115 @@
"""
This module provides a function for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV, specifically for regression models.

Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn regressor.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary.

Functions:
- regression_hyperparameter_tuning: Performs hyperparameter tuning on a given dataset and estimator.

Example Usage:
    from sklearn.ensemble import RandomForestRegressor
    from supervised_hyperparameter_tuning import regression_hyperparameter_tuning

    X = ...  # Your feature DataFrame
    y = ...  # Your target variable
    param_grid = {
        'model__n_estimators': [100, 200, 500],
        'model__max_depth': [None, 10, 20]
    }
    best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
"""

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold

def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given regression model using GridSearchCV with preprocessing.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        estimator: The scikit-learn regressor to use (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid for GridSearchCV.
        cv (int or cross-validation generator): Number of cross-validation folds or a cross-validation generator.
        scoring (str or None): Scoring metric to use.

    Returns:
        best_model (Pipeline): Best model within a pipeline from the grid search.
        best_params (dict): Best hyperparameters.
    """
    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Define preprocessing for numerical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Conditional preprocessing for categorical data
    estimator_name = estimator.__class__.__name__

    if estimator_name in [
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'
    ]:
        # Use ordinal encoding for tree-based models
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Use one-hot encoding for other models
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Create the preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Create a pipeline that combines preprocessing and the estimator
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Define the cross-validation strategy
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )

    # Perform the grid search
    grid_search.fit(X, y)

    # Get the best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params
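The training scripts also import `classification_hyperparameter_tuning` from this module, which this diff does not show. A minimal sketch of what such a function could look like, mirroring `regression_hyperparameter_tuning` but with stratified folds (an assumption, not the committed implementation):

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """Hypothetical classifier counterpart to regression_hyperparameter_tuning."""
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Same preprocessing idea as the regression version:
    # impute + scale numerics, impute + one-hot encode categoricals
    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline([('imputer', SimpleImputer(strategy='median')),
                          ('scaler', StandardScaler())]), numerical_cols),
        ('cat', Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                          ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
         categorical_cols),
    ])
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])

    # Stratified folds keep the class distribution consistent across splits
    if isinstance(cv, int):
        cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search.fit(X, y)
    return grid_search.best_estimator_, grid_search.best_params_
```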