Spaces:
Running
Running
| """Functions for doing feature selection during preprocessing.""" | |
| import numpy as np | |
def run_feature_selection(X, y, select_k_features, random_state=None):
    """
    Pick the k most important features of X.

    A random forest regressor (100 trees, depth limited to 3) is fitted
    on ``(X, y)`` and used as a proxy for feature importance. The fitted
    model is handed to scikit-learn's ``SelectFromModel``, capped at
    ``select_k_features`` features, and the integer indices of the
    selected features are returned.

    ``random_state`` is forwarded to the forest to make the fit
    reproducible.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    forest = RandomForestRegressor(
        n_estimators=100,
        max_depth=3,
        random_state=random_state,
    )
    forest.fit(X, y)

    # threshold=-inf switches off the importance cut-off, so the selection
    # is governed by max_features alone.
    picker = SelectFromModel(
        forest,
        threshold=-np.inf,
        max_features=select_k_features,
        prefit=True,
    )
    chosen_indices = picker.get_support(indices=True)
    return chosen_indices
| # This function is kept only because the module tests still use it. | |
def _handle_feature_selection(X, select_k_features, y, variable_names):
    """
    Optionally reduce X to its ``select_k_features`` selected columns.

    When ``select_k_features`` is ``None`` nothing is done and
    ``(X, None)`` is returned. Otherwise ``run_feature_selection`` is
    called, the names of the chosen features are printed, and the
    column-subset of X is returned together with the selection.
    """
    # Guard clause: feature selection was not requested.
    if select_k_features is None:
        return X, None

    selection = run_feature_selection(X, y, select_k_features)
    chosen_names = [variable_names[idx] for idx in selection]
    print(f"Using features {chosen_names}")
    return X[:, selection], selection