Commit 964082a
Parent(s): 3662fae
Add feature selection based on gradient boosting

pysr/sr.py: +35 -4
pysr/sr.py CHANGED

@@ -76,6 +76,7 @@ def pysr(X=None, y=None, weights=None,
         fast_cycle=False,
         maxdepth=None,
         variable_names=[],
+        select_k_features=None,
         threads=None, #deprecated
         julia_optimization=3,
         ):
@@ -140,6 +141,11 @@ def pysr(X=None, y=None, weights=None,
         15% faster. May be algorithmically less efficient.
     :param variable_names: list, a list of names for the variables, other
         than "x0", "x1", etc.
+    :param feature_selection: bool,
+    :param select_k_features: (None, int), whether to run feature selection in
+        Python using random forests, before passing to the symbolic regression
+        code. None means no feature selection; an int means select that many
+        features.
     :param julia_optimization: int, Optimization level (0, 1, 2, 3)
     :returns: pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
         (as strings).
@@ -154,6 +160,8 @@ def pysr(X=None, y=None, weights=None,
         variable_names = list(X.columns)
         X = np.array(X)
 
+    use_custom_variable_names = (len(variable_names) != 0)
+
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
@@ -162,9 +170,17 @@ def pysr(X=None, y=None, weights=None,
     if weights is not None:
         assert len(weights.shape) == 1
         assert X.shape[0] == weights.shape[0]
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
 
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {selection}")
+        X = X[:, selection]
+
+        if use_custom_variable_names:
+            variable_names = variable_names[selection]
+
     if populations is None:
         populations = procs
 
@@ -233,7 +249,7 @@ const nrestarts = {nrestarts:d}
 const perturbationFactor = {perturbationFactor:f}f0
 const annealing = {"true" if annealing else "false"}
 const weighted = {"true" if weights is not None else "false"}
-const useVarMap = {"true" if len(variable_names) != 0 else "false"}
+const useVarMap = {"true" if use_custom_variable_names else "false"}
 const mutationWeights = [
 {weightMutateConstant:f},
 {weightMutateOperator:f},
@@ -260,7 +276,7 @@ const y = convert(Array{Float32, 1}, """f"{y_str})"
         def_datasets += """
 const weights = convert(Array{Float32, 1}, """f"{weight_str})"
 
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         def_hyperparams += f"""
 const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
 
@@ -299,7 +315,7 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     lastComplexity = 0
     sympy_format = []
     lambda_format = []
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
     else:
         sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
@@ -326,3 +342,18 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
 
 
+def run_feature_selection(X, y, select_k_features):
+    """Use a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output."""
+
+    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+    from sklearn.feature_selection import SelectFromModel, SelectKBest
+
+    clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls') #RandomForestRegressor()
+    clf.fit(X, y)
+    selector = SelectFromModel(clf, threshold=-np.inf,
+                               max_features=select_k_features, prefit=True)
+    return selector.get_support(indices=True)
+
+
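As a sanity check on the new helper, here is a minimal standalone sketch (not part of the commit) of the same selection step, exercising the scikit-learn calls that run_feature_selection relies on. The toy dataset and k=2 are made up for illustration, and the loss='ls' argument is omitted so the snippet runs on current scikit-learn releases, where that spelling has been replaced by 'squared_error'.

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

# Toy data (illustrative only): signal comes from features 1 and 4.
rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = 3 * X[:, 1] + X[:, 4] ** 2

# Same pattern as run_feature_selection above: fit a gradient boosting
# regressor, then keep the k features with the largest importances.
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0)
clf.fit(X, y)
selector = SelectFromModel(clf, threshold=-np.inf, max_features=2, prefit=True)
print(selector.get_support(indices=True))  # expected to print something like [1 4]

Assuming the rest of the call signature is unchanged, callers would presumably enable this behaviour with something like pysr(X, y, select_k_features=3, ...), letting the tree model prune the input columns in Python before the Julia backend is invoked.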