Commit · b7e75e1
Parent(s): 65a4248

Update all docstrings

pysr/sr.py  CHANGED  (+55 -20)
@@ -17,6 +17,9 @@ is_julia_warning_silenced = False
 
 
 def install(julia_project=None):  # pragma: no cover
+    """Install PyCall.jl and all required dependencies for SymbolicRegression.jl.
+
+    Also updates the local Julia registry."""
     import julia
 
     julia.install()
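For context, a minimal usage sketch of the function documented in this hunk, assuming it is re-exported at the package level as `pysr.install` (as in the released package):

import pysr

# One-time setup: installs PyCall.jl, SymbolicRegression.jl, and their dependencies,
# and updates the local Julia registry, per the new docstring.
pysr.install()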
@@ -405,14 +408,26 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     :type binary_operators: list
     :param unary_operators: Same but for operators taking a single scalar. Default is [].
     :type unary_operators: list
-    :param procs: Number of processes (=number of populations running).
-    :type procs: int
-    :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ε)`, `L2EpsilonInsLoss(ε)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
-    :type loss: str
-    :param populations: Number of populations running.
-    :type populations: int
     :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each.
     :type niterations: int
+    :param populations: Number of populations running.
+    :type populations: int
+    :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ε)`, `L2EpsilonInsLoss(ε)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
+    :type loss: str
+    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+    :type denoise: bool
+    :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
+    :type select_k_features: None/int
+    :param procs: Number of processes (=number of populations running).
+    :type procs: int
+    :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
+    :type multithreading: bool
+    :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
+    :type batching: bool
+    :param batchSize: the amount of data to use if doing batching.
+    :type batchSize: int
+    :param maxsize: Max size of an equation.
+    :type maxsize: int
     :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration.
     :type ncyclesperiteration: int
     :param alpha: Initial temperature.
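An illustrative sketch of how the parameters documented in this hunk might be combined when constructing the regressor; the operator lists, the Julia loss string, and all values are placeholders:

from pysr import PySRRegressor

model = PySRRegressor(
    niterations=5,
    populations=20,                            # number of populations running
    binary_operators=["+", "*"],
    unary_operators=["cos"],
    loss="myloss(x, y, w) = w * abs(x - y)",   # custom weighted Julia loss, as in the docstring
    denoise=True,                              # Gaussian-process denoising before the search
    select_k_features=3,                       # random-forest feature selection in Python
    batching=True,
    batchSize=50,
    maxsize=20,
)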
@@ -459,20 +474,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     :type verbosity: int
     :param progress: Whether to use a progress bar instead of printing to stdout.
     :type progress: bool
-    :param maxsize: Max size of an equation.
-    :type maxsize: int
     :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to = maxsize, which means that it is redundant.
     :type maxdepth: int
     :param fast_cycle: (experimental) - batch over population subsamples. This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient.
     :type fast_cycle: bool
     :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
     :type variable_names: list
-    :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
-    :type batching: bool
-    :param batchSize: the amount of data to use if doing batching.
-    :type batchSize: int
-    :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
-    :type select_k_features: None/int
     :param warmupMaxsizeBy: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize.
     :type warmupMaxsizeBy: float
     :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions.
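A short sketch of the `constraints` and `warmupMaxsizeBy` options described above; the operator set and values are illustrative only:

from pysr import PySRRegressor

model = PySRRegressor(
    binary_operators=["+", "*", "pow"],
    constraints={"pow": (-1, 1)},   # any complexity in the base, at most complexity 1 in the exponent
    warmupMaxsizeBy=0.5,            # ramp the size limit up over the first half of training
)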
@@ -497,12 +504,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     :type tournament_selection_n: int
     :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
     :type tournament_selection_p: float
-    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
-    :type denoise: bool
     :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well.
     :type precision: int
-    :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
-    :type multithreading: bool
     :param **kwargs: Other options passed to SymbolicRegression.Options, for example, if you modify SymbolicRegression.jl to include additional arguments.
     :type **kwargs: dict
     :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
@@ -666,6 +669,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     ]
 
     def __repr__(self):
+        """Prints all current equations fitted by the model.
+
+        The string `>>>>` denotes which equation is selected by the
+        `model_selection`.
+        """
         if self.equations is None:
             return "PySRRegressor.equations = None"
 
@@ -712,7 +720,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return output
 
     def set_params(self, **params):
-        """Set parameters for
+        """Set parameters for equation search."""
         for key, value in params.items():
             if key in self.surface_parameters:
                 self.__setattr__(key, value)
@@ -723,6 +731,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return self
 
     def get_params(self, deep=True):
+        """Get parameters for equation search."""
        del deep
         return {
             **self.params,
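A brief sketch of the sklearn-style accessors touched by the two hunks above, assuming `model` is a PySRRegressor instance as in the earlier sketches:

params = model.get_params()        # dict of the current equation-search parameters
model.set_params(niterations=10)   # update a parameter before the next fit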
@@ -730,6 +739,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         }
 
     def get_best(self):
+        """Get best equation using `model_selection`."""
         if self.equations is None:
             raise ValueError("No equations have been generated yet.")
         if self.model_selection == "accuracy":
@@ -746,7 +756,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         )
 
     def fit(self, X, y, weights=None, variable_names=None):
-        """Search for equations to fit the dataset.
+        """Search for equations to fit the dataset and store them in `self.equations`.
 
         :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
         :type X: np.ndarray/pandas.DataFrame
@@ -755,6 +765,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
         :type weights: np.ndarray
         :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
+            You can also pass a pandas DataFrame for X.
         :type variable_names: list
         """
         if variable_names is None:
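A sketch of calling `fit` with a pandas DataFrame, following the note added above that DataFrame columns supply the variable names; the data here is synthetic:

import numpy as np
import pandas as pd

X = pd.DataFrame({"a": np.random.randn(100), "b": np.random.randn(100)})
y = 2.5 * np.cos(X["a"].values) + X["b"].values ** 2
model.fit(X, y)   # column names "a" and "b" are used instead of "x0", "x1"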
@@ -775,6 +786,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         self.equations = self.get_hof()
 
     def predict(self, X):
+        """Predict y from input X using the equation chosen by `model_selection`.
+
+        You may see what equation is used by printing this object. X should have the same
+        columns as the training data.
+
+        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
+        :type X: np.ndarray/pandas.DataFrame
+        :return: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs).
+        """
         self.refresh()
         best = self.get_best()
         if self.multioutput:
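Continuing the sketch, prediction uses the equation selected by `model_selection`, and printing the model shows which one that is (per the `__repr__` docstring added earlier):

print(model)               # the `>>>>` marker indicates the selected equation
y_pred = model.predict(X)  # X must carry the same columns/ordering used in fit()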
@@ -782,6 +802,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["lambda_format"](X)
 
     def sympy(self):
+        """Return sympy representation of the equation(s) chosen by `model_selection`."""
         self.refresh()
         best = self.get_best()
         if self.multioutput:
@@ -789,6 +810,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["sympy_format"]
 
     def latex(self):
+        """Return latex representation of the equation(s) chosen by `model_selection`."""
         self.refresh()
         sympy_representation = self.sympy()
         if self.multioutput:
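A sketch of the symbolic exports documented in the two hunks above:

expr = model.sympy()   # sympy expression (a list of them if there are multiple outputs)
tex = model.latex()    # LaTeX string produced via sympy.latex
print(tex)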
@@ -796,6 +818,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return sympy.latex(sympy_representation)
 
     def jax(self):
+        """Return jax representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a dictionary
+        containing {"callable": func, "parameters": params}. To call `func`, pass
+        func(X, params). This function is differentiable using `jax.grad`.
+        """
         if self.using_pandas:
             warnings.warn(
                 "PySR's JAX modules are not set up to work with a "
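A sketch of using the dictionary format described in the new `jax` docstring; the pandas warning in the code above applies, so the input is converted to a plain array first:

import jax
import jax.numpy as jnp
import numpy as np

eq = model.jax()                             # {"callable": func, "parameters": params}
f, params = eq["callable"], eq["parameters"]
X_np = np.asarray(X, dtype=np.float32)       # same column ordering as during training
y_pred = f(jnp.asarray(X_np), params)
# Differentiable via jax.grad, e.g. with respect to the fitted parameters:
grads = jax.grad(lambda p: f(jnp.asarray(X_np), p).sum())(params)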
@@ -810,6 +838,13 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["jax_format"]
 
     def pytorch(self):
+        """Return pytorch representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a PyTorch module
+        containing the parameters as trainable attributes. You can use the module like
+        any other PyTorch module: `module(X)`, where `X` is a tensor with the same
+        column ordering as trained with.
+        """
         if self.using_pandas:
             warnings.warn(
                 "PySR's PyTorch modules are not set up to work with a "
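Similarly, a sketch of using the module format described in the new `pytorch` docstring; again the input is converted to a plain array with the training column ordering:

import numpy as np
import torch

module = model.pytorch()
X_torch = torch.tensor(np.asarray(X, dtype=np.float32))
y_pred = module(X_torch)
# The equation's constants are trainable attributes of the module (per the docstring),
# so gradients can flow to them like any other PyTorch parameters.
y_pred.sum().backward()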