Spaces:

MilesCranmer
/

PySR

Sleeping

App Files Files Community

MilesCranmer committed on Feb 1, 2022

Commit

4424b0a

unverified ·

2 Parent(s): bae75db 7ab053d

Merge pull request #95 from MilesCranmer/state-saving

Browse files

Files changed (5) hide show

README.md +3 -1
docs/start.md +3 -1
pysr/sr.py +46 -5
setup.py +2 -2
test/test.py +8 -1

README.md CHANGED Viewed

@@ -74,7 +74,7 @@ Most common issues at this stage are solved
 by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27).
 to use up-to-date packages.
-# Quickstart
 Let's create a PySR example. First, let's import
 numpy to generate some test data:
@@ -144,6 +144,8 @@ This arrow in the `pick` column indicates which equation is currently selected b
 SymPy format (`sympy_format`), and even JAX and PyTorch format
 (both of which are differentiable).
 There are several other useful features such as denoising (e.g., `denoising=True`),
 feature selection (e.g., `select_k_features=3`).
 For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).

 by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27).
 to use up-to-date packages.
+# Introduction
 Let's create a PySR example. First, let's import
 numpy to generate some test data:
 SymPy format (`sympy_format`), and even JAX and PyTorch format
 (both of which are differentiable).
+Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`. This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state.
 There are several other useful features such as denoising (e.g., `denoising=True`),
 feature selection (e.g., `select_k_features=3`).
 For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).

docs/start.md CHANGED Viewed

@@ -19,7 +19,7 @@ Most common issues at this stage are solved
 by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27).
 to use up-to-date packages.
-# Quickstart
 Let's create a PySR example. First, let's import
 numpy to generate some test data:
@@ -89,6 +89,8 @@ This arrow in the `pick` column indicates which equation is currently selected b
 SymPy format (`sympy_format`), and even JAX and PyTorch format
 (both of which are differentiable).
 There are several other useful features such as denoising (e.g., `denoising=True`),
 feature selection (e.g., `select_k_features=3`).
 For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).

 by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27).
 to use up-to-date packages.
+# Introduction
 Let's create a PySR example. First, let's import
 numpy to generate some test data:
 SymPy format (`sympy_format`), and even JAX and PyTorch format
 (both of which are differentiable).
+Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`. This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state.
 There are several other useful features such as denoising (e.g., `denoising=True`),
 feature selection (e.g., `select_k_features=3`).
 For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).

pysr/sr.py CHANGED Viewed

@@ -12,6 +12,8 @@ from datetime import datetime
 import warnings
 from multiprocessing import cpu_count
 from sklearn.base import BaseEstimator, RegressorMixin
 is_julia_warning_silenced = False
@@ -320,7 +322,7 @@ def _write_project_file(tmp_dir):
 SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb"
 [compat]
-SymbolicRegression = "0.7.0"
 julia = "1.5"
     """
@@ -636,9 +638,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         # Stored equations:
         self.equations = None
         self.multioutput = None
-        self.raw_julia_output = None
         self.equation_file = equation_file
         self.n_features = None
         self.extra_sympy_mappings = extra_sympy_mappings
@@ -654,7 +657,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         self.surface_parameters = [
             "model_selection",
             "multioutput",
-            "raw_julia_output",
             "equation_file",
             "n_features",
             "extra_sympy_mappings",
@@ -727,7 +729,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             else:
                 self.params[key] = value
-        self.refresh()
         return self
     def get_params(self, deep=True):
@@ -858,6 +859,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             return [eq["torch_format"] for eq in best]
         return best["torch_format"]
     def _run(self, X, y, weights, variable_names):
         global already_ran
         global Main
@@ -1046,6 +1053,38 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             float(weightDoNothing),
         ]
         options = Main.Options(
             binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
             unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
@@ -1085,6 +1124,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             optimizer_iterations=self.params["optimizer_iterations"],
             perturbationFactor=self.params["perturbationFactor"],
             annealing=self.params["annealing"],
         )
         np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[
@@ -1106,7 +1146,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         cprocs = 0 if multithreading else procs
-        self.raw_julia_output = Main.EquationSearch(
             Main.X,
             Main.y,
             weights=Main.weights,
@@ -1119,6 +1159,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             options=options,
             numprocs=int(cprocs),
             multithreading=bool(multithreading),
         )
         self.variable_names = variable_names

 import warnings
 from multiprocessing import cpu_count
 from sklearn.base import BaseEstimator, RegressorMixin
+from collections import OrderedDict
+from hashlib import sha256
 is_julia_warning_silenced = False
 SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb"
 [compat]
+SymbolicRegression = "0.7.3"
 julia = "1.5"
     """
         # Stored equations:
         self.equations = None
+        self.params_hash = None
+        self.raw_julia_state = None
         self.multioutput = None
         self.equation_file = equation_file
         self.n_features = None
         self.extra_sympy_mappings = extra_sympy_mappings
         self.surface_parameters = [
             "model_selection",
             "multioutput",
             "equation_file",
             "n_features",
             "extra_sympy_mappings",
             else:
                 self.params[key] = value
         return self
     def get_params(self, deep=True):
             return [eq["torch_format"] for eq in best]
         return best["torch_format"]
+    def reset(self):
+        """Reset the search state."""
+        self.equations = None
+        self.params_hash = None
+        self.raw_julia_state = None
     def _run(self, X, y, weights, variable_names):
         global already_ran
         global Main
             float(weightDoNothing),
         ]
+        params_to_hash = {
+            **{k: self.__getattribute__(k) for k in self.surface_parameters},
+            **self.params,
+        }
+        params_excluded_from_hash = [
+            "niterations",
+        ]
+        # Delete these^ from params_to_hash:
+        params_to_hash = {
+            k: v
+            for k, v in params_to_hash.items()
+            if k not in params_excluded_from_hash
+        }
+        # Sort params_to_hash by key:
+        params_to_hash = OrderedDict(sorted(params_to_hash.items()))
+        # Hash all parameters:
+        cur_hash = sha256(str(params_to_hash).encode()).hexdigest()
+        if self.params_hash is not None:
+            if cur_hash != self.params_hash:
+                warnings.warn(
+                    "Warning: PySR options have changed since the last run. "
+                    "This is experimental and may not work. "
+                    "For example, if the operators change, or even their order,"
+                    " the saved equations will be in the wrong format."
+                    "\n\n"
+                    "To reset the search state, run `.reset()`. "
+                )
+        self.params_hash = cur_hash
         options = Main.Options(
             binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
             unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
             optimizer_iterations=self.params["optimizer_iterations"],
             perturbationFactor=self.params["perturbationFactor"],
             annealing=self.params["annealing"],
+            stateReturn=True,  # Required for state saving.
         )
         np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[
         cprocs = 0 if multithreading else procs
+        self.raw_julia_state = Main.EquationSearch(
             Main.X,
             Main.y,
             weights=Main.weights,
             options=options,
             numprocs=int(cprocs),
             multithreading=bool(multithreading),
+            saved_state=self.raw_julia_state,
         )
         self.variable_names = variable_names

setup.py CHANGED Viewed

@@ -8,14 +8,14 @@ except FileNotFoundError:
 setuptools.setup(
     name="pysr",
-    version="0.7.0",
     author="Miles Cranmer",
     author_email="[email protected]",
     description="Simple and efficient symbolic regression",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/MilesCranmer/pysr",
-    install_requires=["julia", "numpy", "pandas", "sympy", "scikit-learn"],
     packages=setuptools.find_packages(),
     package_data={"pysr": ["../Project.toml", "../datasets/*"]},
     include_package_data=False,

 setuptools.setup(
     name="pysr",
+    version="0.7.0-1",
     author="Miles Cranmer",
     author_email="[email protected]",
     description="Simple and efficient symbolic regression",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/MilesCranmer/pysr",
+    install_requires=["julia>=0.5.7", "numpy", "pandas", "sympy", "scikit-learn"],
     packages=setuptools.find_packages(),
     package_data={"pysr": ["../Project.toml", "../datasets/*"]},
     include_package_data=False,

test/test.py CHANGED Viewed

@@ -77,7 +77,7 @@ class TestPipeline(unittest.TestCase):
             model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
         )
-    def test_empty_operators_single_input_sklearn(self):
         X = np.random.randn(100, 1)
         y = X[:, 0] + 3.0
         regressor = PySRRegressor(
@@ -94,6 +94,13 @@ class TestPipeline(unittest.TestCase):
         self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
         np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
         # Tweak model selection:
         regressor.set_params(model_selection="best")
         self.assertEqual(regressor.get_params()["model_selection"], "best")

             model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
         )
+    def test_empty_operators_single_input_multirun(self):
         X = np.random.randn(100, 1)
         y = X[:, 0] + 3.0
         regressor = PySRRegressor(
         self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
         np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
+        # Test if repeated fit works:
+        regressor.set_params(niterations=0)
+        regressor.fit(X, y)
+        self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
+        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
         # Tweak model selection:
         regressor.set_params(model_selection="best")
         self.assertEqual(regressor.get_params()["model_selection"], "best")