reformatting

interpretml · Jul 28, 2024 · c438070 · c438070
1 parent 356e589
commit c438070
Show file tree

Hide file tree

Showing 18 changed files with 214 additions and 159 deletions.
diff --git a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -2439,20 +2439,20 @@ class ExplainableBoostingClassifier(EBMModel, ClassifierMixin, ExplainerMixin):
         Number of rounds with no improvement to trigger early stopping. 0 turns off
         early stopping and boosting will occur for exactly max_rounds.
     early_stopping_tolerance : float, default=1e-4
-        Tolerance that dictates the smallest delta required to be considered an 
+        Tolerance that dictates the smallest delta required to be considered an
         improvement which prevents the algorithm from early stopping.
-        early_stopping_tolerance is expressed as a percentage of the early 
-        stopping metric. Negative values indicate that the individual 
+        early_stopping_tolerance is expressed as a percentage of the early
+        stopping metric. Negative values indicate that the individual
         models should be overfit before stopping.
-        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance 
-        to zero (or even negative), allows learning to overfit each of the individual 
-        models a little, which can improve the accuracy of the ensemble as a whole. 
-        Overfitting each of the individual models reduces the bias of each model at 
-        the expense of increasing the variance (due to overfitting) of the individual 
-        models.  But averaging the models in the ensemble reduces variance without 
-        much change in bias.  Since the goal is to find the optimum bias-variance 
-        tradeoff for the ensemble of models --- not the individual models --- a small 
-        amount of overfitting of the individual models can improve the accuracy of 
+        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
+        to zero (or even negative), allows learning to overfit each of the individual
+        models a little, which can improve the accuracy of the ensemble as a whole.
+        Overfitting each of the individual models reduces the bias of each model at
+        the expense of increasing the variance (due to overfitting) of the individual
+        models.  But averaging the models in the ensemble reduces variance without
+        much change in bias.  Since the goal is to find the optimum bias-variance
+        tradeoff for the ensemble of models --- not the individual models --- a small
+        amount of overfitting of the individual models can improve the accuracy of
         the ensemble as a whole.
     min_samples_leaf : int, default=2
         Minimum number of samples allowed in the leaves.
@@ -2774,20 +2774,20 @@ class ExplainableBoostingRegressor(EBMModel, RegressorMixin, ExplainerMixin):
         Number of rounds with no improvement to trigger early stopping. 0 turns off
         early stopping and boosting will occur for exactly max_rounds.
     early_stopping_tolerance : float, default=1e-4
-        Tolerance that dictates the smallest delta required to be considered an 
+        Tolerance that dictates the smallest delta required to be considered an
         improvement which prevents the algorithm from early stopping.
-        early_stopping_tolerance is expressed as a percentage of the early 
-        stopping metric. Negative values indicate that the individual 
+        early_stopping_tolerance is expressed as a percentage of the early
+        stopping metric. Negative values indicate that the individual
         models should be overfit before stopping.
-        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance 
-        to zero (or even negative), allows learning to overfit each of the individual 
-        models a little, which can improve the accuracy of the ensemble as a whole. 
-        Overfitting each of the individual models reduces the bias of each model at 
-        the expense of increasing the variance (due to overfitting) of the individual 
-        models.  But averaging the models in the ensemble reduces variance without 
-        much change in bias.  Since the goal is to find the optimum bias-variance 
-        tradeoff for the ensemble of models --- not the individual models --- a small 
-        amount of overfitting of the individual models can improve the accuracy of 
+        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
+        to zero (or even negative), allows learning to overfit each of the individual
+        models a little, which can improve the accuracy of the ensemble as a whole.
+        Overfitting each of the individual models reduces the bias of each model at
+        the expense of increasing the variance (due to overfitting) of the individual
+        models.  But averaging the models in the ensemble reduces variance without
+        much change in bias.  Since the goal is to find the optimum bias-variance
+        tradeoff for the ensemble of models --- not the individual models --- a small
+        amount of overfitting of the individual models can improve the accuracy of
         the ensemble as a whole.
     min_samples_leaf : int, default=2
         Minimum number of samples allowed in the leaves.

diff --git a/python/powerlift/src/powerlift/bench/__init__.py b/python/powerlift/src/powerlift/bench/__init__.py
@@ -16,4 +16,11 @@
 from powerlift.bench.benchmark import Benchmark
 
 from powerlift.bench.store import populate_with_datasets, DatasetAlreadyExistsError
-from powerlift.bench.store import retrieve_openml_automl_regression, retrieve_openml_automl_classification, retrieve_openml_cc18, retrieve_openml, retrieve_catboost_50k, retrieve_pmlb
+from powerlift.bench.store import (
+    retrieve_openml_automl_regression,
+    retrieve_openml_automl_classification,
+    retrieve_openml_cc18,
+    retrieve_openml,
+    retrieve_catboost_50k,
+    retrieve_pmlb,
+)
diff --git a/python/powerlift/src/powerlift/bench/store.py b/python/powerlift/src/powerlift/bench/store.py
@@ -150,7 +150,6 @@ def serialize(cls, obj):
             mimetype = MIMETYPE_WHEEL
         else:
             return None, None
-
 
         return mimetype, bstream
 
@@ -923,8 +922,10 @@ def name(self):
 
 class DatasetAlreadyExistsError(Exception):
     """Raised when dataset already exists in store."""
+
     pass
 
+
 def populate_with_datasets(
     store: Store,
     dataset_iter: Iterable[Dataset] = None,
@@ -947,7 +948,8 @@ def populate_with_datasets(
 
     if dataset_iter is None:
         dataset_iter = chain(
-            retrieve_openml_automl_regression(cache_dir=cache_dir), retrieve_openml_automl_classification(cache_dir=cache_dir)
+            retrieve_openml_automl_regression(cache_dir=cache_dir),
+            retrieve_openml_automl_classification(cache_dir=cache_dir),
         )
 
     for dataset in dataset_iter:
@@ -963,7 +965,9 @@ def populate_with_datasets(
     return True
 
 
-def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str = "openml") -> Generator[SupervisedDataset, None, None]:
+def retrieve_openml(
+    cache_dir: str = None, suite_id: int | str = 99, source: str = "openml"
+) -> Generator[SupervisedDataset, None, None]:
     """Retrieves OpenML datasets.
 
     Args:
@@ -978,7 +982,7 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
 
     if cache_dir is not None:
         cache_dir = pathlib.Path(cache_dir, source)
-    
+
     dataset_names_filename = "dataset_names.json"
     dataset_names_stream = retrieve_cache(cache_dir, [dataset_names_filename])
     if dataset_names_stream is None:
@@ -987,8 +991,19 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
         tasks = suite.tasks.copy()
         random.Random(1337).shuffle(tasks)
         for task_id in tqdm(tasks, desc=source):
-            task = openml.tasks.get_task(task_id, download_splits=False, download_data=False, download_qualities=False, download_features_meta_data=False)
-            dataset = openml.datasets.get_dataset(task.dataset_id, download_data=True, download_qualities=True, download_features_meta_data=True)
+            task = openml.tasks.get_task(
+                task_id,
+                download_splits=False,
+                download_data=False,
+                download_qualities=False,
+                download_features_meta_data=False,
+            )
+            dataset = openml.datasets.get_dataset(
+                task.dataset_id,
+                download_data=True,
+                download_qualities=True,
+                download_features_meta_data=True,
+            )
             name = dataset.name
             dataset_names.append(name)
 
@@ -1002,7 +1017,9 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
 
             if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
                 problem = (
-                    "binary" if dataset.qualities["NumberOfClasses"] == 2 else "multiclass"
+                    "binary"
+                    if dataset.qualities["NumberOfClasses"] == 2
+                    else "multiclass"
                 )
 
                 # for benchmarking we do not care about the original target strings
@@ -1019,20 +1036,26 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
                     col = col.sparse.to_dense()
                     X[col_name] = col
 
-                if col.dtype.name == 'category':
+                if col.dtype.name == "category":
                     if not cat:
-                        raise Exception(f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical.")
+                        raise Exception(
+                            f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
+                        )
                     if col.cat.ordered:
                         # OpenML incorrectly marks these as ordered
                         X[col_name] = col.cat.as_unordered()
-                elif col.dtype.name == 'object':
+                elif col.dtype.name == "object":
                     if cat:
                         X[col_name] = col.astype(pd.CategoricalDtype(ordered=False))
                     else:
                         X[col_name] = col.astype(float)
-                elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(col.dtype, np.integer):
+                elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(
+                    col.dtype, np.integer
+                ):
                     if cat:
-                        raise Exception(f"Categorical type mismatch. Was continuous but indicated categorical.")
+                        raise Exception(
+                            f"Categorical type mismatch. Was continuous but indicated categorical."
+                        )
                 else:
                     raise Exception(f"Unrecognized data type {col.dtype.name}.")
 
@@ -1051,11 +1074,15 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
             yield supervised
 
         if cache_dir is not None:
-            _, dataset_names_stream = BytesParser.serialize({"dataset_names": dataset_names})
+            _, dataset_names_stream = BytesParser.serialize(
+                {"dataset_names": dataset_names}
+            )
             update_cache(cache_dir, [dataset_names_filename], [dataset_names_stream])
     else:
         dataset_names_stream = dataset_names_stream[0]
-        dataset_names = BytesParser.deserialize(MIMETYPE_JSON, dataset_names_stream)["dataset_names"]
+        dataset_names = BytesParser.deserialize(MIMETYPE_JSON, dataset_names_stream)[
+            "dataset_names"
+        ]
         for name in tqdm(dataset_names, desc=source):
             X_name = f"{name}.X.parquet"
             y_name = f"{name}.y.parquet"
@@ -1064,7 +1091,10 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
             supervised = SupervisedDataset.deserialize(*cached)
             yield supervised
 
-def retrieve_openml_automl_regression(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_openml_automl_regression(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrieves OpenML AutoML regression datasets.
 
     Args:
@@ -1076,7 +1106,10 @@ def retrieve_openml_automl_regression(cache_dir: str = None) -> Generator[Superv
 
     return retrieve_openml(cache_dir, 269, "openml_automl_regression")
 
-def retrieve_openml_automl_classification(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_openml_automl_classification(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrieves OpenML AutoML classification datasets.
 
     Args:
@@ -1088,7 +1121,10 @@ def retrieve_openml_automl_classification(cache_dir: str = None) -> Generator[Su
 
     return retrieve_openml(cache_dir, 271, "openml_automl_classification")
 
-def retrieve_openml_cc18(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_openml_cc18(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrieves OpenML CC18 datasets.
 
     Args:
@@ -1100,7 +1136,10 @@ def retrieve_openml_cc18(cache_dir: str = None) -> Generator[SupervisedDataset,
 
     return retrieve_openml(cache_dir, 99, "openml_cc18")
 
-def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_catboost_50k(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrieves catboost regression and classification datasets that have less than 50k training instances.
 
     Does not download the adult dataset, as there are currently some download issues.
@@ -1125,39 +1164,39 @@ def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset,
             "name": "amazon",
             "data_fn": amazon,
             "problem": "classification",
-            "target": "ACTION"
+            "target": "ACTION",
         },
         {
             "name": "msrank_10k",
             "data_fn": msrank_10k,
             "problem": "regression",
-            "target": 0
+            "target": 0,
         },
         {
             "name": "titanic",
             "data_fn": titanic,
             "problem": "classification",
-            "target": "Survived"
+            "target": "Survived",
         },
     ]
 
     if cache_dir is not None:
         cache_dir = pathlib.Path(cache_dir, "catboost_50k")
 
     for dataset in tqdm(datasets, desc="catboost_50k"):
-        name = dataset['name']
+        name = dataset["name"]
         X_name = f"{name}.X.parquet"
         y_name = f"{name}.y.parquet"
         meta_name = f"{name}.meta.json"
 
         cached = retrieve_cache(cache_dir, [X_name, y_name, meta_name])
         if cached is None:
-            df = dataset['data_fn']()[0]
-            target = dataset['target']
+            df = dataset["data_fn"]()[0]
+            target = dataset["target"]
             X = df.drop(target, axis=1)
             y = df[target]
-            problem = dataset['problem']
-            if dataset['problem'] == "classification":
+            problem = dataset["problem"]
+            if dataset["problem"] == "classification":
                 problem = "binary" if len(y.unique()) == 2 else "multiclass"
             meta = {
                 "name": name,

diff --git a/python/powerlift/src/powerlift/db/schema.py b/python/powerlift/src/powerlift/db/schema.py
@@ -198,7 +198,9 @@ class Task(Base):
     measure_outcomes = relationship(
         "MeasureOutcome", secondary=task_measure_outcome_table, back_populates="tasks"
     )
-    __table_args__ = (UniqueConstraint("name", "problem", "origin", name="u_name_problem_origin"),)
+    __table_args__ = (
+        UniqueConstraint("name", "problem", "origin", name="u_name_problem_origin"),
+    )
 
 
 class Asset(Base):

diff --git a/python/powerlift/src/powerlift/executors/azure_ci.py b/python/powerlift/src/powerlift/executors/azure_ci.py
@@ -25,7 +25,15 @@ def _wait_for_completed_worker(results):
         time.sleep(1)
 
 
-def _run(tasks, azure_json, num_cores, mem_size_gb, n_running_containers, delete_group_container_on_complete, batch_id):
+def _run(
+    tasks,
+    azure_json,
+    num_cores,
+    mem_size_gb,
+    n_running_containers,
+    delete_group_container_on_complete,
+    batch_id,
+):
     from azure.mgmt.containerinstance.models import (
         ContainerGroup,
         Container,
@@ -131,7 +139,7 @@ def __init__(
         wheel_filepaths: List[str] = None,
         docker_db_uri: str = None,
         raise_exception: bool = False,
-        delete_group_container_on_complete: bool = True
+        delete_group_container_on_complete: bool = True,
     ):
         """Runs remote execution of trials via Azure Container Instances.
 
@@ -168,7 +176,12 @@ def __init__(
             "resource_group": resource_group,
         }
         self._batch_id = random.getrandbits(64)
-        super().__init__(store=store, n_cpus=1, raise_exception=raise_exception, wheel_filepaths=wheel_filepaths)
+        super().__init__(
+            store=store,
+            n_cpus=1,
+            raise_exception=raise_exception,
+            wheel_filepaths=wheel_filepaths,
+        )
 
     def delete_credentials(self):
         """Deletes credentials in object for accessing Azure Resources."""

diff --git a/python/powerlift/src/powerlift/executors/docker.py b/python/powerlift/src/powerlift/executors/docker.py
@@ -48,7 +48,7 @@ def __init__(
         n_running_containers: int = None,
         wheel_filepaths: List[str] = None,
         docker_db_uri: str = None,
-        raise_exception: bool = False
+        raise_exception: bool = False,
     ):
         """Runs trials in local docker containers.
 
@@ -62,7 +62,12 @@ def __init__(
         """
         self._docker_db_uri = docker_db_uri
         self._image = image
-        super().__init__(store=store, n_cpus=n_running_containers, raise_exception=raise_exception, wheel_filepaths=wheel_filepaths)
+        super().__init__(
+            store=store,
+            n_cpus=n_running_containers,
+            raise_exception=raise_exception,
+            wheel_filepaths=wheel_filepaths,
+        )
 
     def submit(self, trial_run_fn, trials: Iterable, timeout=None):
         uri = (