Skip to content

Commit

Permalink
reformatting
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Jul 28, 2024
1 parent 356e589 commit c438070
Show file tree
Hide file tree
Showing 18 changed files with 214 additions and 159 deletions.
48 changes: 24 additions & 24 deletions python/interpret-core/interpret/glassbox/_ebm/_ebm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2439,20 +2439,20 @@ class ExplainableBoostingClassifier(EBMModel, ClassifierMixin, ExplainerMixin):
Number of rounds with no improvement to trigger early stopping. 0 turns off
early stopping and boosting will occur for exactly max_rounds.
early_stopping_tolerance : float, default=1e-4
Tolerance that dictates the smallest delta required to be considered an
Tolerance that dictates the smallest delta required to be considered an
improvement which prevents the algorithm from early stopping.
early_stopping_tolerance is expressed as a percentage of the early
stopping metric. Negative values indicate that the individual
early_stopping_tolerance is expressed as a percentage of the early
stopping metric. Negative values indicate that the individual
models should be overfit before stopping.
EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
to zero (or even negative), allows learning to overfit each of the individual
models a little, which can improve the accuracy of the ensemble as a whole.
Overfitting each of the individual models reduces the bias of each model at
the expense of increasing the variance (due to overfitting) of the individual
models. But averaging the models in the ensemble reduces variance without
much change in bias. Since the goal is to find the optimum bias-variance
tradeoff for the ensemble of models --- not the individual models --- a small
amount of overfitting of the individual models can improve the accuracy of
EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
to zero (or even negative), allows learning to overfit each of the individual
models a little, which can improve the accuracy of the ensemble as a whole.
Overfitting each of the individual models reduces the bias of each model at
the expense of increasing the variance (due to overfitting) of the individual
models. But averaging the models in the ensemble reduces variance without
much change in bias. Since the goal is to find the optimum bias-variance
tradeoff for the ensemble of models --- not the individual models --- a small
amount of overfitting of the individual models can improve the accuracy of
the ensemble as a whole.
min_samples_leaf : int, default=2
Minimum number of samples allowed in the leaves.
Expand Down Expand Up @@ -2774,20 +2774,20 @@ class ExplainableBoostingRegressor(EBMModel, RegressorMixin, ExplainerMixin):
Number of rounds with no improvement to trigger early stopping. 0 turns off
early stopping and boosting will occur for exactly max_rounds.
early_stopping_tolerance : float, default=1e-4
Tolerance that dictates the smallest delta required to be considered an
Tolerance that dictates the smallest delta required to be considered an
improvement which prevents the algorithm from early stopping.
early_stopping_tolerance is expressed as a percentage of the early
stopping metric. Negative values indicate that the individual
early_stopping_tolerance is expressed as a percentage of the early
stopping metric. Negative values indicate that the individual
models should be overfit before stopping.
EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
to zero (or even negative), allows learning to overfit each of the individual
models a little, which can improve the accuracy of the ensemble as a whole.
Overfitting each of the individual models reduces the bias of each model at
the expense of increasing the variance (due to overfitting) of the individual
models. But averaging the models in the ensemble reduces variance without
much change in bias. Since the goal is to find the optimum bias-variance
tradeoff for the ensemble of models --- not the individual models --- a small
amount of overfitting of the individual models can improve the accuracy of
EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
to zero (or even negative), allows learning to overfit each of the individual
models a little, which can improve the accuracy of the ensemble as a whole.
Overfitting each of the individual models reduces the bias of each model at
the expense of increasing the variance (due to overfitting) of the individual
models. But averaging the models in the ensemble reduces variance without
much change in bias. Since the goal is to find the optimum bias-variance
tradeoff for the ensemble of models --- not the individual models --- a small
amount of overfitting of the individual models can improve the accuracy of
the ensemble as a whole.
min_samples_leaf : int, default=2
Minimum number of samples allowed in the leaves.
Expand Down
9 changes: 8 additions & 1 deletion python/powerlift/src/powerlift/bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,11 @@
from powerlift.bench.benchmark import Benchmark

from powerlift.bench.store import populate_with_datasets, DatasetAlreadyExistsError
from powerlift.bench.store import retrieve_openml_automl_regression, retrieve_openml_automl_classification, retrieve_openml_cc18, retrieve_openml, retrieve_catboost_50k, retrieve_pmlb
from powerlift.bench.store import (
retrieve_openml_automl_regression,
retrieve_openml_automl_classification,
retrieve_openml_cc18,
retrieve_openml,
retrieve_catboost_50k,
retrieve_pmlb,
)
91 changes: 65 additions & 26 deletions python/powerlift/src/powerlift/bench/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ def serialize(cls, obj):
mimetype = MIMETYPE_WHEEL
else:
return None, None


return mimetype, bstream

Expand Down Expand Up @@ -923,8 +922,10 @@ def name(self):

class DatasetAlreadyExistsError(Exception):
"""Raised when dataset already exists in store."""

pass


def populate_with_datasets(
store: Store,
dataset_iter: Iterable[Dataset] = None,
Expand All @@ -947,7 +948,8 @@ def populate_with_datasets(

if dataset_iter is None:
dataset_iter = chain(
retrieve_openml_automl_regression(cache_dir=cache_dir), retrieve_openml_automl_classification(cache_dir=cache_dir)
retrieve_openml_automl_regression(cache_dir=cache_dir),
retrieve_openml_automl_classification(cache_dir=cache_dir),
)

for dataset in dataset_iter:
Expand All @@ -963,7 +965,9 @@ def populate_with_datasets(
return True


def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str = "openml") -> Generator[SupervisedDataset, None, None]:
def retrieve_openml(
cache_dir: str = None, suite_id: int | str = 99, source: str = "openml"
) -> Generator[SupervisedDataset, None, None]:
"""Retrieves OpenML datasets.
Args:
Expand All @@ -978,7 +982,7 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str

if cache_dir is not None:
cache_dir = pathlib.Path(cache_dir, source)

dataset_names_filename = "dataset_names.json"
dataset_names_stream = retrieve_cache(cache_dir, [dataset_names_filename])
if dataset_names_stream is None:
Expand All @@ -987,8 +991,19 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
tasks = suite.tasks.copy()
random.Random(1337).shuffle(tasks)
for task_id in tqdm(tasks, desc=source):
task = openml.tasks.get_task(task_id, download_splits=False, download_data=False, download_qualities=False, download_features_meta_data=False)
dataset = openml.datasets.get_dataset(task.dataset_id, download_data=True, download_qualities=True, download_features_meta_data=True)
task = openml.tasks.get_task(
task_id,
download_splits=False,
download_data=False,
download_qualities=False,
download_features_meta_data=False,
)
dataset = openml.datasets.get_dataset(
task.dataset_id,
download_data=True,
download_qualities=True,
download_features_meta_data=True,
)
name = dataset.name
dataset_names.append(name)

Expand All @@ -1002,7 +1017,9 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str

if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
problem = (
"binary" if dataset.qualities["NumberOfClasses"] == 2 else "multiclass"
"binary"
if dataset.qualities["NumberOfClasses"] == 2
else "multiclass"
)

# for benchmarking we do not care about the original target strings
Expand All @@ -1019,20 +1036,26 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
col = col.sparse.to_dense()
X[col_name] = col

if col.dtype.name == 'category':
if col.dtype.name == "category":
if not cat:
raise Exception(f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical.")
raise Exception(
f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
)
if col.cat.ordered:
# OpenML incorrectly marks these columns as ordered
X[col_name] = col.cat.as_unordered()
elif col.dtype.name == 'object':
elif col.dtype.name == "object":
if cat:
X[col_name] = col.astype(pd.CategoricalDtype(ordered=False))
else:
X[col_name] = col.astype(float)
elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(col.dtype, np.integer):
elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(
col.dtype, np.integer
):
if cat:
raise Exception(f"Categorical type mismatch. Was continuous but indicated categorical.")
raise Exception(
f"Categorical type mismatch. Was continuous but indicated categorical."
)
else:
raise Exception(f"Unrecognized data type {col.dtype.name}.")

Expand All @@ -1051,11 +1074,15 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
yield supervised

if cache_dir is not None:
_, dataset_names_stream = BytesParser.serialize({"dataset_names": dataset_names})
_, dataset_names_stream = BytesParser.serialize(
{"dataset_names": dataset_names}
)
update_cache(cache_dir, [dataset_names_filename], [dataset_names_stream])
else:
dataset_names_stream = dataset_names_stream[0]
dataset_names = BytesParser.deserialize(MIMETYPE_JSON, dataset_names_stream)["dataset_names"]
dataset_names = BytesParser.deserialize(MIMETYPE_JSON, dataset_names_stream)[
"dataset_names"
]
for name in tqdm(dataset_names, desc=source):
X_name = f"{name}.X.parquet"
y_name = f"{name}.y.parquet"
Expand All @@ -1064,7 +1091,10 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
supervised = SupervisedDataset.deserialize(*cached)
yield supervised

def retrieve_openml_automl_regression(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:

def retrieve_openml_automl_regression(
cache_dir: str = None,
) -> Generator[SupervisedDataset, None, None]:
"""Retrieves OpenML AutoML regression datasets.
Args:
Expand All @@ -1076,7 +1106,10 @@ def retrieve_openml_automl_regression(cache_dir: str = None) -> Generator[Superv

return retrieve_openml(cache_dir, 269, "openml_automl_regression")

def retrieve_openml_automl_classification(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:

def retrieve_openml_automl_classification(
cache_dir: str = None,
) -> Generator[SupervisedDataset, None, None]:
"""Retrieves OpenML AutoML classification datasets.
Args:
Expand All @@ -1088,7 +1121,10 @@ def retrieve_openml_automl_classification(cache_dir: str = None) -> Generator[Su

return retrieve_openml(cache_dir, 271, "openml_automl_classification")

def retrieve_openml_cc18(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:

def retrieve_openml_cc18(
cache_dir: str = None,
) -> Generator[SupervisedDataset, None, None]:
"""Retrieves OpenML CC18 datasets.
Args:
Expand All @@ -1100,7 +1136,10 @@ def retrieve_openml_cc18(cache_dir: str = None) -> Generator[SupervisedDataset,

return retrieve_openml(cache_dir, 99, "openml_cc18")

def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:

def retrieve_catboost_50k(
cache_dir: str = None,
) -> Generator[SupervisedDataset, None, None]:
"""Retrieves catboost regression and classification datasets that have less than 50k training instances.
Does not download the adult dataset, as there are currently some download issues.
Expand All @@ -1125,39 +1164,39 @@ def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset,
"name": "amazon",
"data_fn": amazon,
"problem": "classification",
"target": "ACTION"
"target": "ACTION",
},
{
"name": "msrank_10k",
"data_fn": msrank_10k,
"problem": "regression",
"target": 0
"target": 0,
},
{
"name": "titanic",
"data_fn": titanic,
"problem": "classification",
"target": "Survived"
"target": "Survived",
},
]

if cache_dir is not None:
cache_dir = pathlib.Path(cache_dir, "catboost_50k")

for dataset in tqdm(datasets, desc="catboost_50k"):
name = dataset['name']
name = dataset["name"]
X_name = f"{name}.X.parquet"
y_name = f"{name}.y.parquet"
meta_name = f"{name}.meta.json"

cached = retrieve_cache(cache_dir, [X_name, y_name, meta_name])
if cached is None:
df = dataset['data_fn']()[0]
target = dataset['target']
df = dataset["data_fn"]()[0]
target = dataset["target"]
X = df.drop(target, axis=1)
y = df[target]
problem = dataset['problem']
if dataset['problem'] == "classification":
problem = dataset["problem"]
if dataset["problem"] == "classification":
problem = "binary" if len(y.unique()) == 2 else "multiclass"
meta = {
"name": name,
Expand Down
4 changes: 3 additions & 1 deletion python/powerlift/src/powerlift/db/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,9 @@ class Task(Base):
measure_outcomes = relationship(
"MeasureOutcome", secondary=task_measure_outcome_table, back_populates="tasks"
)
__table_args__ = (UniqueConstraint("name", "problem", "origin", name="u_name_problem_origin"),)
__table_args__ = (
UniqueConstraint("name", "problem", "origin", name="u_name_problem_origin"),
)


class Asset(Base):
Expand Down
19 changes: 16 additions & 3 deletions python/powerlift/src/powerlift/executors/azure_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,15 @@ def _wait_for_completed_worker(results):
time.sleep(1)


def _run(tasks, azure_json, num_cores, mem_size_gb, n_running_containers, delete_group_container_on_complete, batch_id):
def _run(
tasks,
azure_json,
num_cores,
mem_size_gb,
n_running_containers,
delete_group_container_on_complete,
batch_id,
):
from azure.mgmt.containerinstance.models import (
ContainerGroup,
Container,
Expand Down Expand Up @@ -131,7 +139,7 @@ def __init__(
wheel_filepaths: List[str] = None,
docker_db_uri: str = None,
raise_exception: bool = False,
delete_group_container_on_complete: bool = True
delete_group_container_on_complete: bool = True,
):
"""Runs remote execution of trials via Azure Container Instances.
Expand Down Expand Up @@ -168,7 +176,12 @@ def __init__(
"resource_group": resource_group,
}
self._batch_id = random.getrandbits(64)
super().__init__(store=store, n_cpus=1, raise_exception=raise_exception, wheel_filepaths=wheel_filepaths)
super().__init__(
store=store,
n_cpus=1,
raise_exception=raise_exception,
wheel_filepaths=wheel_filepaths,
)

def delete_credentials(self):
"""Deletes credentials in object for accessing Azure Resources."""
Expand Down
9 changes: 7 additions & 2 deletions python/powerlift/src/powerlift/executors/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
n_running_containers: int = None,
wheel_filepaths: List[str] = None,
docker_db_uri: str = None,
raise_exception: bool = False
raise_exception: bool = False,
):
"""Runs trials in local docker containers.
Expand All @@ -62,7 +62,12 @@ def __init__(
"""
self._docker_db_uri = docker_db_uri
self._image = image
super().__init__(store=store, n_cpus=n_running_containers, raise_exception=raise_exception, wheel_filepaths=wheel_filepaths)
super().__init__(
store=store,
n_cpus=n_running_containers,
raise_exception=raise_exception,
wheel_filepaths=wheel_filepaths,
)

def submit(self, trial_run_fn, trials: Iterable, timeout=None):
uri = (
Expand Down
Loading

0 comments on commit c438070

Please sign in to comment.