diff --git a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
index 171c8004a..42dec9451 100644
--- a/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
+++ b/python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -2439,20 +2439,20 @@ class ExplainableBoostingClassifier(EBMModel, ClassifierMixin, ExplainerMixin):
         Number of rounds with no improvement to trigger early stopping. 0 turns off
         early stopping and boosting will occur for exactly max_rounds.
     early_stopping_tolerance : float, default=1e-4
-        Tolerance that dictates the smallest delta required to be considered an 
+        Tolerance that dictates the smallest delta required to be considered an
         improvement which prevents the algorithm from early stopping.
-        early_stopping_tolerance is expressed as a percentage of the early 
-        stopping metric. Negative values indicate that the individual 
+        early_stopping_tolerance is expressed as a percentage of the early
+        stopping metric. Negative values indicate that the individual
         models should be overfit before stopping.
-        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance 
-        to zero (or even negative), allows learning to overfit each of the individual 
-        models a little, which can improve the accuracy of the ensemble as a whole. 
-        Overfitting each of the individual models reduces the bias of each model at 
-        the expense of increasing the variance (due to overfitting) of the individual 
-        models. But averaging the models in the ensemble reduces variance without 
-        much change in bias. Since the goal is to find the optimum bias-variance 
-        tradeoff for the ensemble of models --- not the individual models --- a small 
-        amount of overfitting of the individual models can improve the accuracy of 
+        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
+        to zero (or even negative), allows learning to overfit each of the individual
+        models a little, which can improve the accuracy of the ensemble as a whole.
+        Overfitting each of the individual models reduces the bias of each model at
+        the expense of increasing the variance (due to overfitting) of the individual
+        models. But averaging the models in the ensemble reduces variance without
+        much change in bias. Since the goal is to find the optimum bias-variance
+        tradeoff for the ensemble of models --- not the individual models --- a small
+        amount of overfitting of the individual models can improve the accuracy of
         the ensemble as a whole.
     min_samples_leaf : int, default=2
         Minimum number of samples allowed in the leaves.
@@ -2774,20 +2774,20 @@ class ExplainableBoostingRegressor(EBMModel, RegressorMixin, ExplainerMixin):
         Number of rounds with no improvement to trigger early stopping. 0 turns off
         early stopping and boosting will occur for exactly max_rounds.
     early_stopping_tolerance : float, default=1e-4
-        Tolerance that dictates the smallest delta required to be considered an 
+        Tolerance that dictates the smallest delta required to be considered an
        improvement which prevents the algorithm from early stopping.
-        early_stopping_tolerance is expressed as a percentage of the early 
-        stopping metric. Negative values indicate that the individual 
+        early_stopping_tolerance is expressed as a percentage of the early
+        stopping metric. Negative values indicate that the individual
         models should be overfit before stopping.
-        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance 
-        to zero (or even negative), allows learning to overfit each of the individual 
-        models a little, which can improve the accuracy of the ensemble as a whole. 
-        Overfitting each of the individual models reduces the bias of each model at 
-        the expense of increasing the variance (due to overfitting) of the individual 
-        models. But averaging the models in the ensemble reduces variance without 
-        much change in bias. Since the goal is to find the optimum bias-variance 
-        tradeoff for the ensemble of models --- not the individual models --- a small 
-        amount of overfitting of the individual models can improve the accuracy of 
+        EBMs are a bagged ensemble of models. Setting the early_stopping_tolerance
+        to zero (or even negative), allows learning to overfit each of the individual
+        models a little, which can improve the accuracy of the ensemble as a whole.
+        Overfitting each of the individual models reduces the bias of each model at
+        the expense of increasing the variance (due to overfitting) of the individual
+        models. But averaging the models in the ensemble reduces variance without
+        much change in bias. Since the goal is to find the optimum bias-variance
+        tradeoff for the ensemble of models --- not the individual models --- a small
+        amount of overfitting of the individual models can improve the accuracy of
         the ensemble as a whole.
     min_samples_leaf : int, default=2
         Minimum number of samples allowed in the leaves.
diff --git a/python/powerlift/src/powerlift/bench/__init__.py b/python/powerlift/src/powerlift/bench/__init__.py
index e072bb8a3..41738a083 100644
--- a/python/powerlift/src/powerlift/bench/__init__.py
+++ b/python/powerlift/src/powerlift/bench/__init__.py
@@ -16,4 +16,11 @@
 from powerlift.bench.benchmark import Benchmark
 from powerlift.bench.store import populate_with_datasets, DatasetAlreadyExistsError
-from powerlift.bench.store import retrieve_openml_automl_regression, retrieve_openml_automl_classification, retrieve_openml_cc18, retrieve_openml, retrieve_catboost_50k, retrieve_pmlb
+from powerlift.bench.store import (
+    retrieve_openml_automl_regression,
+    retrieve_openml_automl_classification,
+    retrieve_openml_cc18,
+    retrieve_openml,
+    retrieve_catboost_50k,
+    retrieve_pmlb,
+)
diff --git a/python/powerlift/src/powerlift/bench/store.py b/python/powerlift/src/powerlift/bench/store.py
index feac729b2..a76a00b96 100644
--- a/python/powerlift/src/powerlift/bench/store.py
+++ b/python/powerlift/src/powerlift/bench/store.py
@@ -150,7 +150,6 @@ def serialize(cls, obj):
             mimetype = MIMETYPE_WHEEL
         else:
             return None, None
-
         return mimetype, bstream


@@ -923,8 +922,10 @@ def name(self):

 class DatasetAlreadyExistsError(Exception):
     """Raised when dataset already exists in store."""
+
     pass

+
 def populate_with_datasets(
     store: Store,
     dataset_iter: Iterable[Dataset] = None,
@@ -947,7 +948,8 @@

     if dataset_iter is None:
         dataset_iter = chain(
-            retrieve_openml_automl_regression(cache_dir=cache_dir), retrieve_openml_automl_classification(cache_dir=cache_dir)
+            retrieve_openml_automl_regression(cache_dir=cache_dir),
+            retrieve_openml_automl_classification(cache_dir=cache_dir),
         )

     for dataset in dataset_iter:
@@ -963,7 +965,9 @@
     return True


-def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str = "openml") -> Generator[SupervisedDataset, None, None]:
+def retrieve_openml(
+    cache_dir: str = None, suite_id: int | str = 99, source: str = "openml"
+) -> Generator[SupervisedDataset, None, None]:
     """Retrives OpenML datasets.

     Args:
@@ -978,7 +982,7 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str

     if cache_dir is not None:
         cache_dir = pathlib.Path(cache_dir, source)
-    
+
     dataset_names_filename = "dataset_names.json"
     dataset_names_stream = retrieve_cache(cache_dir, [dataset_names_filename])
     if dataset_names_stream is None:
@@ -987,8 +991,19 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
         tasks = suite.tasks.copy()
         random.Random(1337).shuffle(tasks)
         for task_id in tqdm(tasks, desc=source):
-            task = openml.tasks.get_task(task_id, download_splits=False, download_data=False, download_qualities=False, download_features_meta_data=False)
-            dataset = openml.datasets.get_dataset(task.dataset_id, download_data=True, download_qualities=True, download_features_meta_data=True)
+            task = openml.tasks.get_task(
+                task_id,
+                download_splits=False,
+                download_data=False,
+                download_qualities=False,
+                download_features_meta_data=False,
+            )
+            dataset = openml.datasets.get_dataset(
+                task.dataset_id,
+                download_data=True,
+                download_qualities=True,
+                download_features_meta_data=True,
+            )
             name = dataset.name
             dataset_names.append(name)
@@ -1002,7 +1017,9 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str

             if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
                 problem = (
-                    "binary" if dataset.qualities["NumberOfClasses"] == 2 else "multiclass"
+                    "binary"
+                    if dataset.qualities["NumberOfClasses"] == 2
+                    else "multiclass"
                 )

                 # for benchmarking we do not care about the original target strings
@@ -1019,20 +1036,26 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
                     col = col.sparse.to_dense()
                     X[col_name] = col

-                if col.dtype.name == 'category':
+                if col.dtype.name == "category":
                     if not cat:
-                        raise Exception(f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical.")
+                        raise Exception(
+                            f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
+                        )
                     if col.cat.ordered:
                         # OpenMl incorrectly is indicating these as ordered
                         X[col_name] = col.cat.as_unordered()
-                elif col.dtype.name == 'object':
+                elif col.dtype.name == "object":
                     if cat:
                         X[col_name] = col.astype(pd.CategoricalDtype(ordered=False))
                     else:
                         X[col_name] = col.astype(float)
-                elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(col.dtype, np.integer):
+                elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(
+                    col.dtype, np.integer
+                ):
                     if cat:
-                        raise Exception(f"Categorical type mismatch. Was continuous but indicated categorical.")
+                        raise Exception(
+                            f"Categorical type mismatch. Was continuous but indicated categorical."
+                        )
                 else:
                     raise Exception(f"Unrecognized data type {col.dtype.name}.")
@@ -1051,11 +1074,15 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
             yield supervised

     if cache_dir is not None:
-            _, dataset_names_stream = BytesParser.serialize({"dataset_names": dataset_names})
+            _, dataset_names_stream = BytesParser.serialize(
+                {"dataset_names": dataset_names}
+            )
             update_cache(cache_dir, [dataset_names_filename], [dataset_names_stream])
     else:
         dataset_names_stream = dataset_names_stream[0]
-        dataset_names = BytesParser.deserialize(MIMETYPE_JSON, dataset_names_stream)["dataset_names"]
+        dataset_names = BytesParser.deserialize(MIMETYPE_JSON, dataset_names_stream)[
+            "dataset_names"
+        ]
         for name in tqdm(dataset_names, desc=source):
             X_name = f"{name}.X.parquet"
             y_name = f"{name}.y.parquet"
@@ -1064,7 +1091,10 @@ def retrieve_openml(cache_dir: str = None, suite_id: int | str = 99, source: str
             supervised = SupervisedDataset.deserialize(*cached)
             yield supervised

-def retrieve_openml_automl_regression(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_openml_automl_regression(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrives OpenML AutoML regression datasets.

     Args:
@@ -1076,7 +1106,10 @@ def retrieve_openml_automl_regression(cache_dir: str = None) -> Generator[Superv

     return retrieve_openml(cache_dir, 269, "openml_automl_regression")

-def retrieve_openml_automl_classification(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_openml_automl_classification(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrives OpenML AutoML classification datasets.

     Args:
@@ -1088,7 +1121,10 @@ def retrieve_openml_automl_classification(cache_dir: str = None) -> Generator[Su

     return retrieve_openml(cache_dir, 271, "openml_automl_classification")

-def retrieve_openml_cc18(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_openml_cc18(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrives OpenML CC18 datasets.

     Args:
@@ -1100,7 +1136,10 @@ def retrieve_openml_cc18(cache_dir: str = None) -> Generator[SupervisedDataset,

     return retrieve_openml(cache_dir, 99, "openml_cc18")

-def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset, None, None]:
+
+def retrieve_catboost_50k(
+    cache_dir: str = None,
+) -> Generator[SupervisedDataset, None, None]:
     """Retrieves catboost regression and classification datasets that
     have less than 50k training instances.

     Does not download adult dataset as currently there some download issues.
@@ -1125,19 +1164,19 @@ def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset,
             "name": "amazon",
             "data_fn": amazon,
             "problem": "classification",
-            "target": "ACTION"
+            "target": "ACTION",
         },
         {
             "name": "msrank_10k",
             "data_fn": msrank_10k,
             "problem": "regression",
-            "target": 0
+            "target": 0,
         },
         {
             "name": "titanic",
             "data_fn": titanic,
             "problem": "classification",
-            "target": "Survived"
+            "target": "Survived",
         },
     ]

@@ -1145,19 +1184,19 @@ def retrieve_catboost_50k(cache_dir: str = None) -> Generator[SupervisedDataset,
         cache_dir = pathlib.Path(cache_dir, "catboost_50k")

     for dataset in tqdm(datasets, desc="catboost_50k"):
-        name = dataset['name']
+        name = dataset["name"]
         X_name = f"{name}.X.parquet"
         y_name = f"{name}.y.parquet"
         meta_name = f"{name}.meta.json"
         cached = retrieve_cache(cache_dir, [X_name, y_name, meta_name])

         if cached is None:
-            df = dataset['data_fn']()[0]
-            target = dataset['target']
+            df = dataset["data_fn"]()[0]
+            target = dataset["target"]
             X = df.drop(target, axis=1)
             y = df[target]
-            problem = dataset['problem']
-            if dataset['problem'] == "classification":
+            problem = dataset["problem"]
+            if dataset["problem"] == "classification":
                 problem = "binary" if len(y.unique()) == 2 else "multiclass"
             meta = {
                 "name": name,
diff --git a/python/powerlift/src/powerlift/db/schema.py b/python/powerlift/src/powerlift/db/schema.py
index 37032d58c..58cfe9699 100644
--- a/python/powerlift/src/powerlift/db/schema.py
+++ b/python/powerlift/src/powerlift/db/schema.py
@@ -198,7 +198,9 @@ class Task(Base):
     measure_outcomes = relationship(
         "MeasureOutcome", secondary=task_measure_outcome_table, back_populates="tasks"
     )
-    __table_args__ = (UniqueConstraint("name", "problem", "origin", name="u_name_problem_origin"),)
+    __table_args__ = (
+        UniqueConstraint("name", "problem", "origin", name="u_name_problem_origin"),
+    )


 class Asset(Base):
diff --git a/python/powerlift/src/powerlift/executors/azure_ci.py b/python/powerlift/src/powerlift/executors/azure_ci.py
index 11d881c3b..e2a619927 100644
--- a/python/powerlift/src/powerlift/executors/azure_ci.py
+++ b/python/powerlift/src/powerlift/executors/azure_ci.py
@@ -25,7 +25,15 @@ def _wait_for_completed_worker(results):
             time.sleep(1)


-def _run(tasks, azure_json, num_cores, mem_size_gb, n_running_containers, delete_group_container_on_complete, batch_id):
+def _run(
+    tasks,
+    azure_json,
+    num_cores,
+    mem_size_gb,
+    n_running_containers,
+    delete_group_container_on_complete,
+    batch_id,
+):
     from azure.mgmt.containerinstance.models import (
         ContainerGroup,
         Container,
@@ -131,7 +139,7 @@ def __init__(
         wheel_filepaths: List[str] = None,
         docker_db_uri: str = None,
         raise_exception: bool = False,
-        delete_group_container_on_complete: bool = True
+        delete_group_container_on_complete: bool = True,
     ):
         """Runs remote execution of trials via Azure Container Instances.

@@ -168,7 +176,12 @@ def __init__(
             "resource_group": resource_group,
         }
         self._batch_id = random.getrandbits(64)
-        super().__init__(store=store, n_cpus=1, raise_exception=raise_exception, wheel_filepaths=wheel_filepaths)
+        super().__init__(
+            store=store,
+            n_cpus=1,
+            raise_exception=raise_exception,
+            wheel_filepaths=wheel_filepaths,
+        )

     def delete_credentials(self):
         """Deletes credentials in object for accessing Azure Resources."""
diff --git a/python/powerlift/src/powerlift/executors/docker.py b/python/powerlift/src/powerlift/executors/docker.py
index 6c03d82d5..98cb68bfc 100644
--- a/python/powerlift/src/powerlift/executors/docker.py
+++ b/python/powerlift/src/powerlift/executors/docker.py
@@ -48,7 +48,7 @@ def __init__(
         n_running_containers: int = None,
         wheel_filepaths: List[str] = None,
         docker_db_uri: str = None,
-        raise_exception: bool = False
+        raise_exception: bool = False,
     ):
         """Runs trials in local docker containers.

@@ -62,7 +62,12 @@ def __init__(
         """
         self._docker_db_uri = docker_db_uri
         self._image = image
-        super().__init__(store=store, n_cpus=n_running_containers, raise_exception=raise_exception, wheel_filepaths=wheel_filepaths)
+        super().__init__(
+            store=store,
+            n_cpus=n_running_containers,
+            raise_exception=raise_exception,
+            wheel_filepaths=wheel_filepaths,
+        )

     def submit(self, trial_run_fn, trials: Iterable, timeout=None):
         uri = (
diff --git a/python/powerlift/src/powerlift/executors/localmachine.py b/python/powerlift/src/powerlift/executors/localmachine.py
index 6ee2c8a5c..b416af7ca 100644
--- a/python/powerlift/src/powerlift/executors/localmachine.py
+++ b/python/powerlift/src/powerlift/executors/localmachine.py
@@ -17,7 +17,7 @@ def __init__(
         n_cpus: int = None,
         debug_mode: bool = False,
         wheel_filepaths: List[str] = None,
-        raise_exception: bool = False
+        raise_exception: bool = False,
     ):
         """Runs trial runs on the local machine.

@@ -69,7 +69,12 @@ def submit(self, trial_run_fn, trials: Iterable, timeout=None):
             else:
                 self._trial_id_to_result[trial.id] = self._pool.apply_async(
                     runner.run_trials,
-                    ([trial.id], self._store.uri, timeout, self._raise_exception or self._debug_mode),
+                    (
+                        [trial.id],
+                        self._store.uri,
+                        timeout,
+                        self._raise_exception or self._debug_mode,
+                    ),
                     error_callback=handle_err,
                 )
diff --git a/python/powerlift/tests/conftest.py b/python/powerlift/tests/conftest.py
index 32468b036..fc4dabb8d 100644
--- a/python/powerlift/tests/conftest.py
+++ b/python/powerlift/tests/conftest.py
@@ -22,6 +22,7 @@ def dataset_limit():
 @pytest.fixture(scope="session")
 def uri():
     from dotenv import load_dotenv
+
     load_dotenv()

     pw = os.environ.get("TEST_DB_PASS", None)
@@ -34,6 +35,7 @@ def uri():
 @pytest.fixture(scope="session")
 def populated_uri():
     from dotenv import load_dotenv
+
     load_dotenv()

     pw = os.environ.get("TEST_DB_PASS", None)
@@ -55,6 +57,7 @@ def populated_store(populated_uri, dataset_limit):
 def populated_azure_uri():
     from dotenv import load_dotenv
     import os
+
     load_dotenv()
     yield os.getenv("AZURE_DB_URL")

@@ -64,6 +67,7 @@ def populated_azure_uri():
 def populated_docker_uri():
     from dotenv import load_dotenv
     import os
+
     load_dotenv()
     yield os.getenv("DOCKER_DB_URL")
diff --git a/python/powerlift/tests/powerlift/bench/test_experiment.py b/python/powerlift/tests/powerlift/bench/test_experiment.py
index 2dba2dd44..8e341f613 100644
--- a/python/powerlift/tests/powerlift/bench/test_experiment.py
+++ b/python/powerlift/tests/powerlift/bench/test_experiment.py
@@ -32,7 +32,11 @@ def _benchmark(trial):
     from sklearn.compose import ColumnTransformer
     from sklearn.impute import SimpleImputer

-    if trial.task.problem == "binary" and trial.task.origin in ["openml", "pmlb", "catboost_50k"]:
+    if trial.task.problem == "binary" and trial.task.origin in [
+        "openml",
+        "pmlb",
+        "catboost_50k",
+    ]:
         X, y, meta = trial.task.data(["X", "y", "meta"])

         # Holdout split
@@ -107,7 +111,9 @@ def test_scikit_experiment_local(populated_store):
 @pytest.mark.skip("Enable this when testing docker.")
 def test_scikit_experiment_docker(populated_docker_store, populated_docker_uri):
     executor = InsecureDocker(
-        populated_docker_store, n_running_containers=2, docker_db_uri=populated_docker_uri
+        populated_docker_store,
+        n_running_containers=2,
+        docker_db_uri=populated_docker_uri,
     )
     benchmark = Benchmark(populated_docker_store, name="scikit_docker")
     benchmark.run(_benchmark, _trials, timeout=60, executor=executor)
@@ -143,7 +149,7 @@ def test_scikit_experiment_aci(populated_azure_store):
         n_running_containers=5,
         num_cores=2,
         mem_size_gb=8,
-        delete_group_container_on_complete=False
+        delete_group_container_on_complete=False,
     )
     benchmark = Benchmark(store, name="scikit")
     benchmark.run(_benchmark, _trials, timeout=60, executor=executor)
diff --git a/shared/libebm/GenerateTermUpdate.cpp b/shared/libebm/GenerateTermUpdate.cpp
index 25fae62e9..a8bbe2599 100644
--- a/shared/libebm/GenerateTermUpdate.cpp
+++ b/shared/libebm/GenerateTermUpdate.cpp
@@ -434,7 +434,7 @@ static ErrorEbm BoostMultiDimensional(BoosterShell* const pBoosterShell,
          }

          if(0 != (TermBoostFlags_PurifyUpdate & flags)) {
-            Tensor * const pTensor = pBoosterShell->GetInnerTermUpdate();
+            Tensor* const pTensor = pBoosterShell->GetInnerTermUpdate();
             size_t cDimensions = pTerm->GetCountDimensions();
             size_t cTensorBinsPurify = 1;
@@ -488,7 +488,7 @@ static ErrorEbm BoostMultiDimensional(BoosterShell* const pBoosterShell,
                pWeights += cTensorBinsPurify;
                ++pScores;
             } while(pScoreMulticlassEnd != pScores);
-            
+
             free(aWeights);
          }

diff --git a/shared/libebm/PartitionTwoDimensionalBoosting.cpp b/shared/libebm/PartitionTwoDimensionalBoosting.cpp
index c6761c188..e01a9517a 100644
--- a/shared/libebm/PartitionTwoDimensionalBoosting.cpp
+++ b/shared/libebm/PartitionTwoDimensionalBoosting.cpp
@@ -139,13 +139,13 @@ static FloatCalc SweepMultiDimensional(const size_t cRuntimeScores,
         goto next;
      }

-   //if (0 != (TermBoostFlags_PurifyGain & flags)) {
-   //   TODO: At this point we have the bin sums histogram for the tensor, so we can purify the future update
-   //   for the cuts we're currently evaluating before calculating the gain. This should give us a more accurate gain
-   //   calculation for the purified update. We need to construct the entire tensor here before purifying.
-   //   We already calculate purified gain as an option during interaction detection, since the
-   //   interaction metric we use is the gain calculation.
-   //   See: Use of CalcInteractionFlags_Purify in PartitionTwoDimensionalInteraction.cpp
+   // if (0 != (TermBoostFlags_PurifyGain & flags)) {
+   //   TODO: At this point we have the bin sums histogram for the tensor, so we can purify the future update
+   //   for the cuts we're currently evaluating before calculating the gain. This should give us a more accurate gain
+   //   calculation for the purified update. We need to construct the entire tensor here before purifying.
+   //   We already calculate purified gain as an option during interaction detection, since the
+   //   interaction metric we use is the gain calculation.
+   //   See: Use of CalcInteractionFlags_Purify in PartitionTwoDimensionalInteraction.cpp
    //}

    {
@@ -904,7 +904,8 @@ template class PartitionTwoDimensionalBoo
                FloatCalc weight;
                if(bUpdateWithHessian) {
                   weight = static_cast(pGradientPairTotal[iScore].GetHess());
-                  update = ComputeSinglePartitionUpdate(static_cast(pGradientPairTotal[iScore].m_sumGradients), weight);
+                  update = ComputeSinglePartitionUpdate(
+                        static_cast(pGradientPairTotal[iScore].m_sumGradients), weight);
                   if(nullptr != aWeights) {
                      aWeights[iScore] = static_cast(weight);
                   }
diff --git a/shared/libebm/PartitionTwoDimensionalInteraction.cpp b/shared/libebm/PartitionTwoDimensionalInteraction.cpp
index 3c7c17501..76613d63f 100644
--- a/shared/libebm/PartitionTwoDimensionalInteraction.cpp
+++ b/shared/libebm/PartitionTwoDimensionalInteraction.cpp
@@ -280,19 +280,19 @@ template class PartitionTwoDimensionalInt
          //
          // TODO: We are purififying the simple 2x2 solution below using a simple system of equations
          // but the solution below can be generalized to handle any size matrix and/or any size
-         // of tensor for 3-way and higher interactions. The system of equations below were solved 
-         // using the substitution/elimination method, but to solve these in the general case we'll 
+         // of tensor for 3-way and higher interactions. The system of equations below were solved
+         // using the substitution/elimination method, but to solve these in the general case we'll
          // need to implement a system of equations solver. First try something like the matrix or
          // inverse matrix method, and if that fails use an iterative solution like the
         // Jacobi or Gauss-Seidel methods. This would be a better solution than the iterative
         // solution that we currently use in the python purify() function.
-         // 
+         //
          // TODO: Once more efficient purification is done, we can use the same purification
-         // method during boosting where we could then keep the interactions pure while we 
-         // simultaneously boost mains and interactions togehter at the same time. This would 
-         // be desirable in order to keep from overboosting on mains that are also included 
+         // method during boosting where we could then keep the interactions pure while we
+         // simultaneously boost mains and interactions togehter at the same time. This would
+         // be desirable in order to keep from overboosting on mains that are also included
          // within interactions.
-         // 
+         //
          // If we have a 2x2 matrix of updates, we can purify the updates using an equation
          // -------------------
          // |update00|update01|
diff --git a/shared/libebm/Purify.cpp b/shared/libebm/Purify.cpp
index 96b1f8b24..aebaad8ec 100644
--- a/shared/libebm/Purify.cpp
+++ b/shared/libebm/Purify.cpp
@@ -4,10 +4,9 @@
 // Purification algorithm from: https://arxiv.org/abs/1911.04974
 //@article {lengerich2019purifying,
-//  title={Purifying Interaction Effects with the Functional ANOVA: An Efficient Algorithm for Recovering Identifiable Additive Models},
-//  author={Lengerich, Benjamin and Tan, Sarah and Chang, Chun-Hao and Hooker, Giles and Caruana, Rich},
-//  journal={arXiv preprint arXiv:1911.04974},
-//  year={2019}
+//  title={Purifying Interaction Effects with the Functional ANOVA: An Efficient Algorithm for Recovering Identifiable
+//  Additive Models}, author={Lengerich, Benjamin and Tan, Sarah and Chang, Chun-Hao and Hooker, Giles and Caruana,
+//  Rich}, journal={arXiv preprint arXiv:1911.04974}, year={2019}
 //}

 #include "pch.hpp"
@@ -216,7 +215,6 @@ EBM_API_BODY double EBM_CALLING_CONVENTION MeasureImpurity(IntEbm countMultiScor
    return impurityTotal;
 }

-
 extern ErrorEbm PurifyInternal(const double tolerance,
       const size_t cScores,
       const size_t cTensorBins,
@@ -1060,11 +1058,10 @@ static void NormalizeClasses(const size_t cScores, double* const aScores) {
    } while(pScoresEnd != pScore);
 }

-static ErrorEbm PurifyNormalizedMulticlass(
-      const size_t cScores,
+static ErrorEbm PurifyNormalizedMulticlass(const size_t cScores,
       const size_t cTensorBins,
       const size_t cSurfaceBins,
-      RandomDeterministic * const pRng,
+      RandomDeterministic* const pRng,
       size_t* const aRandomize,
       const size_t* const aDimensionLengths,
       const double* const aWeights,
@@ -1845,17 +1842,17 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION Purify(double tolerance,
    //   [-DBL_MAX, -DBL_MAX, DBL_MAX] which purifies to [2/3 * -DBL_MAX, 2/3 * -DBL_MAX, 4/3 * DBL_MAX]
    // - On the first purification, we cannot overflow the intercept or purification surface cell since we're taking
    //   a weighted average the average cannot be larger than any value
-   // - On subsequent purification steps, we can overflow the intercept or purification surface cell, however I 
+   // - On subsequent purification steps, we can overflow the intercept or purification surface cell, however I
    //   think if the algorithm was allowed to converge and there were no floating point noise issues the
    //   purified cell and/or intercept would not overflow to an infinity
    // - we can prevent the impurities and/or intercept from overflowing by limiting the amount of purification in
-   //   the step to a value that does not overflow and I think the algorithm is still guaranteed to make forward 
+   //   the step to a value that does not overflow and I think the algorithm is still guaranteed to make forward
    //   progress
    // - eventually though, even the impurities can get infinities since we later purified the impurities until we reach
    //   the intercept, so the only guarantee we could get in theory was that we don't overflow the intercept
    // - But even for the intercept, since there is an existing intercept we can't guarantee that the purified intercept
    //   will not overflow to an infinity, so there can be no guarantees in the EBM as a whole
-   // 
+   //
    // We do take the following precautions:
    // - when we move impurity from the original tensor to the impurity tensor, we limit the purification at that step
    //   to a number that will not overflow the impurity cell
diff --git a/shared/libebm/TensorTotalsSum.hpp b/shared/libebm/TensorTotalsSum.hpp
index 2e4ade29e..e69b692ef 100644
--- a/shared/libebm/TensorTotalsSum.hpp
+++ b/shared/libebm/TensorTotalsSum.hpp
@@ -256,12 +256,8 @@ INLINE_ALWAYS static void TensorTotalsSumMulti(const size_t cRuntimeScores,
    UNUSED(aDebugCopyBins);
 #ifdef CHECK_TENSORS
    if(nullptr != aDebugCopyBins) {
-      TensorTotalsCompareDebug(cScores,
-            cDimensions,
-            aDimensions,
-            aDebugCopyBins->Downgrade(),
-            *binOut.Downgrade(),
-            aGradientPairsOut);
+      TensorTotalsCompareDebug(
+            cScores, cDimensions, aDimensions, aDebugCopyBins->Downgrade(), *binOut.Downgrade(), aGradientPairsOut);
    }
 #endif // CHECK_TENSORS
 #endif // NDEBUG
diff --git a/shared/libebm/interpretable_numerics.cpp b/shared/libebm/interpretable_numerics.cpp
index 0570d8db1..947a0ad5e 100644
--- a/shared/libebm/interpretable_numerics.cpp
+++ b/shared/libebm/interpretable_numerics.cpp
@@ -1269,7 +1269,7 @@ static double Stddev(const size_t cSamples,
       // there should be some factor that gives us a non-overflowing stddev
       EBM_ASSERT(std::numeric_limits::min() <= factor);

-   skip:;
+skip:;

    cNaN = 0;
    cInf = 0;
diff --git a/shared/libebm/random.cpp b/shared/libebm/random.cpp
index fc58deee4..4f33b28e2 100644
--- a/shared/libebm/random.cpp
+++ b/shared/libebm/random.cpp
@@ -185,10 +185,7 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION Shuffle(void* rng, IntEbm count, In
          LOG_0(Trace_Error, "ERROR Shuffle count < IntEbm { 0 }");
          return Error_IllegalParamVal;
       } else {
-         LOG_COUNTED_0(&g_cLogExitShuffle,
-               Trace_Info,
-               Trace_Verbose,
-               "Shuffle zero items requested");
+         LOG_COUNTED_0(&g_cLogExitShuffle, Trace_Info, Trace_Verbose, "Shuffle zero items requested");
          return Error_None;
       }
    }
diff --git a/shared/libebm/tests/PurifyTest.cpp b/shared/libebm/tests/PurifyTest.cpp
index 2c273ec20..d04dfaa22 100644
--- a/shared/libebm/tests/PurifyTest.cpp
+++ b/shared/libebm/tests/PurifyTest.cpp
@@ -312,8 +312,8 @@ TEST_CASE("Purify simple 3x4, infinite weights") {
         &residualIntercept);
   CHECK(Error_None == error);
   CHECK_APPROX(residualIntercept, 5.8);
-  //const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
-  //CHECK(-0.001 < impurity && impurity < 0.001);
+  // const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity && impurity < 0.001);
 }

 TEST_CASE("Purify simple 3x4, infinite weights, overflow") {
@@ -335,8 +335,8 @@ TEST_CASE("Purify simple 3x4, infinite weights, overflow") {
         impurities,
         &residualIntercept);
   CHECK(Error_None == error);
-  //const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
-  //CHECK(-0.001 < impurity && impurity < 0.001);
+  // const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity && impurity < 0.001);
 }

 TEST_CASE("Purify simple 3x4 with NaN") {
@@ -359,8 +359,8 @@ TEST_CASE("Purify simple 3x4 with NaN") {
         &residualIntercept);
   CHECK(Error_None == error);
   CHECK(0.0 != residualIntercept);
-  //const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
-  //CHECK(-0.001 < impurity && impurity < 0.001);
+  // const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity && impurity < 0.001);
 }

 TEST_CASE("Purify simple 3x4 with -inf") {
@@ -383,8 +383,8 @@ TEST_CASE("Purify simple 3x4 with -inf") {
         &residualIntercept);
   CHECK(Error_None == error);
   CHECK(0.0 != residualIntercept);
-  //const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
-  //CHECK(-0.001 < impurity && impurity < 0.001);
+  // const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity && impurity < 0.001);
 }

 TEST_CASE("Purify simple 3x4 with overflow") {
@@ -407,8 +407,8 @@ TEST_CASE("Purify simple 3x4 with overflow") {
         &residualIntercept);
   CHECK(Error_None == error);
   CHECK(0.0 != residualIntercept);
-  //const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
-  //CHECK(-0.001 < impurity && impurity < 0.001);
+  // const double impurity = MeasureImpurity(1, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity && impurity < 0.001);
 }

 TEST_CASE("Purify simple 3x3x3") {
@@ -581,7 +581,6 @@ TEST_CASE("Purify simple 3x4x5") {
   CHECK(-1e-20 < impurity && impurity < 1e-20);
 }

-
 TEST_CASE("Purify simple multiclass 3x4") {
   constexpr IntEbm cClasses = 2;
   const IntEbm dimensionLengths[]{3, 4};
@@ -630,12 +629,12 @@ TEST_CASE("Purify simple multiclass NaN") {
         impurities,
         &residualIntercept[0]);
   CHECK(Error_None == error);
-  //CHECK(0.0 != residualIntercept[0]);
-  //CHECK(0.0 != residualIntercept[1]);
-  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
-  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
+  // CHECK(0.0 != residualIntercept[0]);
+  // CHECK(0.0 != residualIntercept[1]);
+  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
+  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
 }

 TEST_CASE("Purify simple multiclass +inf") {
@@ -658,12 +657,12 @@ TEST_CASE("Purify simple multiclass +inf") {
         impurities,
         &residualIntercept[0]);
   CHECK(Error_None == error);
-  //CHECK(0.0 != residualIntercept[0]);
-  //CHECK(0.0 != residualIntercept[1]);
-  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
-  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
+  // CHECK(0.0 != residualIntercept[0]);
+  // CHECK(0.0 != residualIntercept[1]);
+  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
+  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
 }

 TEST_CASE("Purify simple multiclass -inf") {
@@ -686,15 +685,14 @@ TEST_CASE("Purify simple multiclass -inf") {
         impurities,
         &residualIntercept[0]);
   CHECK(Error_None == error);
-  //CHECK(0.0 != residualIntercept[0]);
-  //CHECK(0.0 != residualIntercept[1]);
-  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
-  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
+  // CHECK(0.0 != residualIntercept[0]);
+  // CHECK(0.0 != residualIntercept[1]);
+  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
+  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
 }

-
 TEST_CASE("Purify simple multiclass overflow to -inf") {
   constexpr IntEbm cClasses = 3;
   const IntEbm dimensionLengths[]{2, 2};
@@ -715,12 +713,12 @@ TEST_CASE("Purify simple multiclass overflow to -inf") {
         impurities,
         &residualIntercept[0]);
   CHECK(Error_None == error);
-  //CHECK(0.0 != residualIntercept[0]);
-  //CHECK(0.0 != residualIntercept[1]);
-  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
-  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
-  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
+  // CHECK(0.0 != residualIntercept[0]);
+  // CHECK(0.0 != residualIntercept[1]);
+  // const double impurity0 = MeasureImpurity(cClasses, 0, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity0 && impurity0 < 0.001);
+  // const double impurity1 = MeasureImpurity(cClasses, 1, cDimensions, dimensionLengths, weights, scores);
+  // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
 }

 TEST_CASE("Purify simple multiclass overflow to +inf") {
@@ -780,7 +778,6 @@ TEST_CASE("Purify simple multiclass +inf and NaN, overflow-inf,-overflow+inf, in
   // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
 }

-
 TEST_CASE("Purify simple multiclass overflow the shifts") {
   constexpr IntEbm cClasses = 3;
   const IntEbm dimensionLengths[]{2, 2};
@@ -820,11 +817,6 @@ TEST_CASE("Purify simple multiclass overflow the shifts") {
   // CHECK(-0.001 < impurity1 && impurity1 < 0.001);
 }

-
-
-
-
-
 TEST_CASE("Purify simple multiclass 3x4, normalize classes") {
   constexpr IntEbm cClasses = 2;
   const IntEbm dimensionLengths[]{3, 4};
diff --git a/shared/libebm/tests/SuggestGraphBoundsTest.cpp b/shared/libebm/tests/SuggestGraphBoundsTest.cpp
index 28ae339f4..86be001c3 100644
--- a/shared/libebm/tests/SuggestGraphBoundsTest.cpp
+++ b/shared/libebm/tests/SuggestGraphBoundsTest.cpp
@@ -423,10 +423,6 @@ TEST_CASE("SafeMean, no overflow negative") {
   CHECK(std::isfinite(mean));
 }

-
-
-
-
 TEST_CASE("SafeMean, 4 values, weights of 1.0") {
   double vals[]{1.0, 2.5, 10, 100};
   double weights[]{1.0, 1.0, 1.0, 1.0};
@@ -537,11 +533,6 @@ TEST_CASE("SafeMean, 4 values, weights of 0.0") {
   CHECK(mean == 5.5);
 }

-
-
-
-
-
 TEST_CASE("SafeStandardDeviation, 4 values") {
   double vals[]{1.0, 2.5, 10, 100};
   const size_t cVals = sizeof(vals) / sizeof(vals[0]);
@@ -873,10 +864,10 @@ TEST_CASE("SafeStandardDeviation, 4 values, zero weights") {
   CHECK_APPROX(stddev, 41.493034053922834);
 }

-//# this function calculates the weighted standard deviation
-//def _weighted_std(a, axis, weights):
-//    if weights is None:
-//        return np.std(a, axis=axis)
-//    average = np.average(a, axis, weights)
-//    variance = np.average((a - average) ** 2, axis, weights)
-//    return np.sqrt(variance)
+// # this function calculates the weighted standard deviation
+// def _weighted_std(a, axis, weights):
+//     if weights is None:
+//         return np.std(a, axis=axis)
+//     average = np.average(a, axis, weights)
+//     variance = np.average((a - average) ** 2, axis, weights)
+//     return np.sqrt(variance)
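
The early_stopping_tolerance docstring reformatted at the top of this diff argues that a zero or slightly negative tolerance lets each bagged member of the ensemble overfit a little, and that averaging the bag then recovers most of the added variance. A minimal usage sketch of that idea, assuming interpret and scikit-learn are installed; the breast-cancer dataset and the -1e-4 value are illustrative choices, only the parameter name and its 1e-4 default come from the docstring:

```python
# Illustrative sketch of the early_stopping_tolerance discussion in the docstring above.
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

# A zero or slightly negative tolerance lets each bagged model boost a bit past its own
# validation optimum (lower bias, higher variance); bagging then averages out the variance.
ebm = ExplainableBoostingClassifier(early_stopping_tolerance=-1e-4)
ebm.fit(X, y)
```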
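The commented-out Python reference restored at the end of SuggestGraphBoundsTest.cpp describes the weighted standard deviation that the SafeStandardDeviation tests exercise. A standalone, runnable version of that same reference computation, assuming NumPy; the example values reuse the vals array from the tests, and no expected output is asserted:

```python
import numpy as np

def weighted_std(a, axis, weights):
    # Weighted standard deviation, following the commented reference in the test file.
    if weights is None:
        return np.std(a, axis=axis)
    average = np.average(a, axis, weights)
    variance = np.average((a - average) ** 2, axis, weights)
    return np.sqrt(variance)

# Example with the vals array used by the SafeStandardDeviation tests and unit weights.
vals = np.array([1.0, 2.5, 10.0, 100.0])
print(weighted_std(vals, axis=0, weights=np.ones_like(vals)))
```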