Skip to content

Commit

Permalink
Merge pull request #369 from Dekken/binacox_squash
Browse files Browse the repository at this point in the history
adding features relatively to the binacox model
  • Loading branch information
stephanegaiffas authored Jun 4, 2019
2 parents eec1ac5 + 3ed7ba1 commit ea27359
Show file tree
Hide file tree
Showing 6 changed files with 419 additions and 29 deletions.
43 changes: 35 additions & 8 deletions tick/preprocessing/features_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin):
method : "quantile", "linspace" or "given", default="quantile"
* If ``"quantile"`` quantile-based cuts are used.
* If ``"linspace"`` linearly spaced cuts are used.
* If ``"given"`` the cuts are read from ``bins_boundaries``, which must be provided.
detect_column_type : "auto" or "column_names", default="auto"
* If ``"auto"`` feature type detection done automatically.
Expand All @@ -40,6 +41,9 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin):
If `True`, first column of each binarized continuous feature block is
removed.
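bins_boundaries : `dict`, default=`None`
Bins boundaries for continuous features, mapping each feature name to a
`numpy.ndarray` of boundaries. Required when ``method`` is ``"given"``.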
Attributes
----------
one_hot_encoder : `OneHotEncoder`
Expand Down Expand Up @@ -119,21 +123,37 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin):
}

def __init__(self, method="quantile", n_cuts=10, detect_column_type="auto",
remove_first=False):
remove_first=False, bins_boundaries=None):
Base.__init__(self)

self.method = method
self.n_cuts = n_cuts
self.detect_column_type = detect_column_type
self.remove_first = remove_first
self.bins_boundaries = bins_boundaries
self.reset()

def reset(self):
self._set("one_hot_encoder", OneHotEncoder(sparse=True))
self._set("bins_boundaries", {})
self._set("mapper", {})
self._set("feature_type", {})
self._set("_fitted", False)
if self.method != "given":
self._set("bins_boundaries", {})

@property
def boundaries(self):
"""Get bins boundaries for all features.
Returns
-------
output : `dict`
The bins boundaries for each feature.
"""
if not self._fitted:
raise ValueError("cannot get bins_boundaries if object has not "
"been fitted")
return self.bins_boundaries

@property
def blocks_start(self):
Expand Down Expand Up @@ -440,13 +460,20 @@ def _get_boundaries(self, feature_name, feature, fit=False):
the actual number of distinct boundaries for this feature.
"""
if fit:
boundaries = FeaturesBinarizer._detect_boundaries(
feature, self.n_cuts, self.method)
self.bins_boundaries[feature_name] = boundaries

if self.method == 'given':
if self.bins_boundaries is None:
raise ValueError("bins_boundaries required when `method` "
"equals 'given'")

if not isinstance(self.bins_boundaries[feature_name], np.ndarray):
raise ValueError("feature %s not found in bins_boundaries" % feature_name)
boundaries = self.bins_boundaries[feature_name]
else:
boundaries = FeaturesBinarizer._detect_boundaries(
feature, self.n_cuts, self.method)
self.bins_boundaries[feature_name] = boundaries
elif self._fitted:
boundaries = self.bins_boundaries[feature_name]

else:
raise ValueError("cannot call method with fit=True as object has "
"not been fit")
Expand Down Expand Up @@ -518,7 +545,7 @@ def _assign_interval(self, feature_name, feature, fit=False):
if feature.dtype != float:
feature = feature.astype(float)

# Compute bins boundaries for the feature
# Get bins boundaries for the feature
boundaries = self._get_boundaries(feature_name, feature, fit)

# Discretize feature
Expand Down
2 changes: 1 addition & 1 deletion tick/simulation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
__all__ = [
"features_normal_cov_uniform", "features_normal_cov_toeplitz",
"weights_sparse_exp", "weights_sparse_gauss"
]
]
2 changes: 1 addition & 1 deletion tick/survival/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .model_coxreg_partial_lik import ModelCoxRegPartialLik
from .model_sccs import ModelSCCS

from .simu_coxreg import SimuCoxReg
from .simu_coxreg import SimuCoxReg, SimuCoxRegWithCutPoints
from .simu_sccs import SimuSCCS
from .convolutional_sccs import ConvSCCS

Expand Down
2 changes: 1 addition & 1 deletion tick/survival/cox_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def _construct_model_obj(self):

def _all_safe(self, features: np.ndarray, times: np.array,
censoring: np.array):
if not np.array_equal(np.unique(censoring), [0, 1]):
if not set(np.unique(censoring)).issubset({0, 1}):
raise ValueError('``censoring`` must only have values in {0, 1}')
# All times must be positive
if not np.all(times >= 0):
Expand Down
Loading

0 comments on commit ea27359

Please sign in to comment.