Merge pull request #369 from Dekken/binacox_squash

Add features related to the binacox model
X-DataInitiative · Jun 4, 2019 · ea27359 · ea27359
2 parents eec1ac5 + 3ed7ba1
commit ea27359
Show file tree

Hide file tree

Showing 6 changed files with 419 additions and 29 deletions.
diff --git a/tick/preprocessing/features_binarizer.py b/tick/preprocessing/features_binarizer.py
@@ -29,6 +29,7 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin):
     method : "quantile", "linspace" or "given", default="quantile"
         * If ``"quantile"`` quantile-based cuts are used.
         * If ``"linspace"`` linearly spaced cuts are used.
+        * If ``"given"`` bins_boundaries needs to be provided.
 
     detect_column_type : "auto" or "column_names", default="auto"
         * If ``"auto"`` feature type detection done automatically.
@@ -40,6 +41,9 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin):
         If `True`, first column of each binarized continuous feature block is
         removed.
 
+    bins_boundaries : `dict`, default=`None`
+        Bins boundaries keyed by feature name; required if method is "given".
+
     Attributes
     ----------
     one_hot_encoder : `OneHotEncoder`
@@ -119,21 +123,37 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin):
     }
 
     def __init__(self, method="quantile", n_cuts=10, detect_column_type="auto",
-                 remove_first=False):
+                 remove_first=False, bins_boundaries=None):
         Base.__init__(self)
 
         self.method = method
         self.n_cuts = n_cuts
         self.detect_column_type = detect_column_type
         self.remove_first = remove_first
+        self.bins_boundaries = bins_boundaries
         self.reset()
 
     def reset(self):
         self._set("one_hot_encoder", OneHotEncoder(sparse=True))
-        self._set("bins_boundaries", {})
         self._set("mapper", {})
         self._set("feature_type", {})
         self._set("_fitted", False)
+        if self.method != "given":
+            self._set("bins_boundaries", {})
+
+    @property
+    def boundaries(self):
+        """Get bins boundaries for all features.
+
+        Returns
+        -------
+        output : `dict`
+            The bins boundaries for each feature.
+        """
+        if not self._fitted:
+            raise ValueError("cannot get bins_boundaries if object has not "
+                             "been fitted")
+        return self.bins_boundaries
 
     @property
     def blocks_start(self):
@@ -440,13 +460,20 @@ def _get_boundaries(self, feature_name, feature, fit=False):
             the actual number of distinct boundaries for this feature.
         """
         if fit:
-            boundaries = FeaturesBinarizer._detect_boundaries(
-                feature, self.n_cuts, self.method)
-            self.bins_boundaries[feature_name] = boundaries
-
+            if self.method == 'given':
+                if self.bins_boundaries is None:
+                    raise ValueError("bins_boundaries required when `method` "
+                                     "equals 'given'")
+
+                if not isinstance(self.bins_boundaries[feature_name], np.ndarray):
+                    raise ValueError("feature %s not found in bins_boundaries" % feature_name)
+                boundaries = self.bins_boundaries[feature_name]
+            else:
+                boundaries = FeaturesBinarizer._detect_boundaries(
+                    feature, self.n_cuts, self.method)
+                self.bins_boundaries[feature_name] = boundaries
         elif self._fitted:
             boundaries = self.bins_boundaries[feature_name]
-
         else:
             raise ValueError("cannot call method with fit=True as object has "
                              "not been fit")
@@ -518,7 +545,7 @@ def _assign_interval(self, feature_name, feature, fit=False):
             if feature.dtype != float:
                 feature = feature.astype(float)
 
-            # Compute bins boundaries for the feature
+            # Get bins boundaries for the feature
             boundaries = self._get_boundaries(feature_name, feature, fit)
 
             # Discretize feature

diff --git a/tick/simulation/__init__.py b/tick/simulation/__init__.py
@@ -8,4 +8,4 @@
 __all__ = [
     "features_normal_cov_uniform", "features_normal_cov_toeplitz",
     "weights_sparse_exp", "weights_sparse_gauss"
-]
+]
diff --git a/tick/survival/__init__.py b/tick/survival/__init__.py
@@ -9,7 +9,7 @@
 from .model_coxreg_partial_lik import ModelCoxRegPartialLik
 from .model_sccs import ModelSCCS
 
-from .simu_coxreg import SimuCoxReg
+from .simu_coxreg import SimuCoxReg, SimuCoxRegWithCutPoints
 from .simu_sccs import SimuSCCS
 from .convolutional_sccs import ConvSCCS
 

diff --git a/tick/survival/cox_regression.py b/tick/survival/cox_regression.py
@@ -109,7 +109,7 @@ def _construct_model_obj(self):
 
     def _all_safe(self, features: np.ndarray, times: np.array,
                   censoring: np.array):
-        if not np.array_equal(np.unique(censoring), [0, 1]):
+        if not set(np.unique(censoring)).issubset({0, 1}):
             raise ValueError('``censoring`` must only have values in {0, 1}')
         # All times must be positive
         if not np.all(times >= 0):