diff --git a/tick/preprocessing/features_binarizer.py b/tick/preprocessing/features_binarizer.py index 7decf67f5..e86d4a895 100644 --- a/tick/preprocessing/features_binarizer.py +++ b/tick/preprocessing/features_binarizer.py @@ -29,6 +29,7 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin): method : "quantile" or "linspace", default="quantile" * If ``"quantile"`` quantile-based cuts are used. * If ``"linspace"`` linearly spaced cuts are used. + * If ``"given"`` bins_boundaries needs to be provided. detect_column_type : "auto" or "column_names", default="auto" * If ``"auto"`` feature type detection done automatically. @@ -40,6 +41,9 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin): If `True`, first column of each binarized continuous feature block is removed. + bins_boundaries : `list`, default="none" + Bins boundaries for continuous features. + Attributes ---------- one_hot_encoder : `OneHotEncoder` @@ -119,21 +123,37 @@ class FeaturesBinarizer(Base, BaseEstimator, TransformerMixin): } def __init__(self, method="quantile", n_cuts=10, detect_column_type="auto", - remove_first=False): + remove_first=False, bins_boundaries=None): Base.__init__(self) self.method = method self.n_cuts = n_cuts self.detect_column_type = detect_column_type self.remove_first = remove_first + self.bins_boundaries = bins_boundaries self.reset() def reset(self): self._set("one_hot_encoder", OneHotEncoder(sparse=True)) - self._set("bins_boundaries", {}) self._set("mapper", {}) self._set("feature_type", {}) self._set("_fitted", False) + if self.method != "given": + self._set("bins_boundaries", {}) + + @property + def boundaries(self): + """Get bins boundaries for all features. + + Returns + ------- + output : `dict` + The bins boundaries for each feature. + """ + if not self._fitted: + raise ValueError("cannot get bins_boundaries if object has not " + "been fitted") + return self.bins_boundaries @property def blocks_start(self): @@ -440,13 +460,20 @@ def _get_boundaries(self, feature_name, feature, fit=False): the actual number of distinct boundaries for this feature. """ if fit: - boundaries = FeaturesBinarizer._detect_boundaries( - feature, self.n_cuts, self.method) - self.bins_boundaries[feature_name] = boundaries - + if self.method == 'given': + if self.bins_boundaries is None: + raise ValueError("bins_boundaries required when `method` " + "equals 'given'") + + if not isinstance(self.bins_boundaries[feature_name], np.ndarray): + raise ValueError("feature %s not found in bins_boundaries" % feature_name) + boundaries = self.bins_boundaries[feature_name] + else: + boundaries = FeaturesBinarizer._detect_boundaries( + feature, self.n_cuts, self.method) + self.bins_boundaries[feature_name] = boundaries elif self._fitted: boundaries = self.bins_boundaries[feature_name] - else: raise ValueError("cannot call method with fit=True as object has " "not been fit") @@ -518,7 +545,7 @@ def _assign_interval(self, feature_name, feature, fit=False): if feature.dtype != float: feature = feature.astype(float) - # Compute bins boundaries for the feature + # Get bins boundaries for the feature boundaries = self._get_boundaries(feature_name, feature, fit) # Discretize feature diff --git a/tick/simulation/__init__.py b/tick/simulation/__init__.py index 658479553..234c14851 100644 --- a/tick/simulation/__init__.py +++ b/tick/simulation/__init__.py @@ -8,4 +8,4 @@ __all__ = [ "features_normal_cov_uniform", "features_normal_cov_toeplitz", "weights_sparse_exp", "weights_sparse_gauss" -] +] \ No newline at end of file diff --git a/tick/survival/__init__.py b/tick/survival/__init__.py index 9ff7bf39e..8989bb04d 100644 --- a/tick/survival/__init__.py +++ b/tick/survival/__init__.py @@ -9,7 +9,7 @@ from .model_coxreg_partial_lik import ModelCoxRegPartialLik from .model_sccs import ModelSCCS -from .simu_coxreg import SimuCoxReg +from .simu_coxreg import SimuCoxReg, SimuCoxRegWithCutPoints from .simu_sccs import SimuSCCS from .convolutional_sccs import ConvSCCS diff --git a/tick/survival/cox_regression.py b/tick/survival/cox_regression.py index e47354276..97c1856f3 100644 --- a/tick/survival/cox_regression.py +++ b/tick/survival/cox_regression.py @@ -109,7 +109,7 @@ def _construct_model_obj(self): def _all_safe(self, features: np.ndarray, times: np.array, censoring: np.array): - if not np.array_equal(np.unique(censoring), [0, 1]): + if not set(np.unique(censoring)).issubset({0, 1}): raise ValueError('``censoring`` must only have values in {0, 1}') # All times must be positive if not np.all(times >= 0): diff --git a/tick/survival/simu_coxreg.py b/tick/survival/simu_coxreg.py index aff3f1d37..5859867bf 100644 --- a/tick/survival/simu_coxreg.py +++ b/tick/survival/simu_coxreg.py @@ -2,6 +2,8 @@ import numpy as np from tick.base.simulation import SimuWithFeatures +from tick.preprocessing.features_binarizer import FeaturesBinarizer + # TODO: something better to tune the censoring level than this censoring factor @@ -31,7 +33,7 @@ class SimuCoxReg(SimuWithFeatures): Shape parameter to use in the distribution of times censoring_factor : `float`, default=2.0 - Level of censoring. Increasing censoring_factor lead + Level of censoring. Increasing censoring_factor leads to less censored times and conversely. features_type : `str`, default="cov_toeplitz" @@ -108,24 +110,30 @@ class SimuCoxReg(SimuWithFeatures): }, "_times_distribution": { "writable": False + }, + "_scale": { + "writable": False + }, + "_shape": { + "writable": False } } - def __init__(self, coeffs: np.ndarray, features: np.ndarray = None, - n_samples: int = 200, times_distribution: str = "weibull", + def __init__(self, coeffs: np.ndarray, + features: np.ndarray = None, n_samples: int = 200, + times_distribution: str = "weibull", shape: float = 1., scale: float = 1., censoring_factor: float = 2., - features_type: str = "cov_toeplitz", cov_corr: float = 0.5, - features_scaling: str = "none", seed: int = None, - verbose: bool = True, dtype="float64"): + features_type: str = "cov_toeplitz", + cov_corr: float = 0.5, features_scaling: str = "none", + seed: int = None, verbose: bool = True, dtype="float64"): n_features = coeffs.shape[0] # intercept=None in this model - SimuWithFeatures.__init__(self, None, features, n_samples, n_features, - features_type, cov_corr, features_scaling, - seed, verbose, dtype=dtype) + SimuWithFeatures.__init__(self, None, features, n_samples, + n_features, features_type, cov_corr, + features_scaling, seed, verbose, dtype=dtype) self.coeffs = coeffs - self.times_distribution = times_distribution self.shape = shape self.scale = scale self.censoring_factor = censoring_factor @@ -134,8 +142,6 @@ def __init__(self, coeffs: np.ndarray, features: np.ndarray = None, self.times = None self.censoring = None - # TODO: properties for times_dist, shape, scale, censoring factor - def simulate(self): """Launch simulation of the data @@ -166,6 +172,26 @@ def times_distribution(self, val): "understood, try using 'weibull' instead") self._set("_times_distribution", val) + @property + def shape(self): + return self._shape + + @shape.setter + def shape(self, val): + if val <= 0: + raise ValueError("``shape`` must be strictly positive") + self._set("_shape", val) + + @property + def scale(self): + return self._scale + + @scale.setter + def scale(self, val): + if val <= 0: + raise ValueError("``scale`` must be strictly positive") + self._set("_scale", val) + def _simulate(self): # The features matrix already exists, and is created by the # super class @@ -202,3 +228,284 @@ def _as_dict(self): dd.pop("times", None) dd.pop("censoring", None) return dd + + +class SimuCoxRegWithCutPoints(SimuWithFeatures): + """Simulation of a Cox regression for proportional hazards with cut-points + effects in the features + + Parameters + ---------- + features : `numpy.ndarray`, shape=(n_samples, n_features), default=`None` + The features matrix to use. If None, it is simulated + + n_samples : `int`, default=200 + Number of samples + + n_features : `int`, default=5 + Number of features + + times_distribution : `str`, default="weibull" + The distrubution of times. Only ``"weibull"`` + is implemented for now + + scale : `float`, default=1.0 + Scaling parameter to use in the distribution of times + + shape : `float`, default=1.0 + Shape parameter to use in the distribution of times + + censoring_factor : `float`, default=2.0 + Level of censoring. Increasing censoring_factor leads + to less censored times and conversely. + + features_type : `str`, default="cov_toeplitz" + The type of features matrix to simulate + + * If ``"cov_toeplitz"`` : a Gaussian distribution with + Toeplitz correlation matrix + + * If ``"cov_uniform"`` : a Gaussian distribution with + correlation matrix given by O.5 * (U + U.T), where U is + uniform on [0, 1] and diagonal filled with ones. + + cov_corr : `float`, default=0.5 + Correlation to use in the Toeplitz correlation matrix + + features_scaling : `str`, default="none" + The way the features matrix is scaled after simulation + + * If ``"standard"`` : the columns are centered and + normalized + + * If ``"min-max"`` : remove the minimum and divide by + max-min + + * If ``"norm"`` : the columns are normalized but not centered + + * If ``"none"`` : nothing is done to the features + + seed : `int`, default=None + The seed of the random number generator. If `None` it is not + seeded + + verbose : `bool`, default=True + If True, print things + + n_cut_points : `int`, default="none" + Number of cut-points generated per feature. If `None` it is sampled from + a geometric distribution of parameter n_cut_points_factor. + + n_cut_points_factor : `float`, default=0.7 + Parameter of the geometric distribution used to generate the number of + cut-points when n_cut_points is `None`. Increasing n_cut_points_factor + leads to less cut-points per feature on average. + + sparsity : `float`, default=0 + Percentage of block sparsity induced in the coefficient vector. Must be + in [0, 1]. + + Attributes + ---------- + features : `numpy.ndarray`, shape=(n_samples, n_features) + The simulated (or given) features matrix + + times : `numpy.ndarray`, shape=(n_samples,) + Simulated times + + censoring : `numpy.ndarray`, shape=(n_samples,) + Simulated censoring indicator, where ``censoring[i] == 1`` + indicates that the time of the i-th individual is a failure + time, and where ``censoring[i] == 0`` means that the time of + the i-th individual is a censoring time + + Notes + ----- + There is no intercept in this model + """ + + _attrinfos = { + "times": { + "writable": False + }, + "censoring": { + "writable": False + }, + "_times_distribution": { + "writable": False + }, + "_scale": { + "writable": False + }, + "_shape": { + "writable": False + }, + "_sparsity": { + "writable": False + } + } + + def __init__(self, features: np.ndarray = None, n_samples: int = 200, + n_features: int = 5, n_cut_points: int = None, + n_cut_points_factor: float = .7, + times_distribution: str = "weibull", + shape: float = 1., scale: float = 1., + censoring_factor: float = 2., + features_type: str = "cov_toeplitz", + cov_corr: float = 0.5, features_scaling: str = "none", + seed: int = None, verbose: bool = True, sparsity=0): + + # intercept=None in this model + SimuWithFeatures.__init__(self, None, features, n_samples, + n_features, features_type, cov_corr, + features_scaling, seed, verbose) + + self.shape = shape + self.scale = scale + self.censoring_factor = censoring_factor + self.times_distribution = times_distribution + self.n_cut_points = n_cut_points + self.n_cut_points_factor = n_cut_points_factor + self.sparsity = sparsity + self.features = None + self.times = None + self.censoring = None + + def simulate(self): + """Launch simulation of the data + + Returns + ------- + features : `numpy.ndarray`, shape=(n_samples, n_features) + The simulated (or given) features matrix + + times : `numpy.ndarray`, shape=(n_samples,) + Simulated times + + censoring : `numpy.ndarray`, shape=(n_samples,) + Simulated censoring indicator, where ``censoring[i] == 1`` + indicates that the time of the i-th individual is a failure + time, and where ``censoring[i] == 0`` means that the time of + the i-th individual is a censoring time + """ + return SimuWithFeatures.simulate(self) + + @property + def times_distribution(self): + return self._times_distribution + + @times_distribution.setter + def times_distribution(self, val): + if val != "weibull": + raise ValueError("``times_distribution`` was not " + "understood, try using 'weibull' instead") + self._set("_times_distribution", val) + + @property + def shape(self): + return self._shape + + @shape.setter + def shape(self, val): + if val <= 0: + raise ValueError("``shape`` must be strictly positive") + self._set("_shape", val) + + @property + def scale(self): + return self._scale + + @scale.setter + def scale(self, val): + if val <= 0: + raise ValueError("``scale`` must be strictly positive") + self._set("_scale", val) + + @property + def sparsity(self): + return self._sparsity + + @sparsity.setter + def sparsity(self, val): + if not 0 <= val <= 1: + raise ValueError("``sparsity`` must be in (0, 1)") + self._set("_sparsity", val) + + def _simulate(self): + # The features matrix already exists, and is created by the + # super class + features = self.features + n_samples, n_features = features.shape + # Simulation of cut-points + n_cut_points = self.n_cut_points + n_cut_points_factor = self.n_cut_points_factor + sparsity = self.sparsity + s = round(n_features * sparsity) + # sparsity index set + S = np.random.choice(n_features, s, replace=False) + + if n_cut_points is None: + n_cut_points = np.random.geometric(n_cut_points_factor, n_features) + else: + n_cut_points = np.repeat(n_cut_points, n_features) + + cut_points = {} + coeffs_binarized = np.array([]) + for j in range(n_features): + feature_j = features[:, j] + quantile_cuts = np.linspace(10, 90, 10) + candidates = np.percentile(feature_j, quantile_cuts, + interpolation="nearest") + cut_points_j = np.random.choice(candidates, n_cut_points[j], + replace=False) + cut_points_j = np.sort(cut_points_j) + cut_points_j = np.insert(cut_points_j, 0, -np.inf) + cut_points_j = np.append(cut_points_j, np.inf) + cut_points[str(j)] = cut_points_j + # generate beta star + if j in S: + coeffs_block = np.zeros(n_cut_points[j] + 1) + else: + coeffs_block = np.random.normal(1, .5, n_cut_points[j] + 1) + # make sure 2 consecutive coeffs are different enough + coeffs_block = np.abs(coeffs_block) + coeffs_block[::2] *= -1 + # sum-to-zero constraint in each block + coeffs_block = coeffs_block - coeffs_block.mean() + coeffs_binarized = np.append(coeffs_binarized, coeffs_block) + + binarizer = FeaturesBinarizer(method='given', + bins_boundaries=cut_points) + binarized_features = binarizer.fit_transform(features) + + u = binarized_features.dot(coeffs_binarized) + # Simulation of true times + E = np.random.exponential(scale=1., size=n_samples) + E *= np.exp(-u) + scale = self.scale + shape = self.shape + if self.times_distribution == "weibull": + T = 1. / scale * E ** (1. / shape) + else: + # There is not point in this test, but let's do it like that + # since we're likely to implement other distributions + T = 1. / scale * E ** (1. / shape) + + m = T.mean() + # Simulation of the censoring + c = self.censoring_factor + C = np.random.exponential(scale=c * m, size=n_samples) + # Observed time + self._set("times", np.minimum(T, C).astype(self.dtype)) + # Censoring indicator: 1 if it is a time of failure, 0 if censoring. + censoring = (T <= C).astype(np.ushort) + self._set("censoring", censoring) + return self.features, self.times, self.censoring, cut_points, \ + coeffs_binarized, S + + def _as_dict(self): + dd = SimuWithFeatures._as_dict(self) + dd.pop("features", None) + dd.pop("times", None) + dd.pop("censoring", None) + return dd diff --git a/tick/survival/tests/simu_coxreg_test.py b/tick/survival/tests/simu_coxreg_test.py index df27358ee..973f3e452 100644 --- a/tick/survival/tests/simu_coxreg_test.py +++ b/tick/survival/tests/simu_coxreg_test.py @@ -3,7 +3,7 @@ import unittest import numpy as np -from tick.survival import SimuCoxReg +from tick.survival import SimuCoxReg, SimuCoxRegWithCutPoints class Test(unittest.TestCase): @@ -34,17 +34,73 @@ def test_SimuCoxReg(self): 1.23227551, 0.50697013, 1.9409132 ], [1.8891494, 1.49834791, 2.41445794], [0.19431319, 0.80245126, 1.02577552], [ - -1.61687582, -1.08411865, -0.83438387 - ], [2.30419894, -0.68987056, - -0.39750262], [-0.28826405, -1.23635074, -0.76124386], [ - -1.32869473, -1.8752391, -0.182537 - ], [0.79464218, 0.65055633, 1.57572506], + -1.61687582, -1.08411865, -0.83438387 + ], [2.30419894, -0.68987056, + -0.39750262], + [-0.28826405, -1.23635074, -0.76124386], [ + -1.32869473, -1.8752391, -0.182537 + ], [0.79464218, 0.65055633, 1.57572506], [0.71524202, 1.66759831, 0.88679047]]) np.testing.assert_almost_equal(features, features_) np.testing.assert_almost_equal(times, times_) np.testing.assert_almost_equal(censoring, censoring_) + def test_SimuCoxRegWithCutPoints(self): + """...Test simulation of a Cox Regression with cut-points + """ + # Simulate a Cox model with cut-points with specific seed + n_samples = 10 + n_features = 3 + n_cut_points = 2 + cov_corr = .5 + sparsity = .2 + + seed = 123 + simu = SimuCoxRegWithCutPoints(n_samples=n_samples, + n_features=n_features, + seed=seed, verbose=False, + n_cut_points=n_cut_points, + shape=2, scale=.1, cov_corr=cov_corr, + sparsity=sparsity) + features_, times_, censoring_, cut_points_, coeffs_binarized_, S_ = simu.simulate() + + times = np.array([6.12215425, 6.74403919, 5.2148425, 5.42903238, + 2.42953933, 9.50705158, 18.49545933, 19.7929349, + 0.39267278, 1.24799812]) + + censoring = np.array([1, 0, 0, 1, 0, 1, 1, 1, 0, 1], dtype=np.ushort) + + features = np.array([[1.4912667, 0.80881799, 0.26977298], + [1.23227551, 0.50697013, 1.9409132], + [1.8891494, 1.49834791, 2.41445794], + [0.19431319, 0.80245126, 1.02577552], + [-1.61687582, -1.08411865, -0.83438387], + [2.30419894, -0.68987056, -0.39750262], + [-0.28826405, -1.23635074, -0.76124386], + [-1.32869473, -1.8752391, -0.182537], + [0.79464218, 0.65055633, 1.57572506], + [0.71524202, 1.66759831, 0.88679047]]) + + cut_points = {'0': np.array([-np.inf, -0.28826405, 0.79464218, np.inf]), + '1': np.array([-np.inf, -1.23635074, 0.50697013, np.inf]), + '2': np.array([-np.inf, -0.182537, 0.88679047, np.inf])} + + coeffs_binarized = np.array([-1.26789642, 1.31105319, -0.04315676, 0., + 0., 0., 0.01839684, 0.4075832, + -0.42598004]) + + S = np.array([1]) + + np.testing.assert_almost_equal(features, features_) + np.testing.assert_almost_equal(times, times_) + np.testing.assert_almost_equal(censoring, censoring_) + np.testing.assert_almost_equal(cut_points_['0'], cut_points['0']) + np.testing.assert_almost_equal(cut_points_['1'], cut_points['1']) + np.testing.assert_almost_equal(cut_points_['2'], cut_points['2']) + np.testing.assert_almost_equal(coeffs_binarized, coeffs_binarized_) + np.testing.assert_almost_equal(S, S_) + if __name__ == '__main__': unittest.main()