diff --git a/.travis.yml b/.travis.yml index 1881ce4..e60156f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ env: before_install: # Here we download miniconda and install the dependencies -- pip install black coverage future joblib numpy numpydoc pandas patsy progressbar2 pygam pytest python-dateutil python-utils pytz scikit-learn scipy six statsmodels xgboost +- pip install black coverage future joblib numpy numpydoc pandas patsy progressbar2 pygam pytest python-dateutil python-utils pytz scikit-learn scipy six statsmodels install: - python setup.py install diff --git a/causal_curve/tmle.py b/causal_curve/tmle.py index 1974446..b15b640 100644 --- a/causal_curve/tmle.py +++ b/causal_curve/tmle.py @@ -9,8 +9,8 @@ from pandas.api.types import is_float_dtype, is_numeric_dtype from scipy.interpolate import interp1d from scipy.stats import norm +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from statsmodels.genmod.generalized_linear_model import GLM -from xgboost import XGBClassifier, XGBRegressor from causal_curve.core import Core from causal_curve.utils import rand_seed_wrapper @@ -19,7 +19,7 @@ class TMLE(Core): """ Constructs a causal dose response curve through a series of TMLE comparisons across a grid - of the treatment values. XGBoost is used for prediction in Q model and G model. + of the treatment values. Gradient boosting is used for prediction in Q model and G model. Assumes continuous treatment and outcome variable. WARNING: @@ -47,18 +47,14 @@ class TMLE(Core): treatment values between the bin edges will be used to generate the CDRC. n_estimators: int, optional (default = 100) - Optional argument to set the number of learners to use when XGBoost + Optional argument to set the number of learners to use when sklearn creates TMLE's Q and G models. learning_rate: float, optional (default = 0.1) - Optional argument to set the XGBoost's learning rate for TMLE's Q and G models. 
+ Optional argument to set sklearn's learning rate for TMLE's Q and G models. max_depth: int, optional (default = 5) - Optional argument to set XGBoost's maximum depth when creating TMLE's Q and G models. - - gamma: float, optional (default = 1.0) - Optional argument to set XGBoost's gamma parameter (regularization) when - creating TMLE's Q and G models. + Optional argument to set sklearn's maximum depth when creating TMLE's Q and G models. random_seed: int, optional (default = None) Sets the random seed. @@ -115,7 +111,6 @@ def __init__( n_estimators=100, learning_rate=0.1, max_depth=5, - gamma=1.0, random_seed=None, verbose=False, ): @@ -124,7 +119,6 @@ self.n_estimators = n_estimators self.learning_rate = learning_rate self.max_depth = max_depth - self.gamma = gamma self.random_seed = random_seed self.verbose = verbose @@ -190,16 +184,6 @@ def _validate_init_params(self): if self.max_depth <= 0: raise TypeError("max_depth parameter must be greater than 0") - # Checks for gamma - if not isinstance(self.gamma, float): - raise TypeError( f"gamma parameter must be a float, " f"but found type {type(self.gamma)}" ) - - if self.gamma <= 0: - raise TypeError("gamma parameter must be greater than 0") - # Checks for random_seed if not isinstance(self.random_seed, (int, type(None))): raise TypeError( @@ -263,11 +247,10 @@ def _initial_bucket_mean_prediction(self): self.t_data < self.treatment_grid_bins[1] ] - init_model = XGBRegressor( + init_model = GradientBoostingRegressor( n_estimators=self.n_estimators, max_depth=self.max_depth, learning_rate=self.learning_rate, - gamma=self.gamma, random_state=self.random_seed, ).fit(X, y) @@ -495,11 +478,10 @@ def _q_model(self, temp_y, temp_x, temp_t): X = pd.concat([temp_t, temp_x], axis=1).to_numpy() y = temp_y.to_numpy() - Q_model = XGBRegressor( + Q_model = GradientBoostingRegressor( n_estimators=self.n_estimators, max_depth=self.max_depth, learning_rate=self.learning_rate, - gamma=self.gamma, 
random_state=self.random_seed, ).fit(X, y) @@ -525,11 +507,10 @@ def _g_model(self, temp_x, temp_t): X = temp_x.to_numpy() t = temp_t.to_numpy() - G_model = XGBClassifier( + G_model = GradientBoostingClassifier( n_estimators=self.n_estimators, max_depth=self.max_depth, learning_rate=self.learning_rate, - gamma=self.gamma, random_state=self.random_seed, ).fit(X, t) diff --git a/docs/TMLE_example.rst b/docs/TMLE_example.rst index d04d186..f870b90 100644 --- a/docs/TMLE_example.rst +++ b/docs/TMLE_example.rst @@ -13,7 +13,7 @@ tool to estimate the marginal causal curve of some continuous treatment on a con accounting for some mild confounding effects. Compared with the package's GPS method, this TMLE method is double robust against model -misspecification, incorporates more powerful machine learning techniques internally, +misspecification, incorporates more powerful machine learning techniques internally (gradient boosting), produces significantly smaller confidence intervals, however it is not computationally efficient and will take longer to run. @@ -39,7 +39,7 @@ References ---------- van der Laan MJ and Rubin D. Targeted maximum likelihood learning. In: ​U.C. Berkeley Division of -Biostatistics Working Paper Series, 2006. +Biostatistics Working Paper Series, 2006. van der Laan MJ and Gruber S. Collaborative double robust penalized targeted maximum likelihood estimation. In: The International Journal of Biostatistics 6(1), 2010. diff --git a/docs/changelog.rst b/docs/changelog.rst index 21000d6..d4fad02 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,6 +4,12 @@ Change Log ========== +Version 0.3.4 +------------- +- Removed XGBoost as a dependency. +- Now using sklearn's gradient boosting implementation. 
+ + Version 0.3.3 ------------- - Misc edits to paper and bibliography diff --git a/docs/conf.py b/docs/conf.py index 9df3d06..790fe35 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ author = 'Roni Kobrosly' # The full version, including alpha/beta/rc tags -release = '0.3.3' +release = '0.3.4' # -- General configuration --------------------------------------------------- diff --git a/docs/install.rst b/docs/install.rst index 1b0d02c..b683e34 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -27,7 +27,6 @@ causal-curve requires: - scipy - six - statsmodels -- xgboost diff --git a/docs/requirements.txt b/docs/requirements.txt index 0f45556..6c001a9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -16,4 +16,3 @@ scikit-learn scipy six statsmodels -xgboost diff --git a/requirements.txt b/requirements.txt index 0f45556..6c001a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,3 @@ scikit-learn scipy six statsmodels -xgboost diff --git a/setup.py b/setup.py index ef7cf7b..e98bbfb 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="causal-curve", - version="0.3.3", + version="0.3.4", author="Roni Kobrosly", author_email="roni.kobrosly@gmail.com", description="A python library with tools to perform causal inference using \ @@ -38,7 +38,6 @@ 'scikit-learn', 'scipy', 'six', - 'statsmodels', - 'xgboost' + 'statsmodels' ] )