From 70d01e76a3e846c08583f085ffe5d6431c22907d Mon Sep 17 00:00:00 2001 From: Pierre Boyeau Date: Mon, 4 Nov 2024 14:40:52 -0800 Subject: [PATCH 1/2] pvalues for logistic regression --- ppi_py/ppi.py | 107 ++++++++++++++++++++++++++++++++++++++++- tests/test_logistic.py | 35 ++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/ppi_py/ppi.py b/ppi_py/ppi.py index 43b9109..b5f293a 100644 --- a/ppi_py/ppi.py +++ b/ppi_py/ppi.py @@ -3,7 +3,7 @@ from scipy.stats import norm, binom from scipy.optimize import brentq, minimize from statsmodels.regression.linear_model import OLS, WLS -from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic +from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic, _zstat_generic2 from sklearn.linear_model import LogisticRegression, PoissonRegressor import warnings @@ -961,6 +961,111 @@ def _logistic_get_stats( inv_hessian = np.linalg.inv(hessian).reshape(d, d) return grads, grads_hat, grads_hat_unlabeled, inv_hessian +def ppi_logistic_pval( + X, + Y, + Yhat, + X_unlabeled, + Yhat_unlabeled, + lam=None, + coord=None, + optimizer_options=None, + w=None, + w_unlabeled=None, + alternative="two-sided", +): + """Computes the prediction-powered pvalues on the logistic regression coefficients for the null hypothesis that the coefficient is zero. + + Args: + X (ndarray): Covariates corresponding to the gold-standard labels. + Y (ndarray): Gold-standard labels. + Yhat (ndarray): Predictions corresponding to the gold-standard labels. + X_unlabeled (ndarray): Covariates corresponding to the unlabeled data. + Yhat_unlabeled (ndarray): Predictions corresponding to the unlabeled data. + lam (float, optional): Power-tuning parameter (see `[ADZ23] `__). The default value `None` will estimate the optimal value from data. Setting `lam=1` recovers PPI with no power tuning, and setting `lam=0` recovers the classical point estimate. + coord (int, optional): Coordinate for which to optimize `lam`. If `None`, it optimizes the total variance over all coordinates. Must be in {1, ..., d} where d is the shape of the estimand. + optimizer_options (dict, optional): Options to pass to the optimizer. See scipy.optimize.minimize for details. + w (ndarray, optional): Sample weights for the labeled data set. + w_unlabeled (ndarray, optional): Sample weights for the unlabeled data set. + alternative (str, optional): Alternative hypothesis, either 'two-sided', 'larger' or 'smaller'. + + Returns: + ndarray: Prediction-powered point estimate of the logistic regression coefficients. + + Notes: + `[ADZ23] `__ A. N. Angelopoulos, J. C. Duchi, and T. Zrnic. PPI++: Efficient Prediction Powered Inference. arxiv:2311.01453, 2023. + """ + n = Y.shape[0] + d = X.shape[1] + N = Yhat_unlabeled.shape[0] + w = np.ones(n) if w is None else w / w.sum() * n + w_unlabeled = ( + np.ones(N) + if w_unlabeled is None + else w_unlabeled / w_unlabeled.sum() * N + ) + use_unlabeled = lam != 0 + + ppi_pointest = ppi_logistic_pointestimate( + X, + Y, + Yhat, + X_unlabeled, + Yhat_unlabeled, + optimizer_options=optimizer_options, + lam=lam, + coord=coord, + w=w, + w_unlabeled=w_unlabeled, + ) + + grads, grads_hat, grads_hat_unlabeled, inv_hessian = _logistic_get_stats( + ppi_pointest, + X, + Y, + Yhat, + X_unlabeled, + Yhat_unlabeled, + w, + w_unlabeled, + use_unlabeled=use_unlabeled, + ) + + if lam is None: + lam = _calc_lam_glm( + grads, + grads_hat, + grads_hat_unlabeled, + inv_hessian, + clip=True, + ) + return ppi_logistic_pval( + X, + Y, + Yhat, + X_unlabeled, + Yhat_unlabeled, + optimizer_options=optimizer_options, + lam=lam, + coord=coord, + w=w, + w_unlabeled=w_unlabeled, + alternative=alternative, + ) + + var_unlabeled = np.cov(lam * grads_hat_unlabeled.T).reshape(d, d) + var = np.cov(grads.T - lam * grads_hat.T).reshape(d, d) + Sigma_hat = inv_hessian @ (n / N * var_unlabeled + var) @ inv_hessian + + var_diag = np.sqrt(np.diag(Sigma_hat) / n) + + pvals = _zstat_generic2( + ppi_pointest, + var_diag, + alternative=alternative, + )[1] + return pvals + def ppi_logistic_ci( X, diff --git a/tests/test_logistic.py b/tests/test_logistic.py index 8843ad1..a3f1d1c 100644 --- a/tests/test_logistic.py +++ b/tests/test_logistic.py @@ -62,6 +62,41 @@ def test_ppi_logistic_pointestimate_recovers(): ) # Check that the point estimate is close to the true beta assert np.linalg.norm(beta_ppi_pointestimate - beta) < 0.2 + + +def test_ppi_logistic_pval_makesense(): + # Make a synthetic regression problem + n = 10000 + N = 100000 + d = 3 + X = np.random.randn(n, d) + beta = np.array([0, 0, 1.0]) + + Y = np.random.binomial(1, expit(X.dot(beta))) + Yhat = expit(X.dot(beta)) + X_unlabeled = np.random.randn(N, d) + Yhat_unlabeled = expit(X_unlabeled.dot(beta)) + beta_ppi_pval = ppi_logistic_pval( + X, + Y, + Yhat, + X_unlabeled, + Yhat_unlabeled, + lam=0.5, + optimizer_options={"gtol": 1e-3}, + ) + assert beta_ppi_pval[-1] < 0.1 + + beta_ppi_pval = ppi_logistic_pval( + X, + Y, + Yhat, + X_unlabeled, + Yhat_unlabeled, + lam=None, + optimizer_options={"gtol": 1e-3}, + ) + assert beta_ppi_pval[-1] < 0.1 def ppi_logistic_ci_subtest(i, alphas, n=1000, N=10000, d=1, epsilon=0.02): From caef9a99be4f3bbf59353e88efdbc2785173604e Mon Sep 17 00:00:00 2001 From: Anastasios Angelopoulos Date: Wed, 6 Nov 2024 23:59:31 -0800 Subject: [PATCH 2/2] [black + test run] --- ppi_py/ppi.py | 9 +++++++-- tests/test_logistic.py | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ppi_py/ppi.py b/ppi_py/ppi.py index b5f293a..cfe240a 100644 --- a/ppi_py/ppi.py +++ b/ppi_py/ppi.py @@ -3,7 +3,11 @@ from scipy.stats import norm, binom from scipy.optimize import brentq, minimize from statsmodels.regression.linear_model import OLS, WLS -from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic, _zstat_generic2 +from statsmodels.stats.weightstats import ( + _zconfint_generic, + _zstat_generic, + _zstat_generic2, +) from sklearn.linear_model import LogisticRegression, PoissonRegressor import warnings @@ -961,6 +965,7 @@ def _logistic_get_stats( inv_hessian = np.linalg.inv(hessian).reshape(d, d) return grads, grads_hat, grads_hat_unlabeled, inv_hessian + def ppi_logistic_pval( X, Y, @@ -1030,7 +1035,7 @@ def ppi_logistic_pval( w_unlabeled, use_unlabeled=use_unlabeled, ) - + if lam is None: lam = _calc_lam_glm( grads, diff --git a/tests/test_logistic.py b/tests/test_logistic.py index a3f1d1c..00c977c 100644 --- a/tests/test_logistic.py +++ b/tests/test_logistic.py @@ -62,8 +62,8 @@ def test_ppi_logistic_pointestimate_recovers(): ) # Check that the point estimate is close to the true beta assert np.linalg.norm(beta_ppi_pointestimate - beta) < 0.2 - - + + def test_ppi_logistic_pval_makesense(): # Make a synthetic regression problem n = 10000 @@ -86,7 +86,7 @@ def test_ppi_logistic_pval_makesense(): optimizer_options={"gtol": 1e-3}, ) assert beta_ppi_pval[-1] < 0.1 - + beta_ppi_pval = ppi_logistic_pval( X, Y,