From 70d01e76a3e846c08583f085ffe5d6431c22907d Mon Sep 17 00:00:00 2001
From: Pierre Boyeau <pierre.boyeau@gmail.com>
Date: Mon, 4 Nov 2024 14:40:52 -0800
Subject: [PATCH 1/2] pvalues for logistic regression

---
 ppi_py/ppi.py          | 107 ++++++++++++++++++++++++++++++++++++++++-
 tests/test_logistic.py |  35 ++++++++++++++
 2 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/ppi_py/ppi.py b/ppi_py/ppi.py
index 43b9109..b5f293a 100644
--- a/ppi_py/ppi.py
+++ b/ppi_py/ppi.py
@@ -3,7 +3,7 @@
 from scipy.stats import norm, binom
 from scipy.optimize import brentq, minimize
 from statsmodels.regression.linear_model import OLS, WLS
-from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic
+from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic, _zstat_generic2
 from sklearn.linear_model import LogisticRegression, PoissonRegressor
 import warnings
 
@@ -961,6 +961,111 @@ def _logistic_get_stats(
     inv_hessian = np.linalg.inv(hessian).reshape(d, d)
     return grads, grads_hat, grads_hat_unlabeled, inv_hessian
 
+def ppi_logistic_pval(
+    X,
+    Y,
+    Yhat,
+    X_unlabeled,
+    Yhat_unlabeled,
+    lam=None,
+    coord=None,
+    optimizer_options=None,
+    w=None,
+    w_unlabeled=None,
+    alternative="two-sided",
+):
+    """Computes the prediction-powered pvalues on the logistic regression coefficients for the null hypothesis that the coefficient is zero.
+
+    Args:
+        X (ndarray): Covariates corresponding to the gold-standard labels.
+        Y (ndarray): Gold-standard labels.
+        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
+        X_unlabeled (ndarray): Covariates corresponding to the unlabeled data.
+        Yhat_unlabeled (ndarray): Predictions corresponding to the unlabeled data.
+        lam (float, optional): Power-tuning parameter (see `[ADZ23] <https://arxiv.org/abs/2311.01453>`__). The default value `None` will estimate the optimal value from data. Setting `lam=1` recovers PPI with no power tuning, and setting `lam=0` recovers the classical point estimate.
+        coord (int, optional): Coordinate for which to optimize `lam`. If `None`, it optimizes the total variance over all coordinates. Must be in {1, ..., d} where d is the shape of the estimand.
+        optimizer_options (dict, optional): Options to pass to the optimizer. See scipy.optimize.minimize for details.
+        w (ndarray, optional): Sample weights for the labeled data set.
+        w_unlabeled (ndarray, optional): Sample weights for the unlabeled data set.
+        alternative (str, optional): Alternative hypothesis, either 'two-sided', 'larger' or 'smaller'.
+
+    Returns:
+        ndarray: Prediction-powered point estimate of the logistic regression coefficients.
+
+    Notes:
+        `[ADZ23] <https://arxiv.org/abs/2311.01453>`__ A. N. Angelopoulos, J. C. Duchi, and T. Zrnic. PPI++: Efficient Prediction Powered Inference. arxiv:2311.01453, 2023.
+    """
+    n = Y.shape[0]
+    d = X.shape[1]
+    N = Yhat_unlabeled.shape[0]
+    w = np.ones(n) if w is None else w / w.sum() * n
+    w_unlabeled = (
+        np.ones(N)
+        if w_unlabeled is None
+        else w_unlabeled / w_unlabeled.sum() * N
+    )
+    use_unlabeled = lam != 0
+
+    ppi_pointest = ppi_logistic_pointestimate(
+        X,
+        Y,
+        Yhat,
+        X_unlabeled,
+        Yhat_unlabeled,
+        optimizer_options=optimizer_options,
+        lam=lam,
+        coord=coord,
+        w=w,
+        w_unlabeled=w_unlabeled,
+    )
+
+    grads, grads_hat, grads_hat_unlabeled, inv_hessian = _logistic_get_stats(
+        ppi_pointest,
+        X,
+        Y,
+        Yhat,
+        X_unlabeled,
+        Yhat_unlabeled,
+        w,
+        w_unlabeled,
+        use_unlabeled=use_unlabeled,
+    )
+    
+    if lam is None:
+        lam = _calc_lam_glm(
+            grads,
+            grads_hat,
+            grads_hat_unlabeled,
+            inv_hessian,
+            clip=True,
+        )
+        return ppi_logistic_pval(
+            X,
+            Y,
+            Yhat,
+            X_unlabeled,
+            Yhat_unlabeled,
+            optimizer_options=optimizer_options,
+            lam=lam,
+            coord=coord,
+            w=w,
+            w_unlabeled=w_unlabeled,
+            alternative=alternative,
+        )
+
+    var_unlabeled = np.cov(lam * grads_hat_unlabeled.T).reshape(d, d)
+    var = np.cov(grads.T - lam * grads_hat.T).reshape(d, d)
+    Sigma_hat = inv_hessian @ (n / N * var_unlabeled + var) @ inv_hessian
+
+    var_diag = np.sqrt(np.diag(Sigma_hat) / n)
+
+    pvals = _zstat_generic2(
+        ppi_pointest,
+        var_diag,
+        alternative=alternative,
+    )[1]
+    return pvals
+
 
 def ppi_logistic_ci(
     X,
diff --git a/tests/test_logistic.py b/tests/test_logistic.py
index 8843ad1..a3f1d1c 100644
--- a/tests/test_logistic.py
+++ b/tests/test_logistic.py
@@ -62,6 +62,41 @@ def test_ppi_logistic_pointestimate_recovers():
     )
     # Check that the point estimate is close to the true beta
     assert np.linalg.norm(beta_ppi_pointestimate - beta) < 0.2
+    
+    
+def test_ppi_logistic_pval_makesense():
+    # Make a synthetic regression problem
+    n = 10000
+    N = 100000
+    d = 3
+    X = np.random.randn(n, d)
+    beta = np.array([0, 0, 1.0])
+
+    Y = np.random.binomial(1, expit(X.dot(beta)))
+    Yhat = expit(X.dot(beta))
+    X_unlabeled = np.random.randn(N, d)
+    Yhat_unlabeled = expit(X_unlabeled.dot(beta))
+    beta_ppi_pval = ppi_logistic_pval(
+        X,
+        Y,
+        Yhat,
+        X_unlabeled,
+        Yhat_unlabeled,
+        lam=0.5,
+        optimizer_options={"gtol": 1e-3},
+    )
+    assert beta_ppi_pval[-1] < 0.1
+    
+    beta_ppi_pval = ppi_logistic_pval(
+        X,
+        Y,
+        Yhat,
+        X_unlabeled,
+        Yhat_unlabeled,
+        lam=None,
+        optimizer_options={"gtol": 1e-3},
+    )
+    assert beta_ppi_pval[-1] < 0.1
 
 
 def ppi_logistic_ci_subtest(i, alphas, n=1000, N=10000, d=1, epsilon=0.02):

From caef9a99be4f3bbf59353e88efdbc2785173604e Mon Sep 17 00:00:00 2001
From: Anastasios Angelopoulos <angelopoulos@berkeley.edu>
Date: Wed, 6 Nov 2024 23:59:31 -0800
Subject: [PATCH 2/2] [black + test run]

---
 ppi_py/ppi.py          | 9 +++++++--
 tests/test_logistic.py | 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/ppi_py/ppi.py b/ppi_py/ppi.py
index b5f293a..cfe240a 100644
--- a/ppi_py/ppi.py
+++ b/ppi_py/ppi.py
@@ -3,7 +3,11 @@
 from scipy.stats import norm, binom
 from scipy.optimize import brentq, minimize
 from statsmodels.regression.linear_model import OLS, WLS
-from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic, _zstat_generic2
+from statsmodels.stats.weightstats import (
+    _zconfint_generic,
+    _zstat_generic,
+    _zstat_generic2,
+)
 from sklearn.linear_model import LogisticRegression, PoissonRegressor
 import warnings
 
@@ -961,6 +965,7 @@ def _logistic_get_stats(
     inv_hessian = np.linalg.inv(hessian).reshape(d, d)
     return grads, grads_hat, grads_hat_unlabeled, inv_hessian
 
+
 def ppi_logistic_pval(
     X,
     Y,
@@ -1030,7 +1035,7 @@ def ppi_logistic_pval(
         w_unlabeled,
         use_unlabeled=use_unlabeled,
     )
-    
+
     if lam is None:
         lam = _calc_lam_glm(
             grads,
diff --git a/tests/test_logistic.py b/tests/test_logistic.py
index a3f1d1c..00c977c 100644
--- a/tests/test_logistic.py
+++ b/tests/test_logistic.py
@@ -62,8 +62,8 @@ def test_ppi_logistic_pointestimate_recovers():
     )
     # Check that the point estimate is close to the true beta
     assert np.linalg.norm(beta_ppi_pointestimate - beta) < 0.2
-    
-    
+
+
 def test_ppi_logistic_pval_makesense():
     # Make a synthetic regression problem
     n = 10000
@@ -86,7 +86,7 @@ def test_ppi_logistic_pval_makesense():
         optimizer_options={"gtol": 1e-3},
     )
     assert beta_ppi_pval[-1] < 0.1
-    
+
     beta_ppi_pval = ppi_logistic_pval(
         X,
         Y,