New argument for tests: D, included exogenous variables (#94)
* New argument for Wald test: D

* New argument for AR: D

* Additional arg to simulate_gaussian_iv: md

* New argument for likelihood_ratio: D

* Add D argument to lagrange_multiplier.

* Add D argument to CLR test.

* Fix some tests.

* Some bugfixes.

* Fix last test.

* Changelog.

* Add test.

* More testing.

* Fix inverse_wald output.

* Docstring.

* Revert rank.

* lagrange docstring.

* Fix LM inverse.

* Fix warnings.

* Don't test LM.

* This time correctly.

* Remove superfluous code.
mlondschien authored Jul 18, 2024
1 parent fa32c3d commit 94b8f8d
Showing 28 changed files with 724 additions and 357 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.rst
@@ -21,6 +21,16 @@ Changelog
`endogenous_names_`, `exogenous_names_`, and `instrument_names_`. If pandas is
installed, there's also `names_coefs_`.

- The tests :func:`~ivmodels.tests.anderson_rubin_test`,
:func:`~ivmodels.tests.lagrange_multiplier_test`,
:func:`~ivmodels.tests.likelihood_ratio_test`, and
:func:`~ivmodels.tests.wald_test` and their inverses
:func:`~ivmodels.tests.inverse_anderson_rubin_test`,
:func:`~ivmodels.tests.inverse_lagrange_multiplier_test`,
:func:`~ivmodels.tests.inverse_likelihood_ratio_test`, and
:func:`~ivmodels.tests.inverse_wald_test` now support an additional parameter `D`
of exogenous covariates to be included in the test. This is not supported for
the conditional likelihood ratio test.


**Other changes:**
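To illustrate the changelog entry above, here is a minimal, hypothetical sketch of how the new `D` argument might be used with the updated test and simulation signatures from this commit. The data dimensions and the choice of the Anderson-Rubin test are illustrative, and the keyword interface is assumed from the `Summary.fit` calls further down.

import numpy as np

from ivmodels.simulate import simulate_gaussian_iv
from ivmodels.tests import anderson_rubin_test, inverse_anderson_rubin_test

# simulate_gaussian_iv now also returns the included exogenous variables D
# (md columns) in addition to Z, X, y, C and W.
Z, X, y, C, W, D = simulate_gaussian_iv(n=1000, mx=2, k=5, u=2, mw=1, mc=1, md=1)

# Test H_0: the coefficients of X and D are jointly zero, with W as nuisance
# endogenous regressors and C as exogenous controls.
# (Assumption: beta stacks the coefficients of X and D, as in Summary.fit below.)
statistic, p_value = anderson_rubin_test(
    Z=Z, X=X, y=y, W=W, C=C, D=D, beta=np.zeros(X.shape[1] + D.shape[1])
)

# Joint confidence set for the coefficients of X and D.
confidence_set = inverse_anderson_rubin_test(
    Z=Z, X=X, y=y, W=W, C=C, D=D, alpha=0.05
)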
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/kclass.py
@@ -16,7 +16,7 @@ def setup(self, kappa, data):
Z, X, y, C, _ = simulate_guggenberger12(n=1000, k=10, seed=0)
else:
n, mx, k, mc = data
Z, X, y, C, _ = simulate_gaussian_iv(n=n, mx=mx, k=k, u=mx, mc=mc)
Z, X, y, C, _, _ = simulate_gaussian_iv(n=n, mx=mx, k=k, u=mx, mc=mc)

self.data = {
"Z": Z,
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/tests.py
@@ -32,7 +32,7 @@ def setup(self, n, data):
)
else:
k, mx, mw, mc = data
Z, X, y, C, W, beta = simulate_gaussian_iv(
Z, X, y, C, W, _, beta = simulate_gaussian_iv(
n=n, mx=mx, k=k, u=mx, mc=mc, mw=mw, return_beta=True
)

18 changes: 10 additions & 8 deletions ivmodels/models/kclass.py
@@ -363,7 +363,8 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
"Variable names must not contain 'intercept' when fit_intercept=True."
)

n, mx = X.shape
n, k = Z.shape
mx, mc = X.shape[1], C.shape[1]

# Including an intercept is equivalent to replacing y <- M_1 y, X <- M_1 X,
# C <- M_1 C, Z <- M_1 Z, where M_1 = Id - 1/n 1 1^T is the projection
@@ -383,13 +384,13 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
y = y - y_mean
Z = Z - Z.mean(axis=0)

if C.shape[1] > 0:
if mc > 0:
c_mean = C.mean(axis=0)
C = C - c_mean
else:
c_mean = np.zeros(0)
else:
x_mean, y_mean, c_mean = np.zeros(mx), 0, np.zeros(C.shape[1])
x_mean, y_mean, c_mean = np.zeros(mx), 0, np.zeros(mc)

# Including exogenous covariates C can be done by the following approaches:
#
@@ -403,7 +404,7 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
# replace Z <- [Z C], X <- [X C], y <- y, and apply the k-class estimator to
# the augmented data. This is the approach taken here. Care needs to be taken
# to compute kappa. For this, we follow the first approach. See below.
if C.shape[1] > 0:
if mc > 0:
Z = np.hstack([Z, C])
# save some compute here, as proj([Z, C], C) = C
X_proj, y_proj = proj(Z, X, y)
@@ -432,7 +433,7 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
y_proj=y_proj,
)
self.kappa_liml_ = 1 + self.ar_min_
self.kappa_ = self.kappa_liml_ - self.fuller_alpha_ / (n - Z.shape[1])
self.kappa_ = self.kappa_liml_ - self.fuller_alpha_ / (n - k - mc)
else:
self.kappa_ = self.kappa

@@ -482,7 +483,7 @@ def predict(self, X, C=None, *args, **kwargs): # noqa D
(X, _, C), _ = self._X_Z_C(X, C=C, Z=None, predict=True)
return super().predict(np.hstack([X, C]), *args, **kwargs)

def summary(self, X, y, Z=None, C=None, test="wald (liml)", alpha=0.05, **kwargs):
def summary(self, X, y, Z=None, C=None, test="wald", alpha=0.05, **kwargs):
"""
Create Summary object for the fitted model.
@@ -504,8 +505,9 @@ def summary(self, X, y, Z=None, C=None, test="wald (liml)", alpha=0.05, **kwargs
specified, ``C`` must be ``None``. If ``C`` is specified,
``exogenous_names`` and ``exogenous_regex`` must be ``None``.
test: str, optional, default="wald (liml)"
The test to use. Must be one of "wald (liml)", "wald (tsls)", "anderson
rubin", "AR", or "lagrange multiplier".
The test to use. Must be one of "wald", "anderson-rubin",
"lagrange multiplier", "likelihood-ratio", or
"conditional likelihood-ratio".
alpha: float, optional, default=0.05
The significance level.
**kwargs
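The Fuller correction in the `fit` hunk above now writes the denominator explicitly as n - k - mc, i.e. the sample size minus the number of instruments and exogenous covariates; previously this was expressed as n - Z.shape[1] with Z already augmented by C, so the value is unchanged. A small numeric sketch with made-up values:

# Illustrative values only: ar_min stands for the minimal Anderson-Rubin
# statistic ar_min_ computed during fitting, fuller_alpha for fuller_alpha_.
n, k, mc = 1000, 10, 2
ar_min, fuller_alpha = 0.03, 1.0

kappa_liml = 1 + ar_min
# Same formula as self.kappa_ in the diff: the Fuller adjustment divides by
# the residual degrees of freedom after projecting onto [Z, C].
kappa = kappa_liml - fuller_alpha / (n - k - mc)
print(kappa)  # approximately 1.0290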
6 changes: 3 additions & 3 deletions ivmodels/quadric.py
@@ -137,9 +137,9 @@ def _boundary(self, num=200):
)

if np.all(self.D > 0) and (self.c_standardized > 0):
return np.zeros(shape=(0, 1))
return np.zeros(shape=(0, len(self.D)))
if np.all(self.D < 0) and (self.c_standardized <= 0):
return np.zeros(shape=(0, 1))
return np.zeros(shape=(0, len(self.D)))

# If both entries of D have the opposite sign as c_standardized, the
# boundary of the quadric is an ellipse.
@@ -239,7 +239,7 @@ def project(self, coordinates):
if mask.all():
return self

if ( # [~mask, ~mask] will to a .reshape(-1) on the matrix
if ( # [~mask, ~mask] does a .reshape(-1) on the matrix
scipy.linalg.eigvalsh(self.A[:, ~mask][~mask, :], subset_by_index=[0, 0])[0]
< 0
):
44 changes: 27 additions & 17 deletions ivmodels/simulate.py
@@ -5,7 +5,7 @@


def simulate_guggenberger12(
n, *, k, seed=0, h11=100, h12=1, rho=0.95, cov=None, return_beta=False
n, *, k, seed=0, h11=100, h12=1, rho=0.95, cov=None, return_beta=False, md=0
):
"""
Generate data by process as proposed by :cite:t:`guggenberger2012asymptotic`.
@@ -96,14 +96,18 @@

Z = rng.normal(0, 1, (n, k))

Pi_ZD = rng.normal(0, 1, (k, md))
D = rng.normal(0, 1, (n, md)) + Z @ Pi_ZD
delta = rng.normal(0, 0.1, (md, 1))

X = Z @ Pi_X + noise[:, 1:2]
W = Z @ Pi_W + noise[:, 2:]
y = X @ beta + W @ gamma + noise[:, 0:1]
y = X @ beta + W @ gamma + D @ delta + noise[:, 0:1]

if return_beta:
return Z, X, y.flatten(), None, W, beta.flatten()
return Z, X, y.flatten(), None, W, D, np.concatenate([beta, delta]).flatten()
else:
return Z, X, y.flatten(), None, W
return Z, X, y.flatten(), None, W, D


def simulate_gaussian_iv(
@@ -114,6 +118,7 @@
u=None,
mw=0,
mc=0,
md=0,
seed=0,
include_intercept=True,
return_beta=False,
@@ -171,32 +176,37 @@
ux = rng.normal(0, 1, (u, m))
uy = rng.normal(0, 1, (u, 1))

alpha = rng.normal(0, 1, (mc, 1))
alpha = rng.normal(0, 1, (mc + md, 1))
beta = rng.normal(0, 1, (m, 1))

Pi_ZX = rng.normal(0, 1, (k, m))
Pi_CX = rng.normal(0, 1, (mc, m))
Pi_CZ = rng.normal(0, 1, (mc, k))
Pi_CX = rng.normal(0, 1, (mc + md, m))
Pi_CZ = rng.normal(0, 1, (mc + md, k))

U = rng.normal(0, 1, (n, u))
C = rng.normal(0, 1, (n, mc)) + include_intercept * rng.normal(0, 1, (1, mc))
C = rng.normal(0, 1, (n, mc + md))
if include_intercept:
C += rng.normal(0, 1, (1, mc + md))

Z = (
rng.normal(0, 1, (n, k))
+ include_intercept * rng.normal(0, 1, (1, k))
+ C @ Pi_CZ
)
Z = rng.normal(0, 1, (n, k)) + C @ Pi_CZ
if include_intercept:
Z += rng.normal(0, 1, (1, k))

X = Z @ Pi_ZX + C @ Pi_CX + U @ ux
X += rng.normal(0, 1, (n, m)) + include_intercept * rng.normal(0, 1, (1, m))
y = C @ alpha + X @ beta + U @ uy
y += rng.normal(0, 1, (n, 1)) + include_intercept * rng.normal(0, 1, (1, 1))

X, W, C, D = X[:, :mx], X[:, mx:], C[:, :mc], C[:, mc:]

gamma0 = beta[mx:]
beta0 = np.concatenate([beta[:mx], alpha[mc:]])

if return_beta and return_gamma:
return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[:mx], beta[mx:]
return Z, X, y.flatten(), C, W, D, beta0, gamma0
elif return_beta:
return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[:mx]
return Z, X, y.flatten(), C, W, D, beta0
elif return_gamma:
return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[mx:]
return Z, X, y.flatten(), C, W, D, gamma0
else:
return Z, X[:, :mx], y.flatten(), C, X[:, mx:]
return Z, X, y.flatten(), C, W, D
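The two generators above now also return the included exogenous variables D. A short, hypothetical usage sketch of the new return signatures; the argument values are illustrative:

from ivmodels.simulate import simulate_gaussian_iv, simulate_guggenberger12

# simulate_guggenberger12: D enters the outcome via
# y = X @ beta + W @ gamma + D @ delta + noise; C is None for this design.
# With the default md=0, D has zero columns and the original DGP is recovered.
Z, X, y, C, W, D = simulate_guggenberger12(n=1000, k=10, seed=0, md=2)

# simulate_gaussian_iv: without flags the function now returns six arrays,
# Z (n, k), X (n, mx), y (n,), C (n, mc), W (n, mw) and D (n, md).
Z, X, y, C, W, D = simulate_gaussian_iv(n=500, mx=2, k=4, u=2, mw=1, mc=1, md=1)

# With return_beta=True there is a seventh output stacking the coefficients
# of X and D, i.e. np.concatenate([beta[:mx], alpha[mc:]]) in the code above.
Z, X, y, C, W, D, beta0 = simulate_gaussian_iv(
    n=500, mx=2, k=4, u=2, mw=1, mc=1, md=1, return_beta=True
)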
31 changes: 18 additions & 13 deletions ivmodels/summary.py
@@ -93,6 +93,8 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
if not str(self.test).lower() in _TESTS:
raise ValueError(f"Test {self.test} not recognized.")

n = X.shape[0]

test_, inverse_test_ = _TESTS.get(str(self.test).lower())

self.estimates_ = self.kclass.coef_.tolist()
@@ -102,40 +104,42 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):

(X, Z, C), _ = self.kclass._X_Z_C(X, Z, C, predict=False)

self.feature_names_ = self.kclass.endogenous_names_
# + self.kclass.exogenous_names_
self.feature_names_ = (
self.kclass.endogenous_names_ + self.kclass.exogenous_names_
)

idx = 0
# if self.kclass.fit_intercept:
# self.feature_names_ = ["intercept"] + self.feature_names_
# self.estimates_ = [self.kclass.intercept_] + self.estimates_
# idx -= 1
if self.kclass.fit_intercept:
self.feature_names_ = ["intercept"] + self.feature_names_
self.estimates_ = [self.kclass.intercept_] + self.estimates_
idx -= 1

for name in self.feature_names_:
if name in self.kclass.endogenous_names_:
mask = np.zeros(len(self.kclass.endogenous_names_), dtype=bool)
mask[idx] = True

X_, W_, C_, Z_ = X[:, mask], X[:, ~mask], C, Z
X_, W_, C_, D_ = X[:, mask], X[:, ~mask], C, np.zeros((n, 0))
fit_intercept_ = True

elif name in self.kclass.exogenous_names_:
mask = np.zeros(len(self.kclass.exogenous_names_), dtype=bool)
mask[idx - len(self.kclass.endogenous_names_)] = True

X_, W_, C_, Z_ = C[:, mask], X, C[:, ~mask], np.hstack([Z, C[:, mask]])
X_, W_, C_, D_ = np.zeros((n, 0)), X, C[:, ~mask], C[:, mask]
fit_intercept_ = True

elif name == "intercept":
ones = np.ones((X.shape[0], 1))
X_, W_, C_, Z_ = ones, X, C, np.hstack([Z, ones])
X_, W_, C_, D_ = np.zeros((n, 0)), X, C, np.ones((n, 1))
fit_intercept_ = False

test_result = test_(
Z=Z_,
Z=Z,
X=X_,
W=W_,
y=y,
C=C_,
D=D_,
beta=np.array([0]),
fit_intercept=fit_intercept_,
**kwargs,
Expand All @@ -144,11 +148,12 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
self.p_values_.append(test_result[1])

confidence_set = inverse_test_(
Z=Z_,
Z=Z,
X=X_,
W=W_,
y=y,
C=C_,
D=D_,
alpha=self.alpha,
fit_intercept=fit_intercept_,
**kwargs,
Expand All @@ -173,7 +178,7 @@ def __format__(self, format_spec: str) -> str: # noqa D
return "Summary not fitted yet."

def format_p_value(x):
return f"{x:{format_spec}}" if x > 1e-16 else "<1e-16"
return f"{x:{format_spec}}" if np.isnan(x) or x > 1e-16 else "<1e-16"

estimate_str = [f"{e:{format_spec}}" for e in self.estimates_]
statistics_str = [f"{s:{format_spec}}" for s in self.statistics_]
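For reference, a condensed, hypothetical sketch of the per-coefficient subproblem that `Summary.fit` now builds when testing a single exogenous coefficient, mirroring the `X_, W_, C_, D_` assignment in the diff above (the choice of the Wald test is illustrative):

import numpy as np

from ivmodels.tests import wald_test

def test_exogenous_coefficient(Z, X, y, C, j):
    # The j-th exogenous column moves from C into D, all endogenous
    # regressors become nuisance parameters W, and the remaining exogenous
    # columns stay in C, as in Summary.fit above.
    n = X.shape[0]
    mask = np.zeros(C.shape[1], dtype=bool)
    mask[j] = True
    X_, W_, C_, D_ = np.zeros((n, 0)), X, C[:, ~mask], C[:, mask]
    return wald_test(
        Z=Z, X=X_, W=W_, y=y, C=C_, D=D_, beta=np.array([0]), fit_intercept=True
    )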
