From 4e27ae5f5f344dafc7129e8e1e1777011f4ce8b0 Mon Sep 17 00:00:00 2001
From: Malte Londschien <61679398+mlondschien@users.noreply.github.com>
Date: Fri, 23 Aug 2024 10:20:23 +0200
Subject: [PATCH] Add examples to docs (#114)
* Add card example.
* Add example.
* add angrist1991.
* Add angrist1991does to index.
* Add acemoglu2001origins.
* Update docs.
* link acemoglu example.
* Get rid of examples.rst.
* Rename.
* Fix outputs.
---
CHANGELOG.rst | 2 +-
README.md | 11 +-
docs/.gitignore | 2 +
docs/api.rst | 59 ++++
docs/changelog.rst | 1 +
docs/conf.py | 2 +
docs/examples/acemoglu2001colonial.ipynb | 382 +++++++++++++++++++++++
docs/examples/angrist1991does.ipynb | 234 ++++++++++++++
docs/examples/card1993using.ipynb | 1 +
docs/examples/tanaka2010risk.ipynb | 256 +++++++++++++++
docs/index.rst | 70 ++---
docs/installation.rst | 13 +
environment.yml | 4 +-
13 files changed, 994 insertions(+), 43 deletions(-)
create mode 100644 docs/.gitignore
create mode 100644 docs/api.rst
create mode 100644 docs/changelog.rst
create mode 100644 docs/examples/acemoglu2001colonial.ipynb
create mode 100644 docs/examples/angrist1991does.ipynb
create mode 100644 docs/examples/card1993using.ipynb
create mode 100644 docs/examples/tanaka2010risk.ipynb
create mode 100644 docs/installation.rst
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5759d1d..0df6020 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -49,7 +49,7 @@ Changelog
0.3.1 - 2024-07-30
------------------
-** Bug fixes:**
+**Bug fixes:**
- Fixed bug in
:class:`~ivmodels.tests.conditional_likelihood_ratio.inverse_conditional_likelihood_ratio_test`.
diff --git a/README.md b/README.md
index 2bffe4f..454f499 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,10 @@
-# IV Models
\ No newline at end of file
+# ivmodels
+
+ivmodels is a Python library for instrumental variable estimation.
+It implements
+
+ - K-Class estimators, including the Limited Information Maximum Likelihood (LIML) estimator and the Two-Stage Least Squares (TSLS) estimator.
+ - Tests and confidence sets for the parameters of the model, including the Anderson-Rubin test, the the Lagrange multiplier test, the (conditional) likelihood-ratio test, and the Wald test.
+ - Auxiliary tests such as Anderson's (1951) test of reduced rank (a multivariate extension to the first-stage F-test) and the J-statistic (including its LIML variant).
+
+
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..02a7e78
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,2 @@
+html
+doctrees
\ No newline at end of file
diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 0000000..d07c425
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,59 @@
+API
+===
+
+Estimators
+----------
+
+.. autoclass:: ivmodels.KClass
+ :members: fit
+ :noindex:
+
+.. autoclass:: ivmodels.models.AnchorRegression
+ :members: fit
+ :noindex:
+
+.. autoclass:: ivmodels.models.PULSE
+ :members: fit
+ :noindex:
+
+.. autoclass:: ivmodels.models.SpaceIV
+ :members: fit
+ :noindex:
+
+Tests
+-----
+
+.. automodule:: ivmodels.tests
+ :members:
+ :noindex:
+
+Summary
+-------
+
+.. autoclass:: ivmodels.summary.Summary
+ :members:
+ :noindex:
+
+.. autoclass:: ivmodels.summary.CoefficientTable
+ :members:
+ :noindex:
+
+ConfidenceSet
+-------------
+
+.. autoclass:: ivmodels.confidence_set.ConfidenceSet
+ :members:
+ :noindex:
+
+Quadric
+-------
+
+.. autoclass:: ivmodels.quadric.Quadric
+ :members:
+ :noindex:
+
+
+Bibliography
+------------
+
+.. bibliography::
diff --git a/docs/changelog.rst b/docs/changelog.rst
new file mode 100644
index 0000000..4d7817a
--- /dev/null
+++ b/docs/changelog.rst
@@ -0,0 +1 @@
+.. include:: ../CHANGELOG.rst
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index b976d04..deb0d67 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -32,9 +32,11 @@
"sphinx_rtd_theme",
"sphinx.ext.intersphinx",
"sphinx.ext.mathjax",
+ "nbsphinx",
"sphinxcontrib.bibtex",
]
+
bibtex_bibfiles = ["bib.bib"]
bibtex_reference_style = "author_year"
diff --git a/docs/examples/acemoglu2001colonial.ipynb b/docs/examples/acemoglu2001colonial.ipynb
new file mode 100644
index 0000000..aa24a92
--- /dev/null
+++ b/docs/examples/acemoglu2001colonial.ipynb
@@ -0,0 +1,382 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The Colonial Origins of Comparative Development: An Empirical Investigation\n",
+ "Daron Acemoglu, Simon Johnson, and James A. Robinson (2001)\n",
+ "\n",
+ "We replicate tables 4, 5.\n",
+ "The data is available under https://economics.mit.edu/people/faculty/daron-acemoglu/data-archive.\n",
+ "We download directly from the dropbox folders supplied.\n",
+ "We start with table 4.\n",
+ "In all specifications a single endogenous variable is instrumented by a single instrument, so the LIML and TSLS estimators are equal, as are the Anderson-Rubin, (conditional) likelihood-ratio, and Lagrange multiplier tests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " shortnam | \n",
+ " africa | \n",
+ " lat_abst | \n",
+ " rich4 | \n",
+ " avexpr | \n",
+ " logpgp95 | \n",
+ " logem4 | \n",
+ " asia | \n",
+ " loghjypl | \n",
+ " baseco | \n",
+ " other_continent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " AGO | \n",
+ " 1.0 | \n",
+ " 0.136667 | \n",
+ " 0.0 | \n",
+ " 5.363636 | \n",
+ " 7.770645 | \n",
+ " 5.634789 | \n",
+ " 0.0 | \n",
+ " -3.411248 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ARG | \n",
+ " 0.0 | \n",
+ " 0.377778 | \n",
+ " 0.0 | \n",
+ " 6.386364 | \n",
+ " 9.133459 | \n",
+ " 4.232656 | \n",
+ " 0.0 | \n",
+ " -0.872274 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " AUS | \n",
+ " 0.0 | \n",
+ " 0.300000 | \n",
+ " 1.0 | \n",
+ " 9.318182 | \n",
+ " 9.897972 | \n",
+ " 2.145931 | \n",
+ " 0.0 | \n",
+ " -0.170788 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " BFA | \n",
+ " 1.0 | \n",
+ " 0.144444 | \n",
+ " 0.0 | \n",
+ " 4.454545 | \n",
+ " 6.845880 | \n",
+ " 5.634789 | \n",
+ " 0.0 | \n",
+ " -3.540459 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " BGD | \n",
+ " 0.0 | \n",
+ " 0.266667 | \n",
+ " 0.0 | \n",
+ " 5.136364 | \n",
+ " 6.877296 | \n",
+ " 4.268438 | \n",
+ " 1.0 | \n",
+ " -2.063568 | \n",
+ " 1.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " shortnam africa lat_abst rich4 avexpr logpgp95 logem4 asia \\\n",
+ "1 AGO 1.0 0.136667 0.0 5.363636 7.770645 5.634789 0.0 \n",
+ "3 ARG 0.0 0.377778 0.0 6.386364 9.133459 4.232656 0.0 \n",
+ "5 AUS 0.0 0.300000 1.0 9.318182 9.897972 2.145931 0.0 \n",
+ "11 BFA 1.0 0.144444 0.0 4.454545 6.845880 5.634789 0.0 \n",
+ "12 BGD 0.0 0.266667 0.0 5.136364 6.877296 4.268438 1.0 \n",
+ "\n",
+ " loghjypl baseco other_continent \n",
+ "1 -3.411248 1.0 0 \n",
+ "3 -0.872274 1.0 0 \n",
+ "5 -0.170788 1.0 1 \n",
+ "11 -3.540459 1.0 0 \n",
+ "12 -2.063568 1.0 0 "
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from io import BytesIO\n",
+ "from zipfile import ZipFile\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import requests\n",
+ "\n",
+ "url4 = \"https://www.dropbox.com/scl/fi/3yuv9j514zuajzjfluoc1/maketable4.zip?rlkey=pq9l7bxktw1iqxe6fmoh26g79&e=1&dl=1\"\n",
+ "content4 = requests.get(url4).content\n",
+ "\n",
+ "with ZipFile(BytesIO(content4)).open(\"maketable4.dta\") as file:\n",
+ " df4 = pd.read_stata(file)\n",
+ "\n",
+ "df4 = df4[lambda x: x[\"baseco\"] == 1]\n",
+ "df4[\"other_continent\"] = df4[\"shortnam\"].isin([\"AUS\", \"MLT\", \"NZL\"]).astype(int)\n",
+ "\n",
+ "df4.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Column (1): 0.9443 (0.1565)\n",
+ "wald: 36.3941 (1.6e-09), ar: 56.6029 (5.32907e-14), f: 23 (1.7e-06)\n",
+ "Column (2): 0.9957 (0.2217)\n",
+ "wald: 20.1744 (7.1e-06), ar: 36.2442 (1.74077e-09), f: 13 (0.0003)\n",
+ "Column (3): 1.2813 (0.3585)\n",
+ "wald: 12.7735 (0.00035), ar: 34.3118 (4.69521e-09), f: 8.6 (0.0033)\n",
+ "Column (4): 1.2117 (0.3543)\n",
+ "wald: 11.6997 (0.00063), ar: 28.1328 (1.13272e-07), f: 7.8 (0.0051)\n",
+ "Column (5): 0.5780 (0.0981)\n",
+ "wald: 34.6971 (3.9e-09), ar: 24.2217 (8.5858e-07), f: 31 (3.3e-08)\n",
+ "Column (6): 0.5757 (0.1173)\n",
+ "wald: 24.0864 (9.2e-07), ar: 16.9777 (3.7822e-05), f: 22 (3.3e-06)\n",
+ "Column (7): 0.9822 (0.2995)\n",
+ "wald: 10.7573 (0.001), ar: 19.3383 (1.09486e-05), f: 6.2 (0.013)\n",
+ "Column (8): 1.1071 (0.4636)\n",
+ "wald: 5.7032 (0.017), ar: 13.5579 (0.000231311), f: 3.5 (0.063)\n",
+ "Column (9): 0.9808 (0.1709)\n",
+ "wald: 32.9327 (9.5e-09), ar: 106.2383 ( 0), f: 24 (1.1e-06)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from ivmodels import KClass\n",
+ "from ivmodels.tests import wald_test, anderson_rubin_test, rank_test\n",
+ "\n",
+ "endogenous = [\"avexpr\"] # average protection against expropriation risk\n",
+ "instruments = [\"logem4\"] # log european settler mortality\n",
+ "\n",
+ "for column, (outcome, exogenous, filter) in enumerate([\n",
+ " # *Columns 1 - 2 (Base Sample)\n",
+ " # ivreg logpgp95 (avexpr=logem4), first\n",
+ " (\"logpgp95\", [], None),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4), first\n",
+ " (\"logpgp95\", [\"lat_abst\"], None),\n",
+ " # *Columns 3 - 4 (Base Sample w/o Neo-Europes)\n",
+ " # ivreg logpgp95 (avexpr=logem4) if rich4!=1, first\n",
+ " (\"logpgp95\", [], \"rich4!=1\"),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) if rich4!=1, first\n",
+ " (\"logpgp95\", [\"lat_abst\"], \"rich4!=1\"),\n",
+ " # *Columns 5 - 6 (Base Sample w/o Africa)\n",
+ " # ivreg logpgp95 (avexpr=logem4) if africa!=1, first\n",
+ " (\"logpgp95\", [], \"africa!=1\"),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) if africa!=1, first\n",
+ " (\"logpgp95\", [\"lat_abst\"], \"africa!=1\"),\n",
+ " # *Columns 7 - 8 (Base Sample with continent dummies)\n",
+ " # ivreg logpgp95 (avexpr=logem4) africa asia other_cont, first\n",
+ " (\"logpgp95\", [\"africa\", \"asia\", \"other_continent\"], None),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) africa asia other_cont, first\n",
+ " (\"logpgp95\", [\"lat_abst\", \"africa\", \"asia\", \"other_continent\"], None),\n",
+ " # *Column 9 (Base Sample, log GDP per worker)\n",
+ " # ivreg loghjypl (avexpr=logem4), first\n",
+ " (\"loghjypl\", [], None),\n",
+ "]):\n",
+ " data = df4 if filter is None else df4.copy().query(filter)\n",
+ " data = data[lambda x: x[outcome].notna()]\n",
+ "\n",
+ " X = data[endogenous]\n",
+ " Z = data[instruments]\n",
+ " C = data[exogenous]\n",
+ " y = data[outcome]\n",
+ "\n",
+ " estimator = KClass(kappa=\"tsls\").fit(Z=Z, X=X, C=C, y=y)\n",
+ " \n",
+ " wald, wald_p = wald_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " std_error = np.abs(estimator.coef_[0]) / np.sqrt(wald)\n",
+ " ar, ar_p = anderson_rubin_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " f, f_p = rank_test(X=X, Z=Z, C=C)\n",
+ "\n",
+ " print(f\"Column ({column + 1}): {estimator.coef_[0]:.4f} ({std_error:.4f})\")\n",
+ " print(f\"wald: {wald:.4f} ({wald_p:.2g}), ar: {ar:.4f} ({ar_p:2g}), f: {f:.2g} ({f_p:.2g})\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Identification is weak, with f-statistics dropping below the heuristic \"weak instrument threshold\" 10, but the causal effect of average protection against expropriation risk on log-gdp is still significant at level 0.001 using the weak-instrument-robust Anderson-Rubint test.\n",
+ "\n",
+ "We continue with table 5."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Column (1): 1.0778 (0.2176)\n",
+ "wald: 24.5314 (7.3e-07), ar: 44.8089 (2.17235e-11), f: 15 (0.00013)\n",
+ "Column (2): 1.1553 (0.3372)\n",
+ "wald: 11.7400 (0.00061), ar: 26.1369 (3.18051e-07), f: 7.3 (0.0069)\n",
+ "Column (3): 1.0662 (0.2443)\n",
+ "wald: 19.0473 (1.3e-05), ar: 34.7046 (3.83717e-09), f: 9.8 (0.0017)\n",
+ "Column (4): 1.3391 (0.5164)\n",
+ "wald: 6.7227 (0.0095), ar: 20.4897 (5.99543e-06), f: 3.9 (0.049)\n",
+ "Column (5): 1.0800 (0.1912)\n",
+ "wald: 31.9153 (1.6e-08), ar: 55.7875 (8.07132e-14), f: 18 (1.9e-05)\n",
+ "Column (6): 1.1811 (0.2910)\n",
+ "wald: 16.4715 (4.9e-05), ar: 36.1450 (1.83171e-09), f: 9.9 (0.0017)\n",
+ "Column (7): 0.9174 (0.1467)\n",
+ "wald: 39.1013 (4e-10), ar: 51.2727 (8.03801e-13), f: 20 (8.4e-06)\n",
+ "Column (8): 1.0062 (0.2517)\n",
+ "wald: 15.9767 (6.4e-05), ar: 27.0786 (1.95353e-07), f: 8.6 (0.0033)\n",
+ "Column (9): 1.2122 (0.3949)\n",
+ "wald: 9.4226 (0.0021), ar: 23.7357 (1.10512e-06), f: 5.3 (0.022)\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "url5 = \"https://www.dropbox.com/scl/fi/qiqyoc34vtr5tvgo4ew7n/maketable5.zip?rlkey=7xn2e59lqn8skf1psv0f7lkr0&e=1&dl=1\"\n",
+ "content5 = requests.get(url5).content\n",
+ "\n",
+ "with ZipFile(BytesIO(content5)).open(\"maketable5.dta\") as file:\n",
+ " df5 = pd.read_stata(file)\n",
+ "\n",
+ "df5 = df5[lambda x: x[\"baseco\"] == 1]\n",
+ "\n",
+ "outcome = 'logpgp95'\n",
+ "endogenous = [\"avexpr\"]\n",
+ "instruments = [\"logem4\"]\n",
+ "\n",
+ "for column, (exogenous, filter) in enumerate([\n",
+ " # *--Columns 1 and 2 (British and French colony dummies)\n",
+ " # ivreg logpgp95 (avexpr=logem4) f_brit f_french, first\n",
+ " ([\"f_brit\", \"f_french\"], None),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) f_brit f_french, first\n",
+ " ([\"lat_abst\", \"f_brit\", \"f_french\"], None),\n",
+ " # *--Columns 3 and 4 (British colonies only)\n",
+ " # ivreg logpgp95 (avexpr=logem4) if f_brit==1, first\n",
+ " ([], \"f_brit==1\"),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) if f_brit==1, first\n",
+ " ([\"lat_abst\"], \"f_brit==1\"),\n",
+ " # *--Columns 5 and 6 (Control for French legel origin)\n",
+ " # ivreg logpgp95 (avexpr=logem4) sjlofr, first\n",
+ " ([\"sjlofr\"], None),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) sjlofr, first\n",
+ " ([\"lat_abst\", \"sjlofr\"], None),\n",
+ " # *--Columns 7 and 8 (Religion dummies)\n",
+ " # ivreg logpgp95 (avexpr=logem4) catho80 muslim80 no_cpm80, first\n",
+ " ([\"catho80\", \"muslim80\", \"no_cpm80\"], None),\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) catho80 muslim80 no_cpm80, first\n",
+ " ([\"lat_abst\", \"catho80\", \"muslim80\", \"no_cpm80\"], None), \n",
+ " # *--Columns 9 (Multiple controls)\n",
+ " # ivreg logpgp95 lat_abst (avexpr=logem4) f_french sjlofr catho80 muslim80 no_cpm80, first\n",
+ " ([\"lat_abst\", \"f_french\", \"sjlofr\", \"catho80\", \"muslim80\", \"no_cpm80\"], None)\n",
+ "]):\n",
+ " data = df5 if filter is None else df5.copy().query(filter)\n",
+ " data = data[lambda x: x[outcome].notna()]\n",
+ "\n",
+ " X = data[endogenous]\n",
+ " Z = data[instruments]\n",
+ " C = data[exogenous]\n",
+ " y = data[outcome]\n",
+ "\n",
+ " estimator = KClass(kappa=\"tsls\").fit(Z=Z, X=X, C=C, y=y)\n",
+ " \n",
+ " wald, wald_p = wald_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " std_error = np.abs(estimator.coef_[0]) / np.sqrt(wald)\n",
+ " ar, ar_p = anderson_rubin_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " f, f_p = rank_test(X=X, Z=Z, C=C)\n",
+ "\n",
+ " print(f\"Column ({column + 1}): {estimator.coef_[0]:.4f} ({std_error:.4f})\")\n",
+ " print(f\"wald: {wald:.4f} ({wald_p:.2g}), ar: {ar:.4f} ({ar_p:2g}), f: {f:.2g} ({f_p:.2g})\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is not a perfect replicates of the results presented by Acemoglu et al. (2001) in Table 4.\n",
+ "However, it does match the results when running their Stata code (`maketable5.do` in the archive).\n",
+ "\n",
+ "As for table 4, identification is weak, but the causal effect of average protection against expropriation risk on log-gdp is still significant at level 0.001 using the weak-instrument-robust Anderson-Rubint test."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ivmodels",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/examples/angrist1991does.ipynb b/docs/examples/angrist1991does.ipynb
new file mode 100644
index 0000000..27029b3
--- /dev/null
+++ b/docs/examples/angrist1991does.ipynb
@@ -0,0 +1,234 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Does Compulsory School Attendance Affect Schooling and Earnings?\n",
+ "\n",
+ "We replicate tables IV, V, and VI of Angrist and Krueger (1991).\n",
+ "We start by loading the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import requests\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "import subprocess\n",
+ "import tempfile\n",
+ "\n",
+ "url = \"https://economics.mit.edu/sites/default/files/inline-files/NEW7080_1.rar\"\n",
+ "\n",
+ "\n",
+ "dir = tempfile.TemporaryDirectory()\n",
+ "with open(f\"{dir.name}/file.rar\", 'wb') as file:\n",
+ " file.write(requests.get(url).content)\n",
+ "\n",
+ "subprocess.run([\"tar\", \"xf\", f\"{dir.name}/file.rar\", \"-C\", dir.name])\n",
+ "df = pd.read_stata(f\"{dir.name}/NEW7080.dta\")\n",
+ "\n",
+ "# renaming from\n",
+ "# https://economics.mit.edu/sites/default/files/inline-files/Descriptive%20Statistics%20QOB.txt\n",
+ "df = df.rename(columns={\n",
+ " \"v1\": \"age\",\n",
+ " \"v2\": \"ageq\",\n",
+ " \"v4\": \"educ\",\n",
+ " \"v5\": \"enocent\",\n",
+ " \"v6\": \"esocent\",\n",
+ " \"v9\": \"lwklywge\",\n",
+ " \"v10\": \"married\",\n",
+ " \"v11\": \"midatl\",\n",
+ " \"v12\": \"mt\",\n",
+ " \"v13\": \"neweng\",\n",
+ " \"v16\": \"census\",\n",
+ " \"v18\": \"qob\",\n",
+ " \"v19\": \"race\",\n",
+ " \"v20\": \"smsa\",\n",
+ " \"v21\": \"soatl\",\n",
+ " \"v24\": \"wnocent\",\n",
+ " \"v25\": \"wsocent\",\n",
+ " \"v27\": \"yob\",\n",
+ "})\n",
+ "\n",
+ "# replace AGEQ=AGEQ-1900 if CENSUS==80\n",
+ "df.loc[lambda x: x[\"census\"].eq(80), \"ageq\"] -= 1900\n",
+ "# gen AGEQSQ= AGEQ*AGEQ\n",
+ "df[\"ageqsq\"] = df[\"ageq\"] ** 2\n",
+ "\n",
+ "df[\"yob_dummies\"] = df[\"yob\"] % 10\n",
+ "yob_encoder = OneHotEncoder(\n",
+ " categories=[list(range(9))],\n",
+ " sparse_output=False,\n",
+ " handle_unknown=\"ignore\"\n",
+ ")\n",
+ "yob_encoder.set_output(transform=\"pandas\")\n",
+ "yob_dummies = yob_encoder.fit_transform(df[[\"yob_dummies\"]])\n",
+ "\n",
+ "df[\"yqob\"] = df[\"yob_dummies\"].astype(\"str\") + df[\"qob\"].astype(\"str\")\n",
+ "yqob_encoder = OneHotEncoder(\n",
+ " categories=[[f\"{y}{q}\" for y in range(10) for q in [2, 3, 4]]],\n",
+ " sparse_output=False,\n",
+ " handle_unknown=\"ignore\"\n",
+ ").set_output(transform=\"pandas\")\n",
+ "yqob_dummies = yqob_encoder.fit_transform(df[[\"yqob\"]])\n",
+ "\n",
+ "df = pd.concat([df, yob_dummies, yqob_dummies], axis=1)\n",
+ "\n",
+ "cohorts = {\n",
+ " \"IV\": df[lambda x: x[\"yob\"].isin(range(1920, 1930))],\n",
+ " \"V\": df[lambda x: x[\"yob\"].isin(range(30, 40))],\n",
+ " \"VI\": df[lambda x: x[\"yob\"].isin(range(40, 50))],\n",
+ "}\n",
+ "\n",
+ "age = [\"age\", \"ageqsq\"]\n",
+ "other = [\"race\", \"married\", \"smsa\"]\n",
+ "region = [\"neweng\", \"midatl\", \"enocent\", \"wnocent\", \"soatl\", \"esocent\", \"wsocent\", \"mt\"]\n",
+ "\n",
+ "yob_names = yob_encoder.get_feature_names_out().tolist()\n",
+ "yqob_names = yqob_encoder.get_feature_names_out().tolist()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We now replicate results from tables IV, V, and VI.\n",
+ "We don't perfectly replicate columns (4), (8), as the authors include age and age squared only in the first, but not the second stage.\n",
+ "We include them in both stages.\n",
+ "See also the following code from https://economics.mit.edu/sites/default/files/inline-files/QOB%20Table%20IV.do:\n",
+ "```stata\n",
+ "** Col 2 4 6 8 ***\n",
+ "ivregress 2sls LWKLYWGE YR20-YR28 (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
+ "ivregress 2sls LWKLYWGE YR20-YR28 AGEQ AGEQSQ (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
+ "ivregress 2sls LWKLYWGE YR20-YR28 RACE MARRIED SMSA NEWENG MIDATL ENOCENT WNOCENT SOATL ESOCENT WSOCENT MT (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
+ "ivregress 2sls LWKLYWGE YR20-YR28 RACE MARRIED SMSA NEWENG MIDATL ENOCENT WNOCENT SOATL ESOCENT WSOCENT MT AGEQ AGEQSQ (EDUC = QTR120-QTR129 QTR220-QTR229 QTR320-QTR329 YR20-YR28)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Table IV\n",
+ "Column (1), 0.0802 (0.0004)\n",
+ "Column (2), 0.0769 (0.0150)\n",
+ "wald: 3.2e-07, ar: 0.0085, clr: 0.00052, lm: 0.00093, j: 0.17\n",
+ "Column (3), 0.0802 (0.0004)\n",
+ "Column (4), 0.1352 (0.0337)\n",
+ "wald: 6e-05, ar: 0.095, clr: 0.11, lm: 0.33, j: 0.8\n",
+ "Column (5), 0.0701 (0.0004)\n",
+ "Column (6), 0.0669 (0.0151)\n",
+ "wald: 9.4e-06, ar: 0.028, clr: 0.002, lm: 0.0028, j: 0.23\n",
+ "Column (7), 0.0701 (0.0004)\n",
+ "Column (8), 0.1039 (0.0341)\n",
+ "wald: 0.0023, ar: 0.2, clr: 0.27, lm: 0.7, j: 0.69\n",
+ "\n",
+ "Table V\n",
+ "Column (1), 0.0711 (0.0003)\n",
+ "Column (2), 0.0891 (0.0161)\n",
+ "wald: 3.2e-08, ar: 0.013, clr: 1.2e-05, lm: 1e-05, j: 0.66\n",
+ "Column (3), 0.0711 (0.0003)\n",
+ "Column (4), 0.0655 (0.0280)\n",
+ "wald: 0.019, ar: 0.64, clr: 0.38, lm: 0.33, j: 0.71\n",
+ "Column (5), 0.0632 (0.0003)\n",
+ "Column (6), 0.0806 (0.0164)\n",
+ "wald: 8.8e-07, ar: 0.064, clr: 7.4e-05, lm: 5e-05, j: 0.8\n",
+ "Column (7), 0.0632 (0.0003)\n",
+ "Column (8), 0.0509 (0.0279)\n",
+ "wald: 0.069, ar: 0.85, clr: 0.51, lm: 0.42, j: 0.87\n",
+ "\n",
+ "Table VI\n",
+ "Column (1), 0.0573 (0.0003)\n",
+ "Column (2), 0.0553 (0.0138)\n",
+ "wald: 5.8e-05, ar: 5.6e-11, clr: 0.0089, lm: 0.039, j: 5.4e-10\n",
+ "Column (3), 0.0574 (0.0003)\n",
+ "Column (4), 0.1293 (0.0191)\n",
+ "wald: 1.4e-11, ar: 1.6e-09, clr: 1.9e-10, lm: 9.8e-08, j: 0.0032\n",
+ "Column (5), 0.0520 (0.0003)\n",
+ "Column (6), 0.0393 (0.0145)\n",
+ "wald: 0.0067, ar: 1e-08, clr: 0.19, lm: 0.29, j: 1.1e-08\n",
+ "Column (7), 0.0521 (0.0003)\n",
+ "Column (8), 0.1138 (0.0200)\n",
+ "wald: 1.3e-08, ar: 2.5e-07, clr: 1.3e-07, lm: 1.6e-05, j: 0.0025\n"
+ ]
+ }
+ ],
+ "source": [
+ "from ivmodels import KClass\n",
+ "from ivmodels.tests import wald_test, anderson_rubin_test, conditional_likelihood_ratio_test, lagrange_multiplier_test, j_test\n",
+ "\n",
+ "for table, cohort in cohorts.items():\n",
+ " print(f\"\\nTable {table}\")\n",
+ " for column, kappa, exogenous in [\n",
+ " (\"(1)\", \"ols\", yob_names),\n",
+ " (\"(2)\", \"tsls\", yob_names),\n",
+ " (\"(3)\", \"ols\", yob_names + age),\n",
+ " (\"(4)\", \"tsls\", yob_names + age),\n",
+ " (\"(5)\", \"ols\", yob_names + region + other),\n",
+ " (\"(6)\", \"tsls\", yob_names + region + other),\n",
+ " (\"(7)\", \"ols\", yob_names + region + other + age),\n",
+ " (\"(8)\", \"tsls\", yob_names + region + other + age)\n",
+ " ]:\n",
+ " y = cohort[[\"lwklywge\"]]\n",
+ " X = cohort[[\"educ\"]]\n",
+ " C = cohort[exogenous]\n",
+ " Z = cohort[yqob_names]\n",
+ " estimator = KClass(kappa).fit(X=X, y=y, C=C, Z=Z)\n",
+ "\n",
+ " wald_stat, wald_p = wald_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1), estimator=kappa)\n",
+ " std_error = np.abs(estimator.coef_[0]) / np.sqrt(wald_stat)\n",
+ "\n",
+ " print(f\"Column {column}, {estimator.coef_[0]:.4f} ({std_error:.4f})\")\n",
+ "\n",
+ " if kappa == \"tsls\":\n",
+ " _, ar_p = anderson_rubin_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " _, clr_p = conditional_likelihood_ratio_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " _, lm_p = lagrange_multiplier_test(X=X, y=y, Z=Z, C=C, beta=np.zeros(1))\n",
+ " _, j_p = j_test(X=X, y=y, Z=Z, C=C)\n",
+ " print(f\"wald: {wald_p:.2g}, ar: {ar_p:.2g}, clr: {clr_p:.2g}, lm: {lm_p:.2g}, j: {j_p:.2g}\")\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notably, for cohorts 1920 - 1929 and 1930 - 1939, the causal effect of education on wages is no longer significant at level 0.05 if using weak-instrument-robust inference and if age and its square are included as an exogenous variables.\n",
+ "The LIML variant of the J-statistic rejects the null of correct model specification at level 0.01 for cohort 1940 - 49, making any inference questionable."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ivmodels",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/examples/card1993using.ipynb b/docs/examples/card1993using.ipynb
new file mode 100644
index 0000000..f29863a
--- /dev/null
+++ b/docs/examples/card1993using.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"markdown","metadata":{},"source":["## Using Geographic Variation in College Proximity to Estimate the Return to Schooling\n","\n","Card (1993) estimates the causal effect of length of education on hourly wages.\n","\n","We start by loading the data."]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["from io import BytesIO\n","from zipfile import ZipFile\n","\n","import numpy as np\n","import pandas as pd\n","import requests\n","\n","url = \"https://davidcard.berkeley.edu/data_sets/proximity.zip\"\n","content = requests.get(url).content\n","\n","# From code_bk.txt in the zip file\n","colspec = {\n"," \"id\": (1, 5), # sequential id runs from 1 to 5225\n"," \"nearc2\": (7, 7), # grew up near 2-yr college\n"," \"nearc4\": (10, 10), # grew up near 4-yr college\n"," \"nearc4a\": (12, 13), # grew up near 4-yr public college\n"," \"nearc4b\": (15, 16), # grew up near 4-yr priv college\n"," \"ed76\": (18, 19), # educ in 1976\n"," \"ed66\": (21, 22), # educ in 1966\n"," \"age76\": (24, 25), # age in 1976\n"," \"daded\": (27, 31), # dads education missing=avg\n"," \"nodaded\": (33, 33), # 1 if dad ed imputed\n"," \"momed\": (35, 39), # moms education\n"," \"nomomed\": (41, 41), # 1 if mom ed imputed\n"," \"weight\": (43, 54), # nls weight for 1976 cross-section\n"," \"momdad14\": (56, 56), # 1 if live with mom and dad age 14\n"," \"sinmom14\": (58, 58), # lived with single mom age 14\n"," \"step14\": (60, 60), # lived step parent age 14\n"," \"reg661\": (62, 62), # dummy for region=1 in 1966\n"," \"reg662\": (64, 64), # dummy for region=2 in 1966\n"," \"reg663\": (66, 66), # dummy for region=3 in 1966\n"," \"reg664\": (68, 68),\n"," \"reg665\": (70, 70),\n"," \"reg666\": (72, 72),\n"," \"reg667\": (74, 74),\n"," \"reg668\": (76, 76),\n"," \"reg669\": (78, 78), # dummy for region=9 in 1966\n"," \"south66\": (80, 80), # lived in south in 1966\n"," \"work76\": (82, 82), # worked in 1976\n"," \"work78\": (84, 84), # worked in 1978\n"," \"lwage76\": (86, 97), # log wage (outliers trimmed) 1976\n"," \"lwage78\": (99, 110), # log wage in 1978 outliers trimmed\n"," \"famed\": (112, 112), # mom-dad education class 1-9\n"," \"black\": (114, 114), # 1 if black\n"," \"smsa76r\": (116, 116), # in smsa in 1976\n"," \"smsa78r\": (118, 118), # in smsa in 1978\n"," \"reg76r\": (120, 120), # in south in 1976\n"," \"reg78r\": (122, 122), # in south in 1978\n"," \"reg80r\": (124, 124), # in south in 1980\n"," \"smsa66r\": (126, 126), # in smsa in 1966\n"," \"wage76\": (128, 132), # raw wage cents per hour 1976\n"," \"wage78\": (134, 138),\n"," \"wage80\": (140, 144),\n"," \"noint78\": (146, 146), # 1 if noninterview in 78\n"," \"noint80\": (148, 148),\n"," \"enroll76\": (150, 150), # 1 if enrolled in 76\n"," \"enroll78\": (152, 152),\n"," \"enroll80\": (154, 154),\n"," \"kww\": (156, 157), # the kww score\n"," \"iq\": (159, 161), # a normed iq score\n"," \"marsta76\": (163, 163), # mar status in 1976 1=married, sp. present\n"," \"marsta78\": (165, 165),\n"," \"marsta80\": (167, 167),\n"," \"libcrd14\": (169, 169), # 1 if lib card in home age 14\n","}\n","\n","with ZipFile(BytesIO(content)).open(\"nls.dat\") as file:\n"," df = pd.read_fwf(\n"," file,\n"," names=colspec.keys(),\n"," # pandas expects [from, to[ values, starting at 0\n"," colspecs=[(f - 1, t) for (f, t) in colspec.values()],\n"," na_values=\".\",\n"," )\n","\n","df = df[lambda x: x[\"lwage76\"].notna()].set_index(\"id\")\n","\n","# construct potential experience and its square\n","df[\"exp76\"] = df[\"age76\"] - df[\"ed76\"] - 6\n","df[\"exp762\"] = df[\"exp76\"] ** 2\n","df[\"age762\"] = df[\"age76\"] ** 2\n","\n","df[\"f1\"] = df[\"famed\"].eq(1).astype(\"float\") # mom and dad both > 12 yrs ed\n","df[\"f2\"] = df[\"famed\"].eq(2).astype(\"float\") # mom&dad >=12 and not both exactly 12\n","df[\"f3\"] = df[\"famed\"].eq(3).astype(\"float\") # mom=dad=12\n","df[\"f4\"] = df[\"famed\"].eq(4).astype(\"float\") # mom >=12 and dad missing\n","df[\"f5\"] = df[\"famed\"].eq(5).astype(\"float\") # father >=12 and mom not in f1-f4\n","df[\"f6\"] = df[\"famed\"].eq(6).astype(\"float\") # mom>=12 and dad nonmissing\n","df[\"f7\"] = df[\"famed\"].eq(7).astype(\"float\") # mom and dad both >=9\n","df[\"f8\"] = df[\"famed\"].eq(8).astype(\"float\") # mom and dad both nonmissing\n","\n","indicators = [\"black\", \"smsa66r\", \"smsa76r\", \"reg76r\"]\n","# exclude reg669, as sum(reg661, ..., reg669) = 1\n","indicators += [f\"reg66{i}\" for i in range(1, 9)]\n","\n","family = [\"daded\", \"momed\", \"nodaded\", \"nomomed\", \"famed\", \"momdad14\", \"sinmom14\"]\n","fs = [f\"f{i}\" for i in range(1, 9)] # exclude f9 as sum(f1, ..., f9) = 1\n","family += fs\n","\n","X = df[[\"ed76\", \"exp76\", \"exp762\"]] # endogenous\n","y = df[\"lwage76\"] # outcome\n","C = df[family + indicators] # included exogenous variables\n","Z = df[[\"nearc4a\", \"nearc4b\", \"nearc2\", \"age76\", \"age762\"]] # instruments"]},{"cell_type":"markdown","metadata":{},"source":["We then estimate the causal effect of education on wages using the Two-Stage Least-Squares (TSLS) and Limited Information Maximum Likelihood (LIML) estimators."]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["intercept 4.680112\n","ed76 0.144954\n","exp76 0.061604\n","exp762 -0.001196\n","Name: coefficients, dtype: float64\n","intercept 3.060812\n","ed76 0.172352\n","exp76 0.051571\n","exp762 -0.000713\n","Name: coefficients, dtype: float64\n"]}],"source":["from ivmodels.models import KClass\n","\n","tsls = KClass(kappa=\"tsls\").fit(Z=Z, X=X, y=y, C=C)\n","print(tsls.named_coefs_[:4])\n","\n","liml = KClass(kappa=\"liml\").fit(Z=Z, X=X, y=y, C=C)\n","print(liml.named_coefs_[:4])"]},{"cell_type":"markdown","metadata":{},"source":["To influence policy, inference for causal effect estimates is crucial.\n","Test statistics, p-values, and confidence sets can be computed using the tests in `ivmodels.tests`."]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["statistic=284.905, p_value=0\n"]}],"source":["\n","from ivmodels.tests import wald_test\n","\n","statistic, p_value = wald_test(Z=Z, X=X, C=C, y=y, beta=np.zeros(X.shape[1]))\n","print(f\"{statistic=:.3f}, {p_value=:.3g}\")"]},{"cell_type":"markdown","metadata":{},"source":["That is, the joint causal effect of education, experience, and experience squared is highly significant using the Wald test.\n","Subvector inference for individual components of the causal effect is easier to interpret."]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["statistic=10.529, p_value=0.00118\n","95.0% confidence set: [0.057, 0.233]\n","99.0% confidence set: [0.030, 0.260]\n","99.9% confidence set: [-0.002, 0.292]\n"]}],"source":["from ivmodels.confidence_set import ConfidenceSet\n","from ivmodels.tests import inverse_wald_test\n","\n","statistic, p_value = wald_test(Z=Z, X=df[[\"ed76\"]], W=df[[\"exp76\", \"exp762\"]], C=C, y=y, beta=np.zeros(1))\n","print(f\"{statistic=:.3f}, {p_value=:.3g}\")\n","\n","for alpha in [0.05, 0.01, 0.001]:\n"," quadric = inverse_wald_test(Z=Z, X=df[[\"ed76\"]], W=df[[\"exp76\", \"exp762\"]], C=C, y=y, alpha=alpha)\n"," confidence_set = ConfidenceSet.from_quadric(quadric)\n"," print(f\"{100 * (1 - alpha)}% confidence set: {confidence_set:.3f}\")"]},{"cell_type":"markdown","metadata":{},"source":["Test statistics, p-values, and confidence sets can be computed for multiple features using the `summary` method of `KClass`.\n","This also displays results of Anderson's (1951) test of reduced rank (a multivariate extension of the first-stage F-test) and the LIML (weak instrument robust) variant of the J-statistic."]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Summary based on the wald test.\n","\n"," estimate statistic p-value conf. set\n","ed76 0.145 10.53 0.001175 [0.0574, 0.2325]\n","exp76 0.0616 6.635 0.009997 [0.01473, 0.1085]\n","exp762 -0.001196 1.029 0.3104 [-0.003506, 0.001115]\n","\n","Endogenous model statistic: 284.9, p-value: <1e-16\n","(Multivariate) F-statistic: 15.47, p-value: 0.001454\n","J-statistic (LIML): 4.246, p-value: 0.1197\n"]}],"source":["features = [\"ed76\", \"exp76\", \"exp762\"]\n","print(tsls.summary(X=X, Z=Z, C=C, y=y, feature_names=features))"]},{"cell_type":"markdown","metadata":{},"source":["Anderson's test statistic of reduced rank is of the order 10, suggesting that instruments might be weak. The `ivmodels` package implements three weak-instrument robust tests: the (subvector) Anderson-Rubin test (Anderson and Rubin, 1949 and Guggenberger et al., 2012), the (subvector) conditional likelihood-ratio test (Moreira, 2003 and Kleibergen, 2021), and the (subvector) Lagrange multiplier test (Kleibergen, 2002 and Londschien et al., 2024). \n"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Summary based on the anderson-rubin test.\n","\n"," estimate statistic p-value conf. set\n","ed76 0.1724 5.027 0.001748 [0.08205, 0.3557]\n","exp76 0.05157 2.093 0.09876 [-0.02839, 0.09739]\n","exp762 -0.0007127 1.495 0.2138 [-0.002968, 0.003186]\n","\n","Endogenous model statistic: 66.33, p-value: <1e-16\n","(Multivariate) F-statistic: 15.47, p-value: 0.001454\n","J-statistic (LIML): 4.246, p-value: 0.1197\n","\n","Summary based on the conditional likelihood-ratio test.\n","\n"," estimate statistic p-value conf. set\n","ed76 0.1724 10.84 0.002516 [0.07273, 0.3987]\n","exp76 0.05157 2.034 0.1785 [-0.04522, 0.1018]\n","exp762 -0.0007127 0.2379 0.6444 [-0.003198, 0.004002]\n","\n","Endogenous model statistic: 327.4, p-value: <1e-16\n","(Multivariate) F-statistic: 15.47, p-value: 0.001454\n","J-statistic (LIML): 4.246, p-value: 0.1197\n","\n","Summary based on the lagrange multiplier test.\n","\n"," estimate statistic p-value conf. set\n","ed76 0.1724 5.739 0.01659 [-0.6013, -0.05795] U [0.06078, 0.472]\n","exp76 0.05157 1.648 0.1992 [-0.06781, 0.1025] U [0.116, 0.3515]\n","exp762 -0.0007127 0.2043 0.6513 [-0.01522, -0.003838] U [-0.003228, 0.005095]\n","\n","Endogenous model statistic: 324.9, p-value: <1e-16\n","(Multivariate) F-statistic: 15.47, p-value: 0.001454\n","J-statistic (LIML): 4.246, p-value: 0.1197\n","\n"]}],"source":["for test in [\"anderson-rubin\", \"conditional likelihood-ratio\", \"lagrange multiplier\"]:\n"," print(liml.summary(X=X, Z=Z, C=C, y=y, feature_names=features, test=test))\n"," print(\"\")"]},{"cell_type":"markdown","metadata":{},"source":["The causal effect of education on wages is still significant at level 0.01 for the weak-instrument robust tests."]}],"metadata":{"kernelspec":{"display_name":"ivmodels","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.4"}},"nbformat":4,"nbformat_minor":2}
diff --git a/docs/examples/tanaka2010risk.ipynb b/docs/examples/tanaka2010risk.ipynb
new file mode 100644
index 0000000..0958093
--- /dev/null
+++ b/docs/examples/tanaka2010risk.ipynb
@@ -0,0 +1,256 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Risk and Time Preferences: Linking Experimental and Household Survey Data from Vietnam\n",
+ "\n",
+ "Tomomi Tanaka, Colin F. Camerer, and Quang Nguyen (2010) investigate causes for risk preferences in Vietnam. Individuals from 25 households were interviewed for each of 289 villages.\n",
+ "The authors work with a subsample of in total 181 households from 9 villages.\n",
+ "From the interviews, they estimate measures of risk preferences, including the curvature of the utility function. We investigate how this is affected by income and gender.\n",
+ "\n",
+ "The data used by Tanaka et al. (2010) can be downloaded from https://www.openicpsr.org/openicpsr/project/112336, but this requires an institutional login. We assume this has been downloaded into the working directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "nmlrlincome 0.049098\n",
+ "mnincome 0.010253\n",
+ "gender -0.006189\n",
+ "Name: coefficients, dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "from zipfile import ZipFile\n",
+ "\n",
+ "from ivmodels import KClass\n",
+ "import pandas as pd\n",
+ "\n",
+ "with ZipFile(\"112336-V1.zip\").open(\"20060431_data/20060431_risk.dta\") as file:\n",
+ " df = pd.read_stata(file)\n",
+ "\n",
+ "y = df[\"vfctnc\"]\n",
+ "C = df[[\"chinese\", \"edu\", \"market\", \"south\", \"gender\", \"age\"]]\n",
+ "X = df[[\"nmlrlincome\", \"mnincome\"]]\n",
+ "Z = df[[\"rainfall\", \"headnowork\"]]\n",
+ "\n",
+ "tsls = KClass(kappa=\"tsls\").fit(Z=Z, X=X, y=y, C=C)\n",
+ "\n",
+ "features = [\"nmlrlincome\", \"mnincome\", \"gender\"]\n",
+ "print(tsls.named_coefs_[[\"nmlrlincome\", \"mnincome\", \"gender\"]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this application, the number of instruments (2) is equal to the number of endogenous regressors (2).\n",
+ "Thus, the LIML estimator is equal to the TSLS estimator.\n",
+ "Also, the Anderson-Rubin, conditional likelihood-ratio, and Lagrange multiplier tests are equivalent."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "liml.kappa_=1.0\n",
+ "nmlrlincome 0.049098\n",
+ "mnincome 0.010253\n",
+ "gender -0.006189\n",
+ "Name: coefficients, dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "liml = KClass(kappa=\"liml\").fit(Z=Z, X=X, y=y, C=C)\n",
+ "print(f\"{liml.kappa_=:}\")\n",
+ "print(liml.named_coefs_[features])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Summary based on the wald test.\n",
+ "\n",
+ " estimate statistic p-value conf. set\n",
+ "nmlrlincome 0.0491 0.1106 0.7394 [-0.2402, 0.3384]\n",
+ "mnincome 0.01025 3.411 0.06475 [-0.0006271, 0.02113]\n",
+ "gender -0.006189 0.01087 0.9169 [-0.1225, 0.1101]\n",
+ "\n",
+ "Endogenous model statistic: 3.525, p-value: 0.1716\n",
+ "(Multivariate) F-statistic: 6.07, p-value: 0.01375\n",
+ "J-statistic (LIML): 1.331e-12, p-value: nan\n",
+ "\n",
+ "Summary based on the anderson-rubin test.\n",
+ "\n",
+ " estimate statistic p-value conf. set\n",
+ "nmlrlincome 0.0491 0.1135 0.7362 [-0.3391, 0.6383]\n",
+ "mnincome 0.01025 3.507 0.06109 [-0.0005294, 0.02222]\n",
+ "gender -0.006189 0.01085 0.917 [-0.1212, 0.1187]\n",
+ "\n",
+ "Endogenous model statistic: 1.952, p-value: 0.1419\n",
+ "(Multivariate) F-statistic: 6.07, p-value: 0.01375\n",
+ "J-statistic (LIML): 1.331e-12, p-value: nan\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(liml.summary(Z=Z, X=X, y=y, C=C, test=\"wald\", feature_names=features))\n",
+ "print(\"\")\n",
+ "print(liml.summary(Z=Z, X=X, y=y, C=C, test=\"anderson-rubin\", feature_names=features))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The instruments are weak, but don't prohibit inference with weak-instrument-robust tests such as the Anderson-Rubin test.\n",
+ "The causal effect of mean village income (`mnincome`) is significant at alpha=0.1 for both tests.\n",
+ "\n",
+ "In Londschien and Bühlmann (2024), we suggest building interactions of instruments to improve identification.\n",
+ "We thus add the interaction of `rainfall` and `headnowork` to the instruments and repeat the analysis above.\n",
+ "As in the previous specification no individual causal effects were significant at the level 0.05, we present 80% confidence sets below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "nmlrlincome 0.046815\n",
+ "mnincome 0.010361\n",
+ "gender -0.006066\n",
+ "Name: coefficients, dtype: float64\n",
+ "\n",
+ "nmlrlincome 0.048929\n",
+ "mnincome 0.010396\n",
+ "gender -0.005918\n",
+ "Name: coefficients, dtype: float64\n",
+ "\n",
+ "Summary based on the wald test.\n",
+ "\n",
+ " estimate statistic p-value conf. set\n",
+ "nmlrlincome 0.04893 0.1058 0.7449 [-0.1438, 0.2417]\n",
+ "mnincome 0.0104 3.501 0.06135 [0.003275, 0.01752]\n",
+ "gender -0.005918 0.009927 0.9206 [-0.08204, 0.0702]\n",
+ "\n",
+ "Endogenous model statistic: 3.609, p-value: 0.1646\n",
+ "(Multivariate) F-statistic: 6.041, p-value: 0.04877\n",
+ "J-statistic (LIML): 0.2191, p-value: 0.6397\n",
+ "\n",
+ "Summary based on the anderson-rubin test.\n",
+ "\n",
+ " estimate statistic p-value conf. set\n",
+ "nmlrlincome 0.04893 0.1636 0.8491 [-0.2698, 0.4911]\n",
+ "mnincome 0.0104 1.894 0.1504 [0.0009274, 0.02083]\n",
+ "gender -0.005918 0.1145 0.8918 [-0.1078, 0.1025]\n",
+ "\n",
+ "Endogenous model statistic: 1.402, p-value: 0.2401\n",
+ "(Multivariate) F-statistic: 6.041, p-value: 0.04877\n",
+ "J-statistic (LIML): 0.2191, p-value: 0.6397\n",
+ "\n",
+ "Summary based on the conditional likelihood-ratio test.\n",
+ "\n",
+ " estimate statistic p-value conf. set\n",
+ "nmlrlincome 0.04893 0.1081 0.7667 [-0.1954, 0.3599]\n",
+ "mnincome 0.0104 3.57 0.1052 [0.002431, 0.01907]\n",
+ "gender -0.005918 nan 1 [-inf, inf]\n",
+ "\n",
+ "Endogenous model statistic: 3.987, p-value: 0.1743\n",
+ "(Multivariate) F-statistic: 6.041, p-value: 0.04877\n",
+ "J-statistic (LIML): 0.2191, p-value: 0.6397\n",
+ "\n",
+ "Summary based on the lagrange multiplier test.\n",
+ "\n",
+ " estimate statistic p-value conf. set\n",
+ "nmlrlincome 0.04893 0.1042 0.7469 [-10.39, -1.246] U [-0.1642, 0.3111]\n",
+ "mnincome 0.0104 3.542 0.05983 [0.003364, 0.01799]\n",
+ "gender -0.005918 0.009855 0.9209 [-0.8529, -0.2543] U [-0.08144, 0.07243]\n",
+ "\n",
+ "Endogenous model statistic: 3.975, p-value: 0.1371\n",
+ "(Multivariate) F-statistic: 6.041, p-value: 0.04877\n",
+ "J-statistic (LIML): 0.2191, p-value: 0.6397\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df[\"rainfallxheadnowork\"] = df[\"rainfall\"] * df[\"headnowork\"]\n",
+ "\n",
+ "Z = df[[\"rainfall\", \"headnowork\", \"rainfallxheadnowork\"]]\n",
+ "\n",
+ "tsls = KClass(kappa=\"tsls\").fit(Z=Z, X=X, y=y, C=C)\n",
+ "print(tsls.named_coefs_[features])\n",
+ "print(\"\")\n",
+ "\n",
+ "liml = KClass(kappa=\"liml\").fit(Z=Z, X=X, y=y, C=C)\n",
+ "print(liml.named_coefs_[features])\n",
+ "print(\"\")\n",
+ "\n",
+ "for test in [\n",
+ " \"wald\",\n",
+ " \"anderson-rubin\",\n",
+ " \"conditional likelihood-ratio\",\n",
+ " \"lagrange multiplier\"\n",
+ "]:\n",
+ " print(liml.summary(Z=Z, X=X, y=y, C=C, test=test, feature_names=features, alpha=0.2))\n",
+ " print(\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The additional instrument did not increase identification, with Anderson's (1951) test statistic of reduced rank decreasing from 6.07 to 6.04.\n",
+ "For the Wald test (that is not robust to weak instruments), the additional instrument decreased the p-value for the conditional causal effect of village mean income on risk preferences from 0.065 to 0.61.\n",
+ "For the (weak instrument robust) Anderson-Rubin and conditional likelihood-ratio tests, the p-values increased from 0.061 to 0.150 and 0.105 respectively.\n",
+ "For the (weak instrument robust) Lagrange multiplier test, the p-value slightly decreased from 0.061 to 0.060."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ivmodels",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/index.rst b/docs/index.rst
index f0ed04b..56d073f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,59 +1,49 @@
ivmodels
========
-Estimators
-=============
+ivmodels is a Python library for instrumental variable estimation.
+It implements
-.. autoclass:: ivmodels.KClass
- :members: fit
- :noindex:
+ - K-Class estimators, including the Limited Information Maximum Likelihood (LIML) estimator and the Two-Stage Least Squares (TSLS) estimator.
+ - Tests and confidence sets for the parameters of the model, including the Anderson-Rubin test, the the Lagrange multiplier test, the (conditional) likelihood-ratio test, and the Wald test.
+ - Auxiliary tests such as Anderson's (1951) test of reduced rank (a multivariate extension of the first-stage F-test) and the J-statistic (including its LIML variant).
-.. autoclass:: ivmodels.models.AnchorRegression
- :members: fit
- :noindex:
-.. autoclass:: ivmodels.models.PULSE
- :members: fit
- :noindex:
+Installation
+============
-.. autoclass:: ivmodels.models.SpaceIV
- :members: fit
- :noindex:
+You can install the package through conda
-Tests
-=====
+::
-.. automodule:: ivmodels.tests
- :members:
- :noindex:
+ conda install ivmodels -c conda-forge
-Summary
-=======
+or through pip
-.. autoclass:: ivmodels.summary.Summary
- :members:
- :noindex:
+::
-.. autoclass:: ivmodels.summary.CoefficientTable
- :members:
- :noindex:
+ pip install ivmodels
-ConfidenceSet
-=============
-.. autoclass:: ivmodels.confidence_set.ConfidenceSet
- :members:
- :noindex:
+.. toctree::
+ :maxdepth: 1
+ :caption: Examples
-Quadric
-=======
+ Card (1993) Using Geographic Variation in College Proximity to Estimate the Return to Schooling
+ Tanaka, Camerer, Nguyen (2010) Risk and time preferences: Linkining experimental and household survey data from vietnam
+ Angrist and Krueger (1991) Does Compulsory School Attendance Affect Schooling and Earnings?
+ Acemoglu, Johnson, and Robinson (2001) The Colonial Origins of Comparative Development: An Empirical Investigation
-.. autoclass:: ivmodels.quadric.Quadric
- :members:
- :noindex:
+.. toctree::
+ :maxdepth: 2
+ :caption: API reference
+ api
-Bibliography
-============
-.. bibliography::
+.. toctree::
+ :maxdepth: 1
+ :caption: Other
+
+ GitHub
+ changelog
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..f46c1fa
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,13 @@
+Installation
+============
+
+You can install the package through conda
+::
+
+ conda install ivmodels -c conda-forge
+
+or through pip
+
+::
+
+ pip install ivmodels
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 370eb08..d7e82e2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -17,4 +17,6 @@ dependencies:
- ipython # docs
- sphinx_rtd_theme # docs
- sphinxcontrib-bibtex # docs
- - pip # docs
\ No newline at end of file
+ - pip # docs
+ - nbclassic # docs
+ - nbsphinx # docs
\ No newline at end of file