From ab10e30ee03e3991f026d967048f3fedc140edaa Mon Sep 17 00:00:00 2001 From: Roni Kobrosly Date: Sun, 3 Jan 2021 17:32:01 -0500 Subject: [PATCH] Release/1.0.0 (#29) * started updates to readme * changed readme image * adjusted curves img in readme * adjusted image again * pylint clean-up * started dividing up the GPS class * more work on dividing GPS up * started updating TMLE core * made good progress on TMLE * got the code working * start revising unit tests. Done with tests of Core class * finished revising tests * whoops, needed more tests * started making big changes to docs * tested outside of project folder * revised documentation * final changes * fixed docs Co-authored-by: rkobrosly --- .pylintrc | 598 ++++++++++++++++++ README.md | 36 +- causal_curve/__init__.py | 6 +- causal_curve/core.py | 70 ++- causal_curve/gps_classifier.py | 110 ++++ causal_curve/{gps.py => gps_core.py} | 279 ++------- causal_curve/gps_regressor.py | 139 +++++ causal_curve/mediation.py | 52 +- causal_curve/tmle.py | 572 ------------------ causal_curve/tmle_core.py | 601 +++++++++++++++++++ causal_curve/tmle_regressor.py | 100 +++ causal_curve/utils.py | 22 - docs/GPS_Classifier.rst | 55 ++ docs/{GPS_example.rst => GPS_Regressor.rst} | 38 +- docs/Mediation_example.rst | 11 +- docs/TMLE_Regressor.rst | 51 ++ docs/TMLE_example.rst | 45 -- docs/causal_curve.rst | 59 +- docs/changelog.rst | 10 + docs/conf.py | 2 +- docs/full_example.rst | 2 +- docs/index.rst | 30 +- docs/intro.rst | 16 +- examples/NHANES_BLL_example.ipynb | 13 +- setup.py | 2 +- tests/conftest.py | 30 +- tests/integration/test_gps.py | 8 +- tests/integration/test_tmle.py | 5 +- tests/unit/test_core.py | 71 +++ tests/unit/test_general.py | 17 - tests/unit/test_gps_classifier.py | 33 + tests/unit/{test_gps.py => test_gps_core.py} | 92 +-- tests/unit/test_gps_regressor.py | 66 ++ tests/unit/test_tmle.py | 84 --- tests/unit/test_tmle_core.py | 112 ++++ tests/unit/test_tmle_regressor.py | 41 ++ 36 files changed, 2285 insertions(+), 1193 deletions(-) create mode 100644 .pylintrc create mode 100644 causal_curve/gps_classifier.py rename causal_curve/{gps.py => gps_core.py} (71%) create mode 100644 causal_curve/gps_regressor.py delete mode 100644 causal_curve/tmle.py create mode 100644 causal_curve/tmle_core.py create mode 100644 causal_curve/tmle_regressor.py delete mode 100644 causal_curve/utils.py create mode 100644 docs/GPS_Classifier.rst rename docs/{GPS_example.rst => GPS_Regressor.rst} (71%) create mode 100644 docs/TMLE_Regressor.rst delete mode 100644 docs/TMLE_example.rst create mode 100644 tests/unit/test_core.py delete mode 100644 tests/unit/test_general.py create mode 100644 tests/unit/test_gps_classifier.py rename tests/unit/{test_gps.py => test_gps_core.py} (60%) create mode 100644 tests/unit/test_gps_regressor.py delete mode 100644 tests/unit/test_tmle.py create mode 100644 tests/unit/test_tmle_core.py create mode 100644 tests/unit/test_tmle_regressor.py diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..2491978 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,598 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-whitelist= + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10.0 + +# Add files or directories to the blacklist. They should be base names, not +# paths. 
+ignore=CVS + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". 
+disable=too-many-locals, + super-init-not-called, + attribute-defined-outside-init, + too-many-instance-attributes, + invalid-name, + too-few-public-methods, + consider-using-dict-comprehension, + too-many-arguments, + no-name-in-module, + duplicate-code, + print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'error', 'warning', 'refactor', and 'convention' +# which contain the number of messages in each category, as well as 'statement' +# which is the total number of statements analyzed. This score is used by the +# global evaluation report (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. 
When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it work, +# install the python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. 
+missing-member-max-choices=1 + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. 
+#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +#variable-rgx= + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. 
+known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". +overgeneral-exceptions=BaseException, + Exception diff --git a/README.md b/README.md index 59205a8..868198c 100644 --- a/README.md +++ b/README.md @@ -4,30 +4,28 @@ [![codecov](https://codecov.io/gh/ronikobrosly/causal-curve/branch/master/graph/badge.svg)](https://codecov.io/gh/ronikobrosly/causal-curve) [![DOI](https://zenodo.org/badge/256017107.svg)](https://zenodo.org/badge/latestdoi/256017107) -Python tools to perform causal inference using observational data when the treatment of interest is continuous. +Python tools to perform causal inference when the treatment of interest is continuous.


-[The Antikythera mechanism](https://en.wikipedia.org/wiki/Antikythera_mechanism), an ancient analog computer, with lots of beautiful curves. - - ## Table of Contents - [Overview](#overview) - [Installation](#installation) - [Documentation](#documentation) -- [In Progress](#in-progress) - [Contributing](#contributing) - [Citation](#citation) - [References](#references) ## Overview +(**Version 1.0.0 released in January 2021!**) + There are many implemented methods to perform causal inference when your intervention of interest is binary, but few methods exist to handle continuous treatments. @@ -61,15 +59,6 @@ pip install . [Documentation is available at readthedocs.org](https://causal-curve.readthedocs.io/en/latest/) -## In Progress - -(12/26/2020) Currently working towards version 1.0.0! - -This major update will include: -* An overhaul of the TMLE tool to make it more accurate and user-friendly. -* Separate model classes for predicting binary or continuous outcomes (much like sklearn's approach) -* Better TMLE example documentation - ## Contributing Your help is absolutely welcome! Please do reach out or create a feature branch! @@ -83,19 +72,22 @@ Kobrosly, R. W., (2020). causal-curve: A Python Causal Inference Package to Esti Galagate, D. Causal Inference with a Continuous Treatment and Outcome: Alternative Estimators for Parametric Dose-Response function with Applications. PhD thesis, 2016. -Moodie E and Stephens DA. Estimation of dose–response functions for -longitudinal data using the generalised propensity score. In: Statistical Methods in -Medical Research 21(2), 2010, pp.149–166. - Hirano K and Imbens GW. The propensity score with continuous treatments. In: Gelman A and Meng XL (eds) Applied bayesian modeling and causal inference from incomplete-data perspectives. Oxford, UK: Wiley, 2004, pp.73–84. +Imai K, Keele L, Tingley D. A General Approach to Causal Mediation Analysis. Psychological +Methods. 15(4), 2010, pp.309–334. + +Kennedy EH, Ma Z, McHugh MD, Small DS. Nonparametric methods for doubly robust estimation +of continuous treatment effects. Journal of the Royal Statistical Society, Series B. 79(4), 2017, pp.1229-1245. + +Moodie E and Stephens DA. Estimation of dose–response functions for +longitudinal data using the generalised propensity score. In: Statistical Methods in +Medical Research 21(2), 2010, pp.149–166. + van der Laan MJ and Gruber S. Collaborative double robust penalized targeted maximum likelihood estimation. In: The International Journal of Biostatistics 6(1), 2010. van der Laan MJ and Rubin D. Targeted maximum likelihood learning. In: ​U.C. Berkeley Division of Biostatistics Working Paper Series, 2006. - -Imai K., Keele L., Tingley D. A General Approach to Causal Mediation Analysis. Psychological -Methods. 15(4), 2010, pp.309–334. 
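
A minimal usage sketch of the renamed 1.0.0 regressor API, assembled only from the class and
method names introduced in this patch (GPS_Regressor, fit, calculate_CDRC, point_estimate);
the dataframe `df` and its column names are hypothetical and mirror the example already used
in the package's docstrings, not part of this patch itself:

>>> import numpy as np
>>> from causal_curve import GPS_Regressor
>>> gps = GPS_Regressor()
>>> gps.fit(T=df["Treatment"], X=df[["X_1", "X_2"]], y=df["Outcome"])
>>> gps_results = gps.calculate_CDRC(ci=0.95)   # dataframe of CDRC point estimates with CI bounds
>>> gps.point_estimate(np.array([0.5]))         # CDRC point estimate at a single treatment value
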
diff --git a/causal_curve/__init__.py b/causal_curve/__init__.py index 7940c87..f031567 100644 --- a/causal_curve/__init__.py +++ b/causal_curve/__init__.py @@ -4,8 +4,10 @@ from statsmodels.genmod.generalized_linear_model import DomainWarning -from causal_curve.gps import GPS -from causal_curve.tmle import TMLE +from causal_curve.gps_classifier import GPS_Classifier +from causal_curve.gps_regressor import GPS_Regressor + +from causal_curve.tmle_regressor import TMLE_Regressor from causal_curve.mediation import Mediation diff --git a/causal_curve/core.py b/causal_curve/core.py index 80ba09a..7efc551 100644 --- a/causal_curve/core.py +++ b/causal_curve/core.py @@ -1,14 +1,16 @@ """ Core classes (with basic methods) that will be invoked when other, model classes are defined """ -import pkg_resources + +import numpy as np +from scipy.stats import norm class Core: """Base class for causal_curve module""" def __init__(self): - pass + __version__ = "1.0.0" def get_params(self): """Returns a dict of all of the object's user-facing parameters @@ -26,4 +28,66 @@ def get_params(self): [(k, v) for k, v in list(attrs.items()) if (k[0] != "_") and (k[-1] != "_")] ) - __version__ = "0.5.2" + def if_verbose_print(self, string): + """Prints the input statement if verbose is set to True + + Parameters + ---------- + string: str, some string to be printed + + Returns + ---------- + None + + """ + if self.verbose: + print(string) + + @staticmethod + def rand_seed_wrapper(random_seed=None): + """Sets the random seed using numpy + + Parameters + ---------- + random_seed: int, random seed number + + Returns + ---------- + None + """ + if random_seed is None: + pass + else: + np.random.seed(random_seed) + + @staticmethod + def calculate_z_score(ci): + """Calculates the critical z-score for a desired two-sided, + confidence interval width. + + Parameters + ---------- + ci: float, the confidence interval width (e.g. 0.95) + + Returns + ------- + Float, critical z-score value + """ + return norm.ppf((1 + ci) / 2) + + @staticmethod + def clip_negatives(number): + """Helper function to clip negative numbers to zero + + Parameters + ---------- + number: int or float, any number that needs a floor at zero + + Returns + ------- + Int or float of modified value + + """ + if number < 0: + return 0 + return number diff --git a/causal_curve/gps_classifier.py b/causal_curve/gps_classifier.py new file mode 100644 index 0000000..200e0c5 --- /dev/null +++ b/causal_curve/gps_classifier.py @@ -0,0 +1,110 @@ +""" +Defines the Generalized Prospensity Score (GPS) classifier model class +""" + +import numpy as np +from scipy.special import logit + +from causal_curve.gps_core import GPS_Core + + +class GPS_Classifier(GPS_Core): + """ + A GPS tool that handles binary outcomes. Inherits the GPS_core + base class. See that base class code its docstring for more details. + """ + + def __init__( + self, + gps_family=None, + treatment_grid_num=100, + lower_grid_constraint=0.01, + upper_grid_constraint=0.99, + spline_order=3, + n_splines=30, + lambda_=0.5, + max_iter=100, + random_seed=None, + verbose=False, + ): + GPS_Core.__init__( + self, + gps_family=None, + treatment_grid_num=100, + lower_grid_constraint=0.01, + upper_grid_constraint=0.99, + spline_order=3, + n_splines=30, + lambda_=0.5, + max_iter=100, + random_seed=None, + verbose=False, + ) + + def _cdrc_predictions_binary(self, ci): + """Returns the predictions of CDRC for each value of the treatment grid. 
Essentially, + we're making predictions using the original treatment and gps_at_grid. + To be used when the outcome of interest is binary. + """ + # To keep track of cdrc predictions, we create an empty 2d array of shape + # (n_samples, treatment_grid_num, 2). The last dimension is of length 2 because + # we are going to keep track of the point estimate (log-odds) of the prediction, as well as + # the standard error of the prediction interval (again, this is for the log odds) + cdrc_preds = np.zeros((len(self.T), self.treatment_grid_num, 2), dtype=float) + + # Loop through each of the grid values, predict point estimate and get prediction interval + for i in range(0, self.treatment_grid_num): + + temp_T = np.repeat(self.grid_values[i], repeats=len(self.T)) + temp_gps = self.gps_at_grid[:, i] + + temp_cdrc_preds = logit( + self.gam_results.predict_proba(np.column_stack((temp_T, temp_gps))) + ) + + temp_cdrc_interval = logit( + self.gam_results.confidence_intervals( + np.column_stack((temp_T, temp_gps)), width=ci + ) + ) + + standard_error = ( + temp_cdrc_interval[:, 1] - temp_cdrc_preds + ) / self.calculate_z_score(ci) + + cdrc_preds[:, i, 0] = temp_cdrc_preds + cdrc_preds[:, i, 1] = standard_error + + return np.round(cdrc_preds, 3) + + def estimate_log_odds(self, T): + """Calculates the estimated log odds of the highest integer class. Can + only be used when the outcome is binary. Can be estimate for a single + data point or can be run in batch for many observations. Extrapolation will produce + untrustworthy results; the provided treatment should be within + the range of the training data. + + Parameters + ---------- + T: Numpy array, shape (n_samples,) + A continuous treatment variable. + + Returns + ---------- + array: Numpy array + Contains a set of log odds + """ + if self.outcome_type != "binary": + raise TypeError("Your outcome must be binary to use this function!") + + return np.apply_along_axis(self._create_log_odds, 0, T.reshape(1, -1)) + + def _create_log_odds(self, T): + """Take a single treatment value and produces the log odds of the higher + integer class, in the case of a binary outcome. 
+ """ + return logit( + self.gam_results.predict_proba( + np.array([T, self.gps_function(T).mean()]).reshape(1, -1) + ) + ) diff --git a/causal_curve/gps.py b/causal_curve/gps_core.py similarity index 71% rename from causal_curve/gps.py rename to causal_curve/gps_core.py index 41767f2..ab7b36a 100644 --- a/causal_curve/gps.py +++ b/causal_curve/gps_core.py @@ -1,6 +1,5 @@ -# pylint: disable=bad-continuation """ -Defines the Generalized Prospensity Score (GPS) model class +Defines the Generalized Prospensity Score (GPS) Core model class """ import contextlib @@ -18,10 +17,9 @@ from statsmodels.tools.tools import add_constant from causal_curve.core import Core -from causal_curve.utils import rand_seed_wrapper -class GPS(Core): +class GPS_Core(Core): """ In a multi-stage approach, this computes the generalized propensity score (GPS) function, and uses this in a generalized additive model (GAM) to correct treatment prediction of @@ -192,10 +190,10 @@ def __init__( # Validate the params self._validate_init_params() - rand_seed_wrapper() + self.rand_seed_wrapper() + self.if_verbose_print("Using the following params for GPS model:") if self.verbose: - print("Using the following params for GPS model:") pprint(self.get_params(), indent=4) def _validate_init_params(self): @@ -233,7 +231,7 @@ def _validate_init_params(self): if ( isinstance(self.treatment_grid_num, int) ) and self.treatment_grid_num >= 1000: - raise ValueError(f"treatment_grid_num parameter is too high!") + raise ValueError("treatment_grid_num parameter is too high!") # Checks for lower_grid_constraint if not isinstance(self.lower_grid_constraint, float): @@ -300,7 +298,7 @@ def _validate_init_params(self): ) if (isinstance(self.spline_order, int)) and self.spline_order >= 30: - raise ValueError(f"spline_order parameter is too high!") + raise ValueError("spline_order parameter is too high!") # Checks for n_splines if not isinstance(self.n_splines, int): @@ -314,7 +312,7 @@ def _validate_init_params(self): ) if (isinstance(self.n_splines, int)) and self.n_splines >= 100: - raise ValueError(f"n_splines parameter is too high!") + raise ValueError("n_splines parameter is too high!") # Checks for lambda_ if not isinstance(self.lambda_, (int, float)): @@ -328,7 +326,7 @@ def _validate_init_params(self): ) if (isinstance(self.lambda_, (int, float))) and self.lambda_ >= 1000: - raise ValueError(f"lambda_ parameter is too high!") + raise ValueError("lambda_ parameter is too high!") # Checks for max_iter if not isinstance(self.max_iter, int): @@ -338,11 +336,11 @@ def _validate_init_params(self): if (isinstance(self.max_iter, int)) and self.max_iter <= 10: raise ValueError( - f"max_iter parameter is too low! Results won't be reliable!" + "max_iter parameter is too low! Results won't be reliable!" 
) if (isinstance(self.max_iter, int)) and self.max_iter >= 1e6: - raise ValueError(f"max_iter parameter is unnecessarily high!") + raise ValueError("max_iter parameter is unnecessarily high!") # Checks for random_seed if not isinstance(self.random_seed, (int, type(None))): @@ -351,7 +349,7 @@ def _validate_init_params(self): ) if (isinstance(self.random_seed, int)) and self.random_seed < 0: - raise ValueError(f"random_seed parameter must be > 0") + raise ValueError("random_seed parameter must be > 0") # Checks for verbose if not isinstance(self.verbose, bool): @@ -363,33 +361,33 @@ def _validate_fit_data(self): """Verifies that T, X, and y are formatted the right way""" # Checks for T column if not is_float_dtype(self.T): - raise TypeError(f"Treatment data must be of type float") + raise TypeError("Treatment data must be of type float") # Make sure all X columns are float or int if isinstance(self.X, pd.Series): if not is_numeric_dtype(self.X): raise TypeError( - f"All covariate (X) columns must be int or float type (i.e. must be numeric)" + "All covariate (X) columns must be int or float type (i.e. must be numeric)" ) elif isinstance(self.X, pd.DataFrame): for column in self.X: if not is_numeric_dtype(self.X[column]): raise TypeError( - f"All covariate (X) columns must be int or float type " - f"(i.e. must be numeric)" + "All covariate (X) columns must be int or float type " + "(i.e. must be numeric)" ) # Checks for Y column if not (is_float_dtype(self.y) or is_integer_dtype(self.y)): - raise TypeError(f"Outcome data must be of type float or integer") + raise TypeError("Outcome data must be of type float or integer") if is_integer_dtype(self.y) and ( not np.array_equal(np.sort(self.y.unique()), np.array([0, 1])) ): raise TypeError( - f"If your outcome data is of type integer (binary outcome)," - f"it should only contain 1's and 0's." + "If your outcome data is of type integer (binary outcome)," + "it should only contain 1's and 0's." ) def _grid_values(self): @@ -406,7 +404,8 @@ def _grid_values(self): def fit(self, T, X, y): """Fits the GPS causal dose-response model. For now, this only accepts pandas columns. While the treatment variable must be continuous (or ordinal with many levels), the - outcome variable may be continuous or binary. + outcome variable may be continuous or binary. You *must* provide + at least one covariate column. Parameters ---------- @@ -435,8 +434,9 @@ def fit(self, T, X, y): elif is_integer_dtype(self.y): self.outcome_type = "binary" - if self.verbose: - print(f"Determined the outcome variable is of type {self.outcome_type}...") + self.if_verbose_print( + f"Determined the outcome variable is of type {self.outcome_type}..." + ) # Validate this input data self._validate_fit_data() @@ -448,14 +448,12 @@ def fit(self, T, X, y): self._determine_gps_function() # Estimate the GPS - if self.verbose: - print(f"Saving GPS values...") + self.if_verbose_print("Saving GPS values...") self.gps = self.gps_function(self.T) # Create GAM that predicts outcome from the treatment and GPS - if self.verbose: - print(f"Fitting GAM using treatment and GPS...") + self.if_verbose_print("Fitting GAM using treatment and GPS...") # Save model results self.gam_results = self._fit_gam() @@ -466,17 +464,19 @@ def fit(self, T, X, y): self._gam_summary_str = f.getvalue() - if self.verbose: - print(f"Calculating many CDRC estimates for each treatment grid value...") + self.if_verbose_print( + "Calculating many CDRC estimates for each treatment grid value..." 
+ ) # Loop over all grid values (`treatment_grid_num` in total) # and give GPS loading for each observation in the dataset self.gps_at_grid = self._gps_values_at_grid() def calculate_CDRC(self, ci=0.95): - """Using the results of the fitted model, this generates point estimates for the CDRC - at each of the values of the treatment grid. Connecting these estimates will produce - the overall estimated CDRC. Confidence interval is returned as well. + """Using the results of the fitted model, this generates a dataframe of + point estimates for the CDRC at each of the values of the + treatment grid. Connecting these estimates will produce the overall + estimated CDRC. Confidence interval is returned as well. Parameters ---------- @@ -495,11 +495,11 @@ def calculate_CDRC(self, ci=0.95): """ self._validate_calculate_CDRC_params(ci) - if self.verbose: - print( - """Generating predictions for each value of treatment grid, - and averaging to get the CDRC...""" - ) + self.if_verbose_print( + """ + Generating predictions for each value of treatment grid, + and averaging to get the CDRC...""" + ) # Create CDRC predictions from trained GAM # If working with a continuous outcome variable, use this path @@ -547,11 +547,11 @@ def calculate_CDRC(self, ci=0.95): temp_lower_bound = np.exp( temp_log_odds_estimate - - (self._calculate_z_score(ci) * self._cdrc_preds[:, i, 1].mean()) + - (self.calculate_z_score(ci) * self._cdrc_preds[:, i, 1].mean()) ) temp_upper_bound = np.exp( temp_log_odds_estimate - + (self._calculate_z_score(ci) * self._cdrc_preds[:, i, 1].mean()) + + (self.calculate_z_score(ci) * self._cdrc_preds[:, i, 1].mean()) ) results.append( [ @@ -568,7 +568,8 @@ def calculate_CDRC(self, ci=0.95): results, columns=["Treatment", outcome_name, "Lower_CI", "Upper_CI"] ).round(3) - def _validate_calculate_CDRC_params(self, ci): + @staticmethod + def _validate_calculate_CDRC_params(ci): """Validates the parameters given to `calculate_CDRC`""" if not isinstance(ci, float): @@ -579,76 +580,6 @@ def _validate_calculate_CDRC_params(self, ci): if isinstance(ci, float) and ((ci <= 0) or (ci >= 1.0)): raise ValueError("`ci` parameter should be between (0, 1)") - def _calculate_z_score(self, ci): - """Calculates the critical z-score for a desired two-sided, confidence interval width.""" - return norm.ppf((1 + ci) / 2) - - def _cdrc_predictions_continuous(self, ci): - """Returns the predictions of CDRC for each value of the treatment grid. Essentially, - we're making predictions using the original treatment and gps_at_grid. - To be used when the outcome of interest is continuous. - """ - - # To keep track of cdrc predictions, we create an empty 3d array of shape - # (n_samples, treatment_grid_num, 3). 
The last dimension is of length 3 because - # we are going to keep track of the point estimate of the prediction, as well as - # the lower and upper bounds of the prediction interval - cdrc_preds = np.zeros((len(self.T), self.treatment_grid_num, 3), dtype=float) - - # Loop through each of the grid values, predict point estimate and get prediction interval - for i in range(0, self.treatment_grid_num): - temp_T = np.repeat(self.grid_values[i], repeats=len(self.T)) - temp_gps = self.gps_at_grid[:, i] - temp_cdrc_preds = self.gam_results.predict( - np.column_stack((temp_T, temp_gps)) - ) - temp_cdrc_interval = self.gam_results.confidence_intervals( - np.column_stack((temp_T, temp_gps)), width=ci - ) - temp_cdrc_lower_bound = temp_cdrc_interval[:, 0] - temp_cdrc_upper_bound = temp_cdrc_interval[:, 1] - cdrc_preds[:, i, 0] = temp_cdrc_preds - cdrc_preds[:, i, 1] = temp_cdrc_lower_bound - cdrc_preds[:, i, 2] = temp_cdrc_upper_bound - - return np.round(cdrc_preds, 3) - - def _cdrc_predictions_binary(self, ci): - """Returns the predictions of CDRC for each value of the treatment grid. Essentially, - we're making predictions using the original treatment and gps_at_grid. - To be used when the outcome of interest is binary. - """ - # To keep track of cdrc predictions, we create an empty 2d array of shape - # (n_samples, treatment_grid_num, 2). The last dimension is of length 2 because - # we are going to keep track of the point estimate (log-odds) of the prediction, as well as - # the standard error of the prediction interval (again, this is for the log odds) - cdrc_preds = np.zeros((len(self.T), self.treatment_grid_num, 2), dtype=float) - - # Loop through each of the grid values, predict point estimate and get prediction interval - for i in range(0, self.treatment_grid_num): - - temp_T = np.repeat(self.grid_values[i], repeats=len(self.T)) - temp_gps = self.gps_at_grid[:, i] - - temp_cdrc_preds = logit( - self.gam_results.predict_proba(np.column_stack((temp_T, temp_gps))) - ) - - temp_cdrc_interval = logit( - self.gam_results.confidence_intervals( - np.column_stack((temp_T, temp_gps)), width=ci - ) - ) - - standard_error = ( - temp_cdrc_interval[:, 1] - temp_cdrc_preds - ) / self._calculate_z_score(ci) - - cdrc_preds[:, i, 0] = temp_cdrc_preds - cdrc_preds[:, i, 1] = standard_error - - return np.round(cdrc_preds, 3) - def _gps_values_at_grid(self): """Returns an array where we get the GPS-derived values for each element of the treatment grid. Resulting array will be of shape (n_samples, treatment_grid_num) @@ -699,15 +630,14 @@ def _determine_gps_function(self): if any(self.T <= 0): self.best_gps_family = "normal" self.gps_function, self.gps_deviance = self._create_normal_gps_function() - if self.verbose: - print( - f"Must fit `normal` GLM family to model treatment since treatment takes on zero or negative values..." - ) + self.if_verbose_print( + """Must fit `normal` GLM family to model treatment since + treatment takes on zero or negative values...""" + ) # If treatment has no negative values and user provides in put, use that. 
elif (all(self.T > 0)) & (not isinstance(self.gps_family, type(None))): - if self.verbose: - print(f"Fitting GPS model of family '{self.gps_family}'...") + self.if_verbose_print(f"Fitting GPS model of family '{self.gps_family}'...") if self.gps_family == "normal": self.best_gps_family = "normal" @@ -725,10 +655,12 @@ def _determine_gps_function(self): self.best_gps_family = "gamma" self.gps_function, self.gps_deviance = self._create_gamma_gps_function() - # If no zero or negative treatment values and user didn't provide input, figure out best-fitting family + # If no zero or negative treatment values and user didn't provide + # input, figure out best-fitting family elif (all(self.T > 0)) & (isinstance(self.gps_family, type(None))): - if self.verbose: - print(f"Fitting several GPS models and picking the best fitting one...") + self.if_verbose_print( + "Fitting several GPS models and" " picking the best fitting one..." + ) ( self.best_gps_family, @@ -736,11 +668,10 @@ def _determine_gps_function(self): self.gps_deviance, ) = self._find_best_gps_model() - if self.verbose: - print( - f"Best fitting model was {self.best_gps_family}, which " - f"produced a deviance of {self.gps_deviance}" - ) + self.if_verbose_print( + f"""Best fitting model was {self.best_gps_family}, which + produced a deviance of {self.gps_deviance}""" + ) def _create_normal_gps_function(self): """Models the GPS using a GLM of the Gaussian family""" @@ -808,103 +739,3 @@ def _find_best_gps_model(self): models_to_try_dict[best_model][0], models_to_try_dict[best_model][1], ) - - def predict(self, T): - """Calculates point estimate within the CDRC given treatment values. - Can only be used when outcome is continuous. Can be estimate for a single - data point or can be run in batch for many observations. Extrapolation will produce - untrustworthy results; the provided treatment should be within - the range of the training data. - - Parameters - ---------- - T: Numpy array, shape (n_samples,) - A continuous treatment variable. - - Returns - ---------- - array: Numpy array - Contains a set of CDRC point estimates - - """ - if self.outcome_type != "continuous": - raise TypeError("Your outcome must be continuous to use this function!") - - return np.apply_along_axis(self._create_predict, 0, T.reshape(1, -1)) - - def _create_predict(self, T): - """Takes a single treatment value and produces a single point estimate - in the case of a continuous outcome. - """ - return self.gam_results.predict( - np.array([T, self.gps_function(T).mean()]).reshape(1, -1) - ) - - def predict_interval(self, T, ci=0.95): - """Calculates the prediction confidence interval associated with a point estimate - within the CDRC given treatment values. Can only be used - when outcome is continuous. Can be estimate for a single data point - or can be run in batch for many observations. Extrapolation will produce - untrustworthy results; the provided treatment should be within - the range of the training data. - - Parameters - ---------- - T: Numpy array, shape (n_samples,) - A continuous treatment variable. - ci: float (default = 0.95) - The desired confidence interval to produce. Default value is 0.95, corresponding - to 95% confidence intervals. bounded (0, 1.0). 
- - Returns - ---------- - array: Numpy array - Contains a set of CDRC prediction intervals ([lower bound, higher bound]) - - """ - if self.outcome_type != "continuous": - raise TypeError("Your outcome must be continuous to use this function!") - - return np.apply_along_axis( - self._create_predict_interval, 0, T.reshape(1, -1) - ).T.reshape(-1, 2) - - def _create_predict_interval(self, T): - """Takes a single treatment value and produces confidence interval - associated with a point estimate in the case of a continuous outcome. - """ - return self.gam_results.prediction_intervals( - np.array([T, self.gps_function(T).mean()]).reshape(1, -1) - ) - - def predict_log_odds(self, T): - """Calculates the predicted log odds of the highest integer class. Can - only be used when the outcome is binary. Can be estimate for a single - data point or can be run in batch for many observations. Extrapolation will produce - untrustworthy results; the provided treatment should be within - the range of the training data. - - Parameters - ---------- - T: Numpy array, shape (n_samples,) - A continuous treatment variable. - - Returns - ---------- - array: Numpy array - Contains a set of log odds - """ - if self.outcome_type != "binary": - raise TypeError("Your outcome must be binary to use this function!") - - return np.apply_along_axis(self._create_log_odds, 0, T.reshape(1, -1)) - - def _create_log_odds(self, T): - """Take a single treatment value and produces the log odds of the higher - integer class, in the case of a binary outcome. - """ - return logit( - self.gam_results.predict_proba( - np.array([T, self.gps_function(T).mean()]).reshape(1, -1) - ) - ) diff --git a/causal_curve/gps_regressor.py b/causal_curve/gps_regressor.py new file mode 100644 index 0000000..adc32df --- /dev/null +++ b/causal_curve/gps_regressor.py @@ -0,0 +1,139 @@ +""" +Defines the Generalized Prospensity Score (GPS) regressor model class +""" + +import numpy as np + +from causal_curve.gps_core import GPS_Core + + +class GPS_Regressor(GPS_Core): + """ + A GPS tool that handles continuous outcomes. Inherits the GPS_core + base class. See that base class code its docstring for more details. + """ + + def __init__( + self, + gps_family=None, + treatment_grid_num=100, + lower_grid_constraint=0.01, + upper_grid_constraint=0.99, + spline_order=3, + n_splines=30, + lambda_=0.5, + max_iter=100, + random_seed=None, + verbose=False, + ): + GPS_Core.__init__( + self, + gps_family=None, + treatment_grid_num=100, + lower_grid_constraint=0.01, + upper_grid_constraint=0.99, + spline_order=3, + n_splines=30, + lambda_=0.5, + max_iter=100, + random_seed=None, + verbose=False, + ) + + def _cdrc_predictions_continuous(self, ci): + """Returns the predictions of CDRC for each value of the treatment grid. Essentially, + we're making predictions using the original treatment and gps_at_grid. + To be used when the outcome of interest is continuous. + """ + + # To keep track of cdrc predictions, we create an empty 3d array of shape + # (n_samples, treatment_grid_num, 3). 
The last dimension is of length 3 because + # we are going to keep track of the point estimate of the prediction, as well as + # the lower and upper bounds of the prediction interval + cdrc_preds = np.zeros((len(self.T), self.treatment_grid_num, 3), dtype=float) + + # Loop through each of the grid values, predict point estimate and get prediction interval + for i in range(0, self.treatment_grid_num): + temp_T = np.repeat(self.grid_values[i], repeats=len(self.T)) + temp_gps = self.gps_at_grid[:, i] + temp_cdrc_preds = self.gam_results.predict( + np.column_stack((temp_T, temp_gps)) + ) + temp_cdrc_interval = self.gam_results.confidence_intervals( + np.column_stack((temp_T, temp_gps)), width=ci + ) + temp_cdrc_lower_bound = temp_cdrc_interval[:, 0] + temp_cdrc_upper_bound = temp_cdrc_interval[:, 1] + cdrc_preds[:, i, 0] = temp_cdrc_preds + cdrc_preds[:, i, 1] = temp_cdrc_lower_bound + cdrc_preds[:, i, 2] = temp_cdrc_upper_bound + + return np.round(cdrc_preds, 3) + + def point_estimate(self, T): + """Calculates point estimate within the CDRC given treatment values. + Can only be used when outcome is continuous. Can be estimate for a single + data point or can be run in batch for many observations. Extrapolation will produce + untrustworthy results; the provided treatment should be within + the range of the training data. + + Parameters + ---------- + T: Numpy array, shape (n_samples,) + A continuous treatment variable. + + Returns + ---------- + array: Numpy array + Contains a set of CDRC point estimates + + """ + if self.outcome_type != "continuous": + raise TypeError("Your outcome must be continuous to use this function!") + + return np.apply_along_axis(self._create_point_estimate, 0, T.reshape(1, -1)) + + def _create_point_estimate(self, T): + """Takes a single treatment value and produces a single point estimate + in the case of a continuous outcome. + """ + return self.gam_results.predict( + np.array([T, self.gps_function(T).mean()]).reshape(1, -1) + ) + + def point_estimate_interval(self, T, ci=0.95): + """Calculates the prediction confidence interval associated with a point estimate + within the CDRC given treatment values. Can only be used + when outcome is continuous. Can be estimate for a single data point + or can be run in batch for many observations. Extrapolation will produce + untrustworthy results; the provided treatment should be within + the range of the training data. + + Parameters + ---------- + T: Numpy array, shape (n_samples,) + A continuous treatment variable. + ci: float (default = 0.95) + The desired confidence interval to produce. Default value is 0.95, corresponding + to 95% confidence intervals. bounded (0, 1.0). + + Returns + ---------- + array: Numpy array + Contains a set of CDRC prediction intervals ([lower bound, higher bound]) + + """ + if self.outcome_type != "continuous": + raise TypeError("Your outcome must be continuous to use this function!") + + return np.apply_along_axis( + self._create_point_estimate_interval, 0, T.reshape(1, -1), width=ci + ).T.reshape(-1, 2) + + def _create_point_estimate_interval(self, T, width): + """Takes a single treatment value and produces confidence interval + associated with a point estimate in the case of a continuous outcome. 
+ """ + return self.gam_results.prediction_intervals( + np.array([T, self.gps_function(T).mean()]).reshape(1, -1), width=width + ) diff --git a/causal_curve/mediation.py b/causal_curve/mediation.py index d571aa5..bc44b7f 100644 --- a/causal_curve/mediation.py +++ b/causal_curve/mediation.py @@ -1,4 +1,3 @@ -# pylint: disable=bad-continuation """ Defines the Mediation test class """ @@ -11,7 +10,6 @@ from pygam import LinearGAM, s from causal_curve.core import Core -from causal_curve.utils import rand_seed_wrapper class Mediation(Core): @@ -135,7 +133,7 @@ def __init__( # Validate the params self._validate_init_params() - rand_seed_wrapper() + self.rand_seed_wrapper() if self.verbose: print("Using the following params for the mediation analysis:") @@ -160,7 +158,7 @@ def _validate_init_params(self): ) if (isinstance(self.treatment_grid_num, int)) and self.treatment_grid_num > 100: - raise ValueError(f"treatment_grid_num parameter is too high!") + raise ValueError("treatment_grid_num parameter is too high!") # Checks for lower_grid_constraint if not isinstance(self.lower_grid_constraint, float): @@ -269,7 +267,7 @@ def _validate_init_params(self): ) if (isinstance(self.spline_order, int)) and self.spline_order >= 30: - raise ValueError(f"spline_order parameter is too high!") + raise ValueError("spline_order parameter is too high!") # Checks for n_splines if not isinstance(self.n_splines, int): @@ -283,7 +281,7 @@ def _validate_init_params(self): ) if (isinstance(self.n_splines, int)) and self.n_splines >= 100: - raise ValueError(f"n_splines parameter is too high!") + raise ValueError("n_splines parameter is too high!") # Checks for lambda_ if not isinstance(self.lambda_, (int, float)): @@ -297,7 +295,7 @@ def _validate_init_params(self): ) if (isinstance(self.lambda_, (int, float))) and self.lambda_ >= 1000: - raise ValueError(f"lambda_ parameter is too high!") + raise ValueError("lambda_ parameter is too high!") # Checks for max_iter if not isinstance(self.max_iter, int): @@ -307,11 +305,11 @@ def _validate_init_params(self): if (isinstance(self.max_iter, int)) and self.max_iter <= 10: raise ValueError( - f"max_iter parameter is too low! Results won't be reliable!" + "max_iter parameter is too low! Results won't be reliable!" 
) if (isinstance(self.max_iter, int)) and self.max_iter >= 1e6: - raise ValueError(f"max_iter parameter is unnecessarily high!") + raise ValueError("max_iter parameter is unnecessarily high!") # Checks for random_seed if not isinstance(self.random_seed, (int, type(None))): @@ -320,7 +318,7 @@ def _validate_init_params(self): ) if (isinstance(self.random_seed, int)) and self.random_seed < 0: - raise ValueError(f"random_seed parameter must be > 0") + raise ValueError("random_seed parameter must be > 0") # Checks for verbose if not isinstance(self.verbose, bool): @@ -332,15 +330,15 @@ def _validate_fit_data(self): """Verifies that T, M, and y are formatted the right way""" # Checks for T column if not is_float_dtype(self.T): - raise TypeError(f"Treatment data must be of type float") + raise TypeError("Treatment data must be of type float") # Checks for M column if not is_float_dtype(self.M): - raise TypeError(f"Mediation data must be of type float") + raise TypeError("Mediation data must be of type float") # Checks for Y column if not is_float_dtype(self.y): - raise TypeError(f"Outcome data must be of type float") + raise TypeError("Outcome data must be of type float") def _grid_values(self): """Produces initial grid values for the treatment variable""" @@ -498,7 +496,7 @@ def calculate_mediation(self, ci=0.95): # Bootstrap these general_indirect values bootstrap_overall_means = [] - for i in range(0, 1000): + for _ in range(0, 1000): bootstrap_overall_means.append( general_indirect.sample(frac=0.25, replace=True).mean() ) @@ -521,10 +519,10 @@ def calculate_mediation(self, ci=0.95): # Calculate overall, mean, indirect effect total_prop_mean = round(np.array(self.prop_indirect_list).mean(), 4) - total_prop_lower = self._clip_negatives( + total_prop_lower = self.clip_negatives( round(np.percentile(bootstrap_overall_means, q=lower * 100), 4) ) - total_prop_upper = self._clip_negatives( + total_prop_upper = self.clip_negatives( round(np.percentile(bootstrap_overall_means, q=upper * 100), 4) ) @@ -535,18 +533,12 @@ def calculate_mediation(self, ci=0.95): ) return final_results - def _clip_negatives(self, number): - """Helper function to clip negative numbers to zero""" - if number < 0: - return 0 - return number - def _bootstrap_analysis(self, temp_low_treatment, temp_high_treatment): """The top-level function used in the fitting method""" bootstrap_collection = [] - for i in range(0, self.bootstrap_replicates): + for _ in range(0, self.bootstrap_replicates): # Create single bootstrap replicate temp_t, temp_m, temp_y = self._create_bootstrap_replicate() # Create the models from this @@ -558,7 +550,6 @@ def _bootstrap_analysis(self, temp_low_treatment, temp_high_treatment): temp_mediator_model, temp_t, temp_m, - temp_y, temp_low_treatment, temp_high_treatment, ) @@ -612,7 +603,6 @@ def _mediator_prediction( temp_mediator_model, temp_t, temp_m, - temp_y, temp_low_treatment, temp_high_treatment, ): @@ -651,15 +641,15 @@ def _outcome_prediction( ["z0", temp_high_treatment, temp_low_treatment, predict_m0, predict_m0], ] - for input in inputs: + for element in inputs: # Set treatment values - t_1 = input[1] - t_0 = input[2] + t_1 = element[1] + t_0 = element[2] # Set mediator values - m_1 = input[3] - m_0 = input[4] + m_1 = element[3] + m_0 = element[4] pr_1 = temp_outcome_model.predict( np.column_stack((np.repeat(t_1, self.n), m_1)) @@ -669,6 +659,6 @@ def _outcome_prediction( np.column_stack((np.repeat(t_0, self.n), m_0)) ) - outcome_preds[input[0]] = (pr_1 - pr_0).mean() + outcome_preds[element[0]] 
= (pr_1 - pr_0).mean() return outcome_preds diff --git a/causal_curve/tmle.py b/causal_curve/tmle.py deleted file mode 100644 index 7dd1a72..0000000 --- a/causal_curve/tmle.py +++ /dev/null @@ -1,572 +0,0 @@ -# pylint: disable=bad-continuation -""" -Defines the Targetted Maximum likelihood Estimation (TMLE) model class -""" -from pprint import pprint - -import numpy as np -import pandas as pd -from pandas.api.types import is_float_dtype, is_numeric_dtype -from scipy.interpolate import interp1d -from scipy.stats import norm -from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor -from statsmodels.genmod.generalized_linear_model import GLM - -from causal_curve.core import Core -from causal_curve.utils import rand_seed_wrapper - - -class TMLE(Core): - """ - Constructs a causal dose response curve through a series of TMLE comparisons across a grid - of the treatment values. Gradient boosting is used for prediction in Q model and G model. - Assumes continuous treatment and outcome variable. - - WARNING: - - -In choosing `treatment_grid_bins` be very careful to respect the "positivity" assumption. - There must be sufficient data and variability of treatment within each bin the treatment - is split into. - - -This algorithm assumes you've already performed the necessary transformations to - categorical covariates (i.e. these variables are already one-hot encoded and - one of the categories is excluded for each set of dummy variables). - - -Please take care to ensure that the "ignorability" assumption is met (i.e. - all strong confounders are captured in your covariates and there is no - informative censoring), otherwise your results will be biased, sometimes strongly so. - - Parameters - ---------- - - treatment_grid_bins: list of floats or ints - Represents the edges of bins of treatment values that are used to construct a smooth curve - Look at the distribution of your treatment variable to determine which - family is more appropriate. Be mindful of the "positivity" assumption when determining - bins. In other words, make sure there aren't too few data points in each bin. Mean - treatment values between the bin edges will be used to generate the CDRC. - - n_estimators: int, optional (default = 100) - Optional argument to set the number of learners to use when sklearn - creates TMLE's Q and G models. - - learning_rate: float, optional (default = 0.1) - Optional argument to set the sklearn's learning rate for TMLE's Q and G models. - - max_depth: int, optional (default = 5) - Optional argument to set sklearn's maximum depth when creating TMLE's Q and G models. - - random_seed: int, optional (default = None) - Sets the random seed. - - verbose: bool, optional (default = False) - Determines whether the user will get verbose status updates. 
- - - Attributes - ---------- - psi_list: array of shape (len(treatment_grid_bins) - 2, ) - The estimated causal difference between treatment bins - - std_error_ic_list: array of shape (len(treatment_grid_bins) - 2, ) - The standard errors for the psi estimates found in `psi_list` - - - Methods - ---------- - fit: (self, T, X, y) - Fits the causal dose-response model - - calculate_CDRC: (self, ci, CDRC_grid_num) - Calculates the CDRC (and confidence interval) from TMLE estimation - - - Examples - -------- - >>> from causal_curve import TMLE - >>> tmle = TMLE(treatment_grid_bins = [0, 0.03, 0.05, 0.25, 0.5, 1.0, 2.0], - n_estimators=500, - learning_rate = 0.1, - max_depth = 5, - random_seed=111 - ) - >>> tmle.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome']) - >>> tmle_results = tmle.calculate_CDRC(0.95) - - - References - ---------- - - van der Laan MJ and Rubin D. Targeted maximum likelihood learning. In: The International - Journal of Biostatistics, 2(1), 2006. - - van der Laan MJ and Gruber S. Collaborative double robust penalized targeted - maximum likelihood estimation. In: The International Journal of Biostatistics 6(1), 2010. - - """ - - def __init__( - self, - treatment_grid_bins, - n_estimators=100, - learning_rate=0.1, - max_depth=5, - random_seed=None, - verbose=False, - ): - - self.treatment_grid_bins = treatment_grid_bins - self.n_estimators = n_estimators - self.learning_rate = learning_rate - self.max_depth = max_depth - self.random_seed = random_seed - self.verbose = verbose - - # Validate the params - self._validate_init_params() - rand_seed_wrapper() - - if self.verbose: - print("Using the following params for TMLE model:") - pprint(self.get_params(), indent=4) - - def _validate_init_params(self): - """ - Checks that the params used when instantiating TMLE model are formatted correctly - """ - - # Checks for treatment_grid_bins - if not isinstance(self.treatment_grid_bins, list): - raise TypeError( - f"treatment_grid_bins parameter must be a list, " - f"but found type {type(self.treatment_grid_bins)}" - ) - - for element in self.treatment_grid_bins: - if not isinstance(element, (int, float)): - raise TypeError( - f"'{element}' in `treatment_grid_bins` list is not of type float or int, " - f"it is {type(element)}" - ) - - if len(self.treatment_grid_bins) < 2: - raise TypeError("treatment_grid_bins list must, at minimum, of length >= 2") - - # Checks for n_estimators - if not isinstance(self.n_estimators, int): - raise TypeError( - f"n_estimators parameter must be an integer, " - f"but found type {type(self.n_estimators)}" - ) - - if (self.n_estimators < 10) or (self.n_estimators > 100000): - raise TypeError("n_estimators parameter must be between 10 and 100000") - - # Checks for learning_rate - if not isinstance(self.learning_rate, (int, float)): - raise TypeError( - f"learning_rate parameter must be an integer or float, " - f"but found type {type(self.learning_rate)}" - ) - - if (self.learning_rate <= 0) or (self.learning_rate >= 1000): - raise TypeError( - "learning_rate parameter must be greater than 0 and less than 1000" - ) - - # Checks for max_depth - if not isinstance(self.max_depth, int): - raise TypeError( - f"max_depth parameter must be an integer, " - f"but found type {type(self.max_depth)}" - ) - - if self.max_depth <= 0: - raise TypeError("max_depth parameter must be greater than 0") - - # Checks for random_seed - if not isinstance(self.random_seed, (int, type(None))): - raise TypeError( - f"random_seed parameter must be an int, but found 
type {type(self.random_seed)}" - ) - - if (isinstance(self.random_seed, int)) and self.random_seed < 0: - raise ValueError(f"random_seed parameter must be > 0") - - # Checks for verbose - if not isinstance(self.verbose, bool): - raise TypeError( - f"verbose parameter must be a boolean type, but found type {type(self.verbose)}" - ) - - def _validate_fit_data(self): - """Verifies that T, X, and y are formatted the right way""" - # Checks for T column - if not is_float_dtype(self.t_data): - raise TypeError(f"Treatment data must be of type float") - - # Make sure all X columns are float or int - if isinstance(self.x_data, pd.Series): - if not is_numeric_dtype(self.x_data): - raise TypeError( - f"All covariate (X) columns must be int or float type (i.e. must be numeric)" - ) - - elif isinstance(self.x_data, pd.DataFrame): - for column in self.x_data: - if not is_numeric_dtype(self.x_data[column]): - raise TypeError( - """All covariate (X) columns must be int or float type - (i.e. must be numeric)""" - ) - - # Checks for Y column - if not is_float_dtype(self.y_data): - raise TypeError(f"Outcome data must be of type float") - - def _validate_calculate_CDRC_params(self, ci): - """Validates the parameters given to `calculate_CDRC`""" - - if not isinstance(ci, float): - raise TypeError( - f"`ci` parameter must be an float, but found type {type(ci)}" - ) - - if isinstance(ci, float) and ((ci <= 0) or (ci >= 1.0)): - raise ValueError("`ci` parameter should be between (0, 1)") - - def _initial_bucket_mean_prediction(self): - """Creates a model to predict the outcome variable given the provided inputs within - the first bucket of treatment_grid_bins. This returns the mean predicted outcome. - """ - - y = self.y_data[self.t_data < self.treatment_grid_bins[1]] - X = pd.concat([self.t_data, self.x_data], axis=1)[ - self.t_data < self.treatment_grid_bins[1] - ] - - init_model = GradientBoostingRegressor( - n_estimators=self.n_estimators, - max_depth=self.max_depth, - learning_rate=self.learning_rate, - random_state=self.random_seed, - ).fit(X, y) - - return init_model.predict(X).mean() - - def _create_treatment_comparison_df( - self, low_boundary, med_boundary, high_boundary - ): - """Given the current boundaries chosen from treatment_grid_bins, this filters - the input data appropriately. - """ - temp_y = self.y_data[ - ((self.t_data >= low_boundary) & (self.t_data <= high_boundary)) - ] - temp_x = self.x_data[ - ((self.t_data >= low_boundary) & (self.t_data <= high_boundary)) - ] - temp_t = self.t_data.copy() - temp_t = temp_t[((temp_t >= low_boundary) & (temp_t <= high_boundary))] - temp_t[((temp_t >= low_boundary) & (temp_t < med_boundary))] = 0 - temp_t[((temp_t >= med_boundary) & (temp_t <= high_boundary))] = 1 - - return temp_y, temp_x, temp_t - - def _collect_mean_t_levels(self): - """Collects the mean treatment value within each treatment bucket in treatment_grid_bins""" - - t_bin_means = [] - - for index, _ in enumerate(self.treatment_grid_bins): - if index == (len(self.treatment_grid_bins) - 1): - continue - - t_bin_means.append( - self.t_data[ - ( - (self.t_data >= self.treatment_grid_bins[index]) - & (self.t_data <= self.treatment_grid_bins[index + 1]) - ) - ].mean() - ) - - return t_bin_means - - def fit(self, T, X, y): - """Fits the TMLE causal dose-response model. For now, this only accepts pandas columns - with the same index. 
- - Parameters - ---------- - T: array-like, shape (n_samples,) - A continuous treatment variable - X: array-like, shape (n_samples, m_features) - Covariates, where n_samples is the number of samples - and m_features is the number of features - y: array-like, shape (n_samples,) - Outcome variable - - Returns - ---------- - self : object - - """ - self.t_data = T.reset_index(drop=True, inplace=False) - self.x_data = X.reset_index(drop=True, inplace=False) - self.y_data = y.reset_index(drop=True, inplace=False) - - # Validate this input data - self._validate_fit_data() - - # Get the mean, predicted outcome value within the first bucket - if self.verbose: - print( - "Calculating the mean, predicted outcome value within the first bucket..." - ) - self.outcome_start_val = self._initial_bucket_mean_prediction() - - # Loop through the comparisons in the treatment_grid_bins - if self.verbose: - print("Beginning main loop through treatment bins...") - - # Collect loop results in these lists - self.t_bin_means = self._collect_mean_t_levels() - self.psi_list = [] - self.std_error_ic_list = [] - - for index, _ in enumerate(self.treatment_grid_bins): - if (index == 0) or (index == len(self.treatment_grid_bins) - 1): - continue - if self.verbose: - print( - f"***** Starting iteration {index} of {len(self.treatment_grid_bins) - 2} *****" - ) - low_boundary = self.treatment_grid_bins[index - 1] - med_boundary = self.treatment_grid_bins[index] - high_boundary = self.treatment_grid_bins[index + 1] - - # Create comparison dataset - temp_y, temp_x, temp_t = self._create_treatment_comparison_df( - low_boundary, med_boundary, high_boundary - ) - - self.n_obs = len(temp_y) - - # Fit Q-model and get relevent predictions - if self.verbose: - print("Fitting Q-model and making predictions...") - self.y_hat_a, self.y_hat_1, self.y_hat_0 = self._q_model( - temp_y, temp_x, temp_t - ) - - # Fit G-model and get relevent predictions - if self.verbose: - print("Fitting G-model and making predictions...") - self.pi_hat1, self.pi_hat0 = self._g_model(temp_x, temp_t) - - # Estimate delta_hat - if self.verbose: - print("Estimating delta hat...") - self.delta_hat = self._delta_hat_estimation(temp_y, temp_x, temp_t) - - # Estimate targeted and corrected Psi - if self.verbose: - print("Estimating Psi...") - psi, std_error_IC = self._psi_estimation(temp_y, temp_t) - - self.psi_list.append(psi) - self.std_error_ic_list.append(std_error_IC) - - if self.verbose: - print(f"Finished this loop!") - - def calculate_CDRC(self, ci=0.95, CDRC_grid_num=100): - """Using the results of the fitted model, this generates the CDRC by interpolation - of the binned treatment comparisons. This returns a confidence interval as well, which - is also generated by interpolation. - - Parameters - ---------- - ci: float (default = 0.95) - The desired confidence interval to produce. Default value is 0.95, corresponding - to 95% confidence intervals. bounded (0, 1.0). - - CDRC_grid_num: int, optional (default = 100) - Linear interpolation over a quantile-based grid of treatment values is used - to produce the final CDRC. This parameter determines how many points - to include on that grid. Higher values will produce a finer estimate of the CDRC, - but this increases computation time. Default is usually a reasonable number. - - Returns - ---------- - dataframe: Pandas dataframe - Contains treatment grid values, the CDRC point estimate at that value, - and the associated lower and upper confidence interval bounds at that point. 
- - self: object - """ - - self._validate_calculate_CDRC_params(ci) - z_star = norm.ppf((1 + ci) / 2) - - if self.verbose: - print( - f"Estimating the CDRC and confidence intervals via cubic interpolation..." - ) - - # Collect discrete t_values - t_values = self.t_bin_means - - # Collect discrete y_values - y_values = [self.outcome_start_val] - - for index, item in enumerate(self.psi_list): - y_values.append(self.outcome_start_val + sum(self.psi_list[: (index + 1)])) - - # Collect discrete lower confidence bounds - lower_values = [self.outcome_start_val - (z_star * self.std_error_ic_list[0])] - for index, y_value in enumerate(y_values[1:]): - lower_values.append(y_value - (z_star * self.std_error_ic_list[index])) - - # Collect discrete upper confidence bounds - upper_values = [self.outcome_start_val + (z_star * self.std_error_ic_list[0])] - for index, y_value in enumerate(y_values[1:]): - upper_values.append(y_value + (z_star * self.std_error_ic_list[index])) - - # Perform cubic interpolation - CDRC_interp = interp1d(t_values, y_values, kind="cubic") - lower_interp = interp1d(t_values, lower_values, kind="cubic") - upper_interp = interp1d(t_values, upper_values, kind="cubic") - - CDRC_t_grid = self._grid_values(CDRC_grid_num, t_values) - - Treatment = CDRC_t_grid - CDRC = CDRC_interp(CDRC_t_grid) - Lower_CI = lower_interp(CDRC_t_grid) - Upper_CI = upper_interp(CDRC_t_grid) - - if self.verbose: - print(f"Done!") - - return pd.DataFrame( - { - "Treatment": Treatment, - "Causal_Dose_Response": CDRC, - "Lower_CI": Lower_CI, - "Upper_CI": Upper_CI, - } - ).round(3) - - def _grid_values(self, CDRC_grid_num, t_values): - """Produces grid values for use in estimating the final CDRC and confidence intervals.""" - - return np.quantile( - self.t_data[((self.t_data > t_values[0]) & (self.t_data < t_values[-1]))], - q=np.linspace( - start=0, - stop=1, - num=CDRC_grid_num, - ), - ) - - def _q_model(self, temp_y, temp_x, temp_t): - """Produces the Q-model and gets outcome predictions using the provided treatment - values, when the treatment is completely present and not present. 
- """ - - X = pd.concat([temp_t, temp_x], axis=1).to_numpy() - y = temp_y.to_numpy() - - Q_model = GradientBoostingRegressor( - n_estimators=self.n_estimators, - max_depth=self.max_depth, - learning_rate=self.learning_rate, - random_state=self.random_seed, - ).fit(X, y) - - # Make predictions with provided treatment values - y_hat_a = Q_model.predict(X) - - temp = np.column_stack((np.repeat(1, self.n_obs), np.asarray(temp_x))) - - # Make predictions when treatment is completely present - y_hat_1 = Q_model.predict(temp) - - # Make predictions when treatment is completely not present - temp = np.column_stack((np.repeat(0, self.n_obs), np.asarray(temp_x))) - - y_hat_0 = Q_model.predict(temp) - - return y_hat_a, y_hat_1, y_hat_0 - - def _g_model(self, temp_x, temp_t): - """Produces the G-model and gets treatment assignment predictions""" - - X = temp_x.to_numpy() - t = temp_t.to_numpy() - - G_model = GradientBoostingClassifier( - n_estimators=self.n_estimators, - max_depth=self.max_depth, - learning_rate=self.learning_rate, - random_state=self.random_seed, - ).fit(X, t) - - # Make predictions of receiving treatment - pi_hat1 = G_model.predict_proba(X)[:, 1] - - # Predictions of not receiving treatment - pi_hat0 = 1 - pi_hat1 - - return pi_hat1, pi_hat0 - - def _delta_hat_estimation(self, temp_y, temp_x, temp_t): - """Estimates delta to correct treatment estimation""" - H_a = [] - - for idx, treatment in enumerate(np.asarray(temp_t)): - if treatment == 1: - H_a.append(1 / self.pi_hat1[idx]) - elif treatment == 0: - H_a.append(-1 / self.pi_hat0[idx]) - - H_a = np.array(H_a) - - # Create GLM using H_a as a forced offset - targetting_model = GLM( - endog=np.asarray(temp_y), exog=H_a, offset=np.asarray(self.y_hat_a) - ).fit() - - return targetting_model.params[0] - - def _psi_estimation(self, temp_y, temp_t): - """Estimates final Psi for the treatment comparison, also estimates the - standard error via the influence curve - """ - - # Estimate Psi - H_1 = 1 / self.pi_hat1 - H_0 = -1 / self.pi_hat0 - - Y_star_1 = self.y_hat_1 + (self.delta_hat * H_1) - Y_star_0 = self.y_hat_0 + (self.delta_hat * H_0) - - Psi = round((Y_star_1 - Y_star_0).mean(), 3) - - # Use Psi and other various variables to estimate the standard error - D1 = ( - ((temp_t / self.pi_hat1) * (temp_y - self.y_hat_1)) - + self.y_hat_1 - - Y_star_1 - ) - D0 = ( - (((1 - temp_t) / (1 - self.pi_hat1)) * (temp_y - self.y_hat_0)) - + self.y_hat_0 - - Y_star_0 - ) - EIC = D1 - D0 - - std_error_IC = np.sqrt(EIC.var() / self.n_obs) - - return Psi, std_error_IC diff --git a/causal_curve/tmle_core.py b/causal_curve/tmle_core.py new file mode 100644 index 0000000..6c7cd8b --- /dev/null +++ b/causal_curve/tmle_core.py @@ -0,0 +1,601 @@ +""" +Defines the Targetted Maximum likelihood Estimation (TMLE) model class +""" +from pprint import pprint + +import numpy as np +import pandas as pd +from pandas.api.types import is_float_dtype, is_numeric_dtype +from pygam import LinearGAM, s +from scipy.interpolate import interp1d +from sklearn.neighbors import KernelDensity +from sklearn.ensemble import GradientBoostingRegressor +from statsmodels.nonparametric.kernel_regression import KernelReg + +from causal_curve.core import Core + + +class TMLE_Core(Core): + """ + Constructs a causal dose response curve via a modified version of Targetted + Maximum Likelihood Estimation (TMLE) across a grid of the treatment values. 
+ Gradient boosting is used for prediction of the Q model and G models, simple + kernel regression is used processing those model results, and a generalized + additive model is used in the final step to contruct the final curve. + Assumes continuous treatment and outcome variable. + + WARNING: + + -The treatment values should be roughly normally-distributed for this tool + to work. Otherwise you may encounter internal math errors. + + -This algorithm assumes you've already performed the necessary transformations to + categorical covariates (i.e. these variables are already one-hot encoded and + one of the categories is excluded for each set of dummy variables). + + -Please take care to ensure that the "ignorability" assumption is met (i.e. + all strong confounders are captured in your covariates and there is no + informative censoring), otherwise your results will be biased, sometimes strongly so. + + Parameters + ---------- + + treatment_grid_num: int, optional (default = 100) + Takes the treatment, and creates a quantile-based grid across its values. + For instance, if the number 6 is selected, this means the algorithm will only take + the 6 treatment variable values at approximately the 0, 20, 40, 60, 80, and 100th + percentiles to estimate the causal dose response curve. + Higher value here means the final curve will be more finely estimated, + but also increases computation time. Default is usually a reasonable number. + + lower_grid_constraint: float, optional(default = 0.01) + This adds an optional constraint of the lower side of the treatment grid. + Sometimes data near the minimum values of the treatment are few in number + and thus generate unstable estimates. By default, this clips the bottom 1 percentile + or lower of treatment values. This can be as low as 0, indicating there is no + lower limit to how much treatment data is considered. + + upper_grid_constraint: float, optional (default = 0.99) + See above parameter. Just like above, but this is an upper constraint. + By default, this clips the top 99th percentile or higher of treatment values. + This can be as high as 1.0, indicating there is no upper limit to how much + treatment data is considered. + + n_estimators: int, optional (default = 200) + Optional argument to set the number of learners to use when sklearn + creates TMLE's Q and G models. + + learning_rate: float, optional (default = 0.01) + Optional argument to set the sklearn's learning rate for TMLE's Q and G models. + + max_depth: int, optional (default = 3) + Optional argument to set sklearn's maximum depth when creating TMLE's Q and G models. + + bandwidth: float, optional (default = 0.5) + Optional argument to set the bandwidth parameter of the internal + kernel density estimation and kernel regression methods. + + random_seed: int, optional (default = None) + Sets the random seed. + + verbose: bool, optional (default = False) + Determines whether the user will get verbose status updates. + + + Attributes + ---------- + + + + Methods + ---------- + fit: (self, T, X, y) + Fits the causal dose-response model + + calculate_CDRC: (self, ci, CDRC_grid_num) + Calculates the CDRC (and confidence interval) from TMLE estimation + + + Examples + -------- + + + + References + ---------- + + Kennedy EH, Ma Z, McHugh MD, Small DS. Nonparametric methods for doubly robust estimation + of continuous treatment effects. Journal of the Royal Statistical Society, Series B. 79(4), 2017, pp.1229-1245. + + van der Laan MJ and Rubin D. Targeted maximum likelihood learning. 
In: The International + Journal of Biostatistics, 2(1), 2006. + + van der Laan MJ and Gruber S. Collaborative double robust penalized targeted + maximum likelihood estimation. In: The International Journal of Biostatistics 6(1), 2010. + + """ + + def __init__( + self, + treatment_grid_num=100, + lower_grid_constraint=0.01, + upper_grid_constraint=0.99, + n_estimators=200, + learning_rate=0.01, + max_depth=3, + bandwidth=0.5, + random_seed=None, + verbose=False, + ): + + self.treatment_grid_num = treatment_grid_num + self.lower_grid_constraint = lower_grid_constraint + self.upper_grid_constraint = upper_grid_constraint + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.max_depth = max_depth + self.bandwidth = bandwidth + self.random_seed = random_seed + self.verbose = verbose + + # Validate the params + self._validate_init_params() + self.rand_seed_wrapper() + + self.if_verbose_print("Using the following params for TMLE model:") + if self.verbose: + pprint(self.get_params(), indent=4) + + def _validate_init_params(self): + """ + Checks that the params used when instantiating TMLE model are formatted correctly + """ + + # Checks for treatment_grid_num + if not isinstance(self.treatment_grid_num, int): + raise TypeError( + f"treatment_grid_num parameter must be an integer, " + f"but found type {type(self.treatment_grid_num)}" + ) + + if (isinstance(self.treatment_grid_num, int)) and self.treatment_grid_num < 10: + raise ValueError( + f"treatment_grid_num parameter should be >= 10 so your final curve " + f"has enough resolution, but found value {self.treatment_grid_num}" + ) + + if ( + isinstance(self.treatment_grid_num, int) + ) and self.treatment_grid_num >= 1000: + raise ValueError("treatment_grid_num parameter is too high!") + + # Checks for lower_grid_constraint + if not isinstance(self.lower_grid_constraint, float): + raise TypeError( + f"lower_grid_constraint parameter must be a float, " + f"but found type {type(self.lower_grid_constraint)}" + ) + + if ( + isinstance(self.lower_grid_constraint, float) + ) and self.lower_grid_constraint < 0: + raise ValueError( + f"lower_grid_constraint parameter cannot be < 0, " + f"but found value {self.lower_grid_constraint}" + ) + + if ( + isinstance(self.lower_grid_constraint, float) + ) and self.lower_grid_constraint >= 1.0: + raise ValueError( + f"lower_grid_constraint parameter cannot >= 1.0, " + f"but found value {self.lower_grid_constraint}" + ) + + # Checks for upper_grid_constraint + if not isinstance(self.upper_grid_constraint, float): + raise TypeError( + f"upper_grid_constraint parameter must be a float, " + f"but found type {type(self.upper_grid_constraint)}" + ) + + if ( + isinstance(self.upper_grid_constraint, float) + ) and self.upper_grid_constraint <= 0: + raise ValueError( + f"upper_grid_constraint parameter cannot be <= 0, " + f"but found value {self.upper_grid_constraint}" + ) + + if ( + isinstance(self.upper_grid_constraint, float) + ) and self.upper_grid_constraint > 1.0: + raise ValueError( + f"upper_grid_constraint parameter cannot > 1.0, " + f"but found value {self.upper_grid_constraint}" + ) + + # Checks for lower_grid_constraint isn't higher than upper_grid_constraint + if self.lower_grid_constraint >= self.upper_grid_constraint: + raise ValueError( + "lower_grid_constraint should be lower than upper_grid_constraint!" 
+ ) + + # Checks for n_estimators + if not isinstance(self.n_estimators, int): + raise TypeError( + f"n_estimators parameter must be an integer, " + f"but found type {type(self.n_estimators)}" + ) + + if (self.n_estimators < 10) or (self.n_estimators > 100000): + raise TypeError("n_estimators parameter must be between 10 and 100000") + + # Checks for learning_rate + if not isinstance(self.learning_rate, (int, float)): + raise TypeError( + f"learning_rate parameter must be an integer or float, " + f"but found type {type(self.learning_rate)}" + ) + + if (self.learning_rate <= 0) or (self.learning_rate >= 1000): + raise TypeError( + "learning_rate parameter must be greater than 0 and less than 1000" + ) + + # Checks for max_depth + if not isinstance(self.max_depth, int): + raise TypeError( + f"max_depth parameter must be an integer, " + f"but found type {type(self.max_depth)}" + ) + + if self.max_depth <= 0: + raise TypeError("max_depth parameter must be greater than 0") + + # Checks for bandwidth + if not isinstance(self.bandwidth, (int, float)): + raise TypeError( + f"bandwidth parameter must be an integer or float, " + f"but found type {type(self.bandwidth)}" + ) + + if (self.bandwidth <= 0) or (self.bandwidth >= 1000): + raise TypeError( + "bandwidth parameter must be greater than 0 and less than 1000" + ) + + # Checks for random_seed + if not isinstance(self.random_seed, (int, type(None))): + raise TypeError( + f"random_seed parameter must be an int, but found type {type(self.random_seed)}" + ) + + if (isinstance(self.random_seed, int)) and self.random_seed < 0: + raise ValueError("random_seed parameter must be > 0") + + # Checks for verbose + if not isinstance(self.verbose, bool): + raise TypeError( + f"verbose parameter must be a boolean type, but found type {type(self.verbose)}" + ) + + def _validate_fit_data(self): + """Verifies that T, X, and y are formatted the right way""" + # Checks for T column + if not is_float_dtype(self.t_data): + raise TypeError("Treatment data must be of type float") + + # Make sure all X columns are float or int + if isinstance(self.x_data, pd.Series): + if not is_numeric_dtype(self.x_data): + raise TypeError( + "All covariate (X) columns must be int or float type (i.e. must be numeric)" + ) + + elif isinstance(self.x_data, pd.DataFrame): + for column in self.x_data: + if not is_numeric_dtype(self.x_data[column]): + raise TypeError( + """All covariate (X) columns must be int or float type + (i.e. must be numeric)""" + ) + + # Checks for Y column + if not is_float_dtype(self.y_data): + raise TypeError("Outcome data must be of type float") + + def _validate_calculate_CDRC_params(self, ci): + """Validates the parameters given to `calculate_CDRC`""" + + if not isinstance(ci, float): + raise TypeError( + f"`ci` parameter must be an float, but found type {type(ci)}" + ) + + if isinstance(ci, float) and ((ci <= 0) or (ci >= 1.0)): + raise ValueError("`ci` parameter should be between (0, 1)") + + def fit(self, T, X, y): + """Fits the TMLE causal dose-response model. For now, this only + accepts pandas columns. You *must* provide at least one covariate column. 
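        In the notation of the Kennedy et al. (2017) reference above (a hedged reading of the
        steps in the method body below, not notation used anywhere in this file), write Q(t, x)
        for the Q-model outcome prediction, g(t | x) for the conditional treatment density implied
        by the G-model, and g_bar(t), Q_bar(t) for their averages over the covariate distribution
        at each treatment grid value. The corrected pseudo-outcome assembled below is then roughly

            pseudo_out = (y - Q(t, x)) / (g(t | x) / g_bar(t)) + Q_bar(t)

        and the final curve comes from regressing this pseudo-outcome on the observed treatment
        with a GAM.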
+ + Parameters + ---------- + T: array-like, shape (n_samples,) + A continuous treatment variable + X: array-like, shape (n_samples, m_features) + Covariates, where n_samples is the number of samples + and m_features is the number of features + y: array-like, shape (n_samples,) + Outcome variable + + Returns + ---------- + self : object + + """ + self.t_data = T.reset_index(drop=True, inplace=False) + self.x_data = X.reset_index(drop=True, inplace=False) + self.y_data = y.reset_index(drop=True, inplace=False) + + # Validate this input data + self._validate_fit_data() + + # Capture covariate and treatment column names + self.treatment_col_name = self.t_data.name + + if len(self.x_data.shape) == 1: + self.covariate_col_names = [self.x_data.name] + else: + self.covariate_col_names = self.x_data.columns.values.tolist() + + # Note the size of the data + self.num_rows = len(self.t_data) + + # Produce expanded versions of the inputs + self.if_verbose_print("Transforming data for the Q-model and G-model") + self.grid_values, self.fully_expanded_x , self.fully_expanded_t_and_x = self._transform_inputs() + + # Fit G-model and get relevent predictions + self.if_verbose_print("Fitting G-model and making treatment assignment predictions...") + self.g_model_preds, self.g_model_2_preds = self._g_model() + + # Fit Q-model and get relevent predictions + self.if_verbose_print("Fitting Q-model and making outcome predictions...") + self.q_model_preds = self._q_model() + + # Calculating treatment assignment adjustment using G-model's predictions + self.if_verbose_print("Calculating treatment assignment adjustment using G-model's predictions...") + self.n_interpd_values, self.var_n_interpd_values = self._treatment_assignment_correction() + + # Adjusting outcome using Q-model's predictions + self.if_verbose_print("Adjusting outcome using Q-model's predictions...") + self.outcome_adjust, self.expand_outcome_adjust = self._outcome_adjustment() + + # Calculating corrected pseudo-outcome values + self.if_verbose_print("Calculating corrected pseudo-outcome values...") + self.pseudo_out = (self.y_data - self.outcome_adjust) / (self.n_interpd_values / self.var_n_interpd_values) + self.expand_outcome_adjust + + # Training final GAM model using pseudo-outcome values + self.if_verbose_print("Training final GAM model using pseudo-outcome values...") + self.final_gam = self._fit_final_gam() + + + def calculate_CDRC(self, ci=0.95): + """Using the results of the fitted model, this generates a dataframe of CDRC point estimates + at each of the values of the treatment grid. Connecting these estimates will produce + the overall estimated CDRC. Confidence interval is returned as well. + + Parameters + ---------- + ci: float (default = 0.95) + The desired confidence interval to produce. Default value is 0.95, corresponding + to 95% confidence intervals. bounded (0, 1.0). + + Returns + ---------- + dataframe: Pandas dataframe + Contains treatment grid values, the CDRC point estimate at that value, + and the associated lower and upper confidence interval bounds at that point. 
+ + self: object + + """ + self._validate_calculate_CDRC_params(ci) + + self.if_verbose_print(""" + Generating predictions for each value of treatment grid, + and averaging to get the CDRC...""" + ) + + # Create CDRC predictions from the trained, final GAM + + self._cdrc_preds = self._cdrc_predictions_continuous(ci) + + return pd.DataFrame( + self._cdrc_preds, columns=["Treatment", "Causal_Dose_Response", "Lower_CI", "Upper_CI"] + ).round(3) + + + def _transform_inputs(self): + """Takes the treatment and covariates and transforms so they can + be used by the algo""" + + # Create treatment grid + grid_values = np.linspace( + start=self.t_data.min(), + stop=self.t_data.max(), + num=self.treatment_grid_num + ) + + # Create expanded treatment array + expanded_t = np.array([]) + for treat_value in grid_values: + expanded_t = np.append(expanded_t, ([treat_value] * self.num_rows)) + + # Create expanded treatment array with covariates + expanded_t_and_x = pd.concat( + [ + pd.DataFrame(expanded_t), + pd.concat( + [self.x_data] * self.treatment_grid_num + ).reset_index(drop = True, inplace = False), + ], + axis = 1, + ignore_index = True + ) + + expanded_t_and_x.columns = [self.treatment_col_name] + self.covariate_col_names + + fully_expanded_t_and_x = pd.concat( + [ + pd.concat([self.x_data, self.t_data], axis=1), + expanded_t_and_x + ], + axis = 0, + ignore_index = True + ) + + fully_expanded_x = fully_expanded_t_and_x[self.covariate_col_names] + + return grid_values, fully_expanded_x, fully_expanded_t_and_x + + def _g_model(self): + """Produces the G-model and gets treatment assignment predictions""" + + t = self.t_data.to_numpy() + X = self.x_data.to_numpy() + + g_model = GradientBoostingRegressor( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + learning_rate=self.learning_rate, + random_state=self.random_seed + ).fit(X = X, y = t) + g_model_preds = g_model.predict(self.fully_expanded_x) + + g_model2 = GradientBoostingRegressor( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + learning_rate=self.learning_rate, + random_state=self.random_seed + ).fit(X = X, y = ((t - g_model_preds[0:self.num_rows])**2)) + g_model_2_preds = g_model2.predict(self.fully_expanded_x) + + return g_model_preds, g_model_2_preds + + def _q_model(self): + """Produces the Q-model and gets outcome predictions using the provided treatment + values, when the treatment is completely present and not present. 
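        The G-model above estimates the conditional treatment density indirectly: one boosted
        regressor fits the conditional mean of the treatment, a second fits the squared residuals
        as a variance model, and the standardized residuals are later passed through a kernel
        density estimate in ``_treatment_assignment_correction``. A minimal stand-alone sketch of
        that pattern, using made-up toy data and making no claim to match the package internals
        exactly:

            import numpy as np
            from sklearn.ensemble import GradientBoostingRegressor
            from sklearn.neighbors import KernelDensity

            rng = np.random.default_rng(0)
            X = rng.normal(size=(500, 2))                  # toy covariates
            T = X[:, 0] + rng.normal(scale=0.5, size=500)  # toy treatment

            mean_model = GradientBoostingRegressor().fit(X, T)        # E[T | X]
            resid_sq = (T - mean_model.predict(X)) ** 2
            var_model = GradientBoostingRegressor().fit(X, resid_sq)  # Var(T | X)

            # standardize, clipping the variance predictions to stay positive
            cond_sd = np.sqrt(np.clip(var_model.predict(X), 1e-6, None))
            t_std = (T - mean_model.predict(X)) / cond_sd

            kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(t_std.reshape(-1, 1))

            # density of the standardized residual, rescaled by the conditional SD,
            # approximates the conditional density of T given X
            cond_density = np.exp(kde.score_samples(t_std.reshape(-1, 1))) / cond_sd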
+ """ + + X = pd.concat([self.t_data, self.x_data], axis=1).to_numpy() + y = self.y_data.to_numpy() + + q_model = GradientBoostingRegressor( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + learning_rate=self.learning_rate, + random_state=self.random_seed + ).fit(X = X, y = y) + q_model_preds = q_model.predict(self.fully_expanded_t_and_x) + + return q_model_preds + + + def _treatment_assignment_correction(self): + """Uses the G-model and its predictions to adjust treatment assignment + """ + + t_standard = ( + (self.fully_expanded_t_and_x[self.treatment_col_name] - self.g_model_preds) / np.sqrt(self.g_model_2_preds) + ) + + interpd_values = interp1d( + self.one_dim_estimate_density(t_standard.values)[0], + self.one_dim_estimate_density(t_standard.values[0:self.num_rows])[1], + kind='linear' + )(t_standard) / np.sqrt(self.g_model_2_preds) + + n_interpd_values = interpd_values[0:self.num_rows] + + temp_interpd = interpd_values[self.num_rows:] + + zeros_mat = np.zeros((self.num_rows, self.treatment_grid_num)) + + for i in range(0, self.treatment_grid_num): + lower = i * self.num_rows + upper = i * self.num_rows + self.num_rows + zeros_mat[:,i] = temp_interpd[lower:upper] + + var_n_interpd_values = self.pred_from_loess( + train_x = self.grid_values, + train_y = zeros_mat.mean(axis = 0), + x_to_pred = self.t_data + ) + + return n_interpd_values, var_n_interpd_values + + + def _outcome_adjustment(self): + """Uses the Q-model and its predictions to adjust the outcome + """ + + outcome_adjust = self.q_model_preds[0:self.num_rows] + + temp_outcome_adjust = self.q_model_preds[self.num_rows:] + + zero_mat = np.zeros((self.num_rows, self.treatment_grid_num)) + for i in range(0, self.treatment_grid_num): + lower = i * self.num_rows + upper = i * self.num_rows + self.num_rows + zero_mat[:,i] = temp_outcome_adjust[lower:upper] + + expand_outcome_adjust = self.pred_from_loess( + train_x = self.grid_values, + train_y = zero_mat.mean(axis = 0), + x_to_pred = self.t_data + ) + + return outcome_adjust, expand_outcome_adjust + + def _fit_final_gam(self): + """We now regress the original treatment values against the pseudo-outcome values + """ + + return LinearGAM( + s(0, n_splines=30, spline_order=3), + max_iter=500, + lam=self.bandwidth + ).fit(self.t_data, y = self.pseudo_out) + + def one_dim_estimate_density(self, series): + """ + Takes in a numpy array, returns grid values for KDE and predicted probabilities + """ + series_grid = np.linspace( + start=series.min(), + stop=series.max(), + num=self.num_rows + ) + + kde = KernelDensity( + kernel='gaussian', + bandwidth=self.bandwidth + ).fit(series.reshape(-1, 1)) + + return series_grid, np.exp(kde.score_samples(series_grid.reshape(-1, 1))) + + def pred_from_loess(self, train_x, train_y, x_to_pred): + """ + Trains simple loess regression and returns predictions + """ + kr_model = KernelReg( + endog = train_y, + exog = train_x, + var_type = 'c', + bw = [self.bandwidth] + ) + + return kr_model.fit(x_to_pred)[0] diff --git a/causal_curve/tmle_regressor.py b/causal_curve/tmle_regressor.py new file mode 100644 index 0000000..9f08b15 --- /dev/null +++ b/causal_curve/tmle_regressor.py @@ -0,0 +1,100 @@ +""" +Defines the Targetted Maximum likelihood Estimation (TMLE) regressor model class +""" + +import numpy as np + +from causal_curve.tmle_core import TMLE_Core + + +class TMLE_Regressor(TMLE_Core): + """ + A TMLE tool that handles continuous outcomes. Inherits the TMLE_core + base class. See that base class code its docstring for more details. 
+ """ + + def _cdrc_predictions_continuous(self, ci): + """Returns the predictions of CDRC for each value of the treatment grid. Essentially, + we're making predictions using the original treatment against the pseudo-outcome. + To be used when the outcome of interest is continuous. + """ + + # To keep track of cdrc predictions, we create an empty 2d array of shape + # (treatment_grid_num, 4). The last dimension is of length 4 because + # we are going to keep track of the treatment, point estimate of the prediction, as well as + # the lower and upper bounds of the prediction interval + cdrc_preds = np.zeros((self.treatment_grid_num, 4), dtype=float) + + # Loop through each of the grid values, get point estimate and get estimate interval + for i in range(0, self.treatment_grid_num): + temp_T = self.grid_values[i] + temp_cdrc_preds = self.final_gam.predict(temp_T) + temp_cdrc_interval = self.final_gam.confidence_intervals(temp_T, width=ci) + temp_cdrc_lower_bound = temp_cdrc_interval[:, 0] + temp_cdrc_upper_bound = temp_cdrc_interval[:, 1] + cdrc_preds[i, 0] = temp_T + cdrc_preds[i, 1] = temp_cdrc_preds + cdrc_preds[i, 2] = temp_cdrc_lower_bound + cdrc_preds[i, 3] = temp_cdrc_upper_bound + + return cdrc_preds + + def point_estimate(self, T): + """Calculates point estimate within the CDRC given treatment values. + Can only be used when outcome is continuous. Can be estimate for a single + data point or can be run in batch for many observations. Extrapolation will produce + untrustworthy results; the provided treatment should be within + the range of the training data. + + Parameters + ---------- + T: Numpy array, shape (n_samples,) + A continuous treatment variable. + + Returns + ---------- + array: Numpy array + Contains a set of CDRC point estimates + + """ + return np.apply_along_axis(self._create_point_estimate, 0, T.reshape(1, -1)) + + def _create_point_estimate(self, T): + """Takes a single treatment value and produces a single point estimate + in the case of a continuous outcome. + """ + return self.final_gam.predict(np.array([T]).reshape(1, -1)) + + def point_estimate_interval(self, T, ci=0.95): + """Calculates the prediction confidence interval associated with a point estimate + within the CDRC given treatment values. Can only be used + when outcome is continuous. Can be estimate for a single data point + or can be run in batch for many observations. Extrapolation will produce + untrustworthy results; the provided treatment should be within + the range of the training data. + + Parameters + ---------- + T: Numpy array, shape (n_samples,) + A continuous treatment variable. + ci: float (default = 0.95) + The desired confidence interval to produce. Default value is 0.95, corresponding + to 95% confidence intervals. bounded (0, 1.0). + + Returns + ---------- + array: Numpy array + Contains a set of CDRC prediction intervals ([lower bound, higher bound]) + + """ + return np.apply_along_axis( + self._create_point_estimate_interval, 0, T.reshape(1, -1), width=ci + ).T.reshape(-1, 2) + + def _create_point_estimate_interval(self, T, width): + """Takes a single treatment value and produces confidence interval + associated with a point estimate in the case of a continuous outcome. + """ + return self.final_gam.prediction_intervals( + np.array([T]).reshape(1, -1), width=width + ) diff --git a/causal_curve/utils.py b/causal_curve/utils.py deleted file mode 100644 index e9fdc41..0000000 --- a/causal_curve/utils.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -Misc. 
utility functions -""" - -import numpy as np - - -def rand_seed_wrapper(random_seed=None): - """Sets the random seed using numpy - - Parameters - ---------- - random_seed: int - - Returns - ---------- - None - """ - if random_seed is None: - pass - else: - np.random_seed(random_seed) diff --git a/docs/GPS_Classifier.rst b/docs/GPS_Classifier.rst new file mode 100644 index 0000000..6d59249 --- /dev/null +++ b/docs/GPS_Classifier.rst @@ -0,0 +1,55 @@ +.. _GPS_Classifier: + +============================================================ +GPS_Classifier Tool (continuous treatments, binary outcomes) +============================================================ + +As with the other GPS tool, we calculate generalized propensity scores (GPS) but +with the classifier we can estimate the point-by-point causal contribution of +a continuous treatment to a binary outcome. The GPS_Classifier does this by +estimating the log odds of a positive outcome and odds ratio (odds of positive outcome / odds of negative outcome) along +the entire range of treatment values: + +.. image:: ../imgs/binary_OR_fig.png + + +Currently, the causal-curve package does not contain a TMLE implementation that is appropriate for a binary outcome, +so the GPS_Classifier tool will have to suffice for this sort of outcome. + +This tool works much like the _GPS_Regressor tool; as long as the outcome series in your dataframe contains +binary integer values (e.g. 0's and 1's) the ``fit()`` method will work as it's supposed to: + +>>> df.head(5) # a pandas dataframe with your data + X_1 X_2 Treatment Outcome +0 0.596685 0.162688 0.000039 1 +1 1.014187 0.916101 0.000197 0 +2 0.932859 1.328576 0.000223 0 +3 1.140052 0.555203 0.000339 0 +4 1.613471 0.340886 0.000438 1 + +With this dataframe, we can now calculate the GPS to estimate the causal relationship between +treatment and outcome. Let's use the default settings of the GPS tool: + +>>> from causal_curve import GPS_Classifier +>>> gps = GPS() +>>> gps.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome']) +>>> gps_results = gps.calculate_CDRC(0.95) + +The ``gps_results`` object (a dataframe) now contains all of the data to produce the above plot. + +If you'd like to estimate the log odds at a specific point on the curve, use the +``predict_log_odds`` to do so. + +References +---------- + +Galagate, D. Causal Inference with a Continuous Treatment and Outcome: Alternative +Estimators for Parametric Dose-Response function with Applications. PhD thesis, 2016. + +Moodie E and Stephens DA. Estimation of dose–response functions for +longitudinal data using the generalised propensity score. In: Statistical Methods in +Medical Research 21(2), 2010, pp.149–166. + +Hirano K and Imbens GW. The propensity score with continuous treatments. +In: Gelman A and Meng XL (eds) Applied bayesian modeling and causal inference +from incomplete-data perspectives. Oxford, UK: Wiley, 2004, pp.73–84. diff --git a/docs/GPS_example.rst b/docs/GPS_Regressor.rst similarity index 71% rename from docs/GPS_example.rst rename to docs/GPS_Regressor.rst index 8d75e1c..ec320e8 100644 --- a/docs/GPS_example.rst +++ b/docs/GPS_Regressor.rst @@ -1,21 +1,18 @@ -.. _GPS_example: +.. 
_GPS_Regressor: -==================================================== -GPS Method for Causal Dose Response Curve Estimation -==================================================== +================================================================ +GPS_Regressor Tool (continuous treatments, continuous outcomes) +================================================================ -Generalized propensity score method ------------------------------------ - -In this example, we use this package's GPS tool to estimate the marginal causal curve of some +In this example, we use this package's GPS_Regressor tool to estimate the marginal causal curve of some continuous treatment on a continuous outcome, accounting for some mild confounding effects. To put this differently, the result of this will be an estimate of the average -of each individual's dose-response to the treatment. To do this we employ the -generalized propensity score (GPS) to correct the treatment prediction of the outcome. +of each individual's dose-response to the treatment. To do this we calculate +generalized propensity scores (GPS) to correct the treatment prediction of the outcome. -Compared with the package's TMLE method, this GPS method is more computationally efficient, -better suited for large datasets, but produces significantly wider confidence intervals. +Compared with the package's TMLE method, the GPS methods are more computationally efficient, +better suited for large datasets, but produces wider confidence intervals. In this example we use simulated data originally developed by Hirano and Imbens but adapted by others (see references). The advantage of this simulated data is it allows us @@ -71,10 +68,10 @@ The following code completes the data generation: >>> ).sort_values('Treatment', ascending = True) With this dataframe, we can now calculate the GPS to estimate the causal relationship between -treatment and outcome. Let's use the default settings of the GPS tool: +treatment and outcome. Let's use the default settings of the GPS_Regressor tool: ->>> from causal_curve import GPS ->>> gps = GPS() +>>> from causal_curve import GPS_Regressor +>>> gps = GPS_Regressor() >>> gps.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome']) >>> gps_results = gps.calculate_CDRC(0.95) @@ -84,14 +81,9 @@ half the error of a simple LOESS estimate using only the treatment and the outco .. image:: ../imgs/cdrc/CDRC.png -A binary outcome can also be handled with the GPS tool. As long as the outcome series contains -binary integer values (e.g. 0's and 1's) the GPS `fit` method will work as it's supposed to. - -The GPS tool also allows you to estimate a specific set of points along the causal curve. -In the case of a continuous outcome, use the `predict` and `predict_interval` methods -to produce a point estimate and prediction interval, respectively. In the case of a -binary outcome, use the `predict_log_odds` methods to calculate the log odds of the -highest outcome class. +The GPS_Regressor tool also allows you to estimate a specific set of points along the causal curve. +Use the `predict` and `predict_interval` methods to produce a point estimate +and prediction interval, respectively. References ---------- diff --git a/docs/Mediation_example.rst b/docs/Mediation_example.rst index 9e50c0d..ba249ce 100644 --- a/docs/Mediation_example.rst +++ b/docs/Mediation_example.rst @@ -1,14 +1,11 @@ .. 
_Mediation_example: -========================================================== -Test for Mediation for continuous treatments and mediators -========================================================== +============================================================ +Mediation Tool (continuous treatment, mediator, and outcome) +============================================================ -Mediation test --------------- - -It trying to explore the causal relationships between various elements, oftentimes you'll use +In trying to explore the causal relationships between various elements, oftentimes you'll use your domain knowledge to sketch out your initial ideas about the causal connections. See the following causal DAG of the expected relationships between smoking, diabetes, obesity, age, and mortality (Havumaki et al.): diff --git a/docs/TMLE_Regressor.rst b/docs/TMLE_Regressor.rst new file mode 100644 index 0000000..7948faa --- /dev/null +++ b/docs/TMLE_Regressor.rst @@ -0,0 +1,51 @@ +.. _TMLE_Regressor: + +================================================================ +TMLE_Regressor Tool (continuous treatments, continuous outcomes) +================================================================ + + +In this example, we use this package's Targeted Maximum Likelihood Estimation (TMLE) +tool to estimate the marginal causal curve of some continuous treatment on a continuous outcome, +accounting for some mild confounding effects. + +The TMLE algorithm is doubly robust, meaning that as long as one of the two models contained +with the tool (the ``g`` or ``q`` models) performs well, then the overall tool will correctly +estimate the causal curve. + +Compared with the package's GPS methods incorporates more powerful machine learning techniques internally (gradient boosting) +and produces significantly smaller confidence intervals. However it is less computationally efficient +and will take longer to run. In addition, **the treatment values provided should +be roughly normally-distributed**, otherwise you may encounter internal math errors. + + +>>> df.head(5) # a pandas dataframe with your data + X_1 X_2 Treatment Outcome +0 0.596685 0.162688 0.000039 12.3 +1 1.014187 0.916101 0.000197 14.9 +2 0.932859 1.328576 0.000223 19.01 +3 1.140052 0.555203 0.000339 22.3 +4 1.613471 0.340886 0.000438 24.98 + + +>>> from causal_curve import TMLE_Regressor +>>> tmle = TMLE_Regressor( + random_seed=111, + verbose=True, +) + +>>> tmle.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome']) +>>> gps_results = tmle.calculate_CDRC(0.99) + + +References +---------- + +Kennedy EH, Ma Z, McHugh MD, Small DS. Nonparametric methods for doubly robust estimation +of continuous treatment effects. Journal of the Royal Statistical Society, Series B. 79(4), 2017, pp.1229-1245. + +van der Laan MJ and Rubin D. Targeted maximum likelihood learning. In: ​U.C. Berkeley Division of +Biostatistics Working Paper Series, 2006. + +van der Laan MJ and Gruber S. Collaborative double robust penalized targeted +maximum likelihood estimation. In: The International Journal of Biostatistics 6(1), 2010. diff --git a/docs/TMLE_example.rst b/docs/TMLE_example.rst deleted file mode 100644 index f870b90..0000000 --- a/docs/TMLE_example.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. 
_TMLE_example: - -===================================================== -TMLE Method for Causal Dose Response Curve Estimation -===================================================== - -Targeted Maximum Likelihood Estimation method ---------------------------------------------- - - -In this example, we use this package's Targeted Maximum Likelihood Estimation (TMLE) -tool to estimate the marginal causal curve of some continuous treatment on a continuous outcome, -accounting for some mild confounding effects. - -Compared with the package's GPS method, this TMLE method is double robust against model -misspecification, incorporates more powerful machine learning techniques internally (gradient boosting), -produces significantly smaller confidence intervals, however it is not computationally efficient -and will take longer to run. - -The quality of the estimate it produces is highly dependent on the user's choice -of the `treatment_grid_bins` parameter. If the bins are too small, you might violate the -'positivity' assumption, but if the buckets are too large, your final estiamte of the CDRC will -not be smooth. We recommend ensure there are at least 100 treatment observations within -each of your buckets. Exploring the treatment distribution and quantiles is recommended. - - ->>> from causal_curve import TMLE ->>> tmle = TMLE( - treatment_grid_bins = [1.0, 1.5, 2.0, ...], - random_seed=111, - verbose=True, -) - ->>> tmle.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome']) ->>> gps_results = tmle.calculate_CDRC(0.99) - - -References ----------- - -van der Laan MJ and Rubin D. Targeted maximum likelihood learning. In: ​U.C. Berkeley Division of -Biostatistics Working Paper Series, 2006. - -van der Laan MJ and Gruber S. Collaborative double robust penalized targeted -maximum likelihood estimation. In: The International Journal of Biostatistics 6(1), 2010. diff --git a/docs/causal_curve.rst b/docs/causal_curve.rst index f1ed5f1..e1a2441 100644 --- a/docs/causal_curve.rst +++ b/docs/causal_curve.rst @@ -2,18 +2,52 @@ causal\_curve package ===================== -causal\_curve.gps module ------------------------- +causal\_curve.core module +------------------------- -.. automodule:: causal_curve.gps +.. automodule:: causal_curve.core :members: :undoc-members: :show-inheritance: -causal\_curve.tmle module -------------------------- +causal\_curve.gps_core module +----------------------------- + +.. automodule:: causal_curve.gps_core + :members: + :undoc-members: + :show-inheritance: + +causal\_curve.gps_regressor module +---------------------------------- + +.. automodule:: causal_curve.gps_regressor + :members: + :undoc-members: + :show-inheritance: + + +causal\_curve.gps_classifier module +----------------------------------- + +.. automodule:: causal_curve.gps_classifier + :members: + :undoc-members: + :show-inheritance: -.. automodule:: causal_curve.tmle + +causal\_curve.tmle_core module +------------------------------ + +.. automodule:: causal_curve.tmle_core + :members: + :undoc-members: + :show-inheritance: + +causal\_curve.tmle_regressor module +----------------------------------- + +.. automodule:: causal_curve.tmle_regressor :members: :undoc-members: :show-inheritance: @@ -26,21 +60,8 @@ causal\_curve.mediation module :undoc-members: :show-inheritance: -causal\_curve.core module -------------------------- - -.. automodule:: causal_curve.core - :members: - :undoc-members: - :show-inheritance: -causal\_curve.utils module --------------------------- -.. 
automodule:: causal_curve.utils - :members: - :undoc-members: - :show-inheritance: Module contents diff --git a/docs/changelog.rst b/docs/changelog.rst index 631c7d5..057e634 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -5,6 +5,16 @@ Change Log ========== +Version 1.0.0: **Major Update** +------------- +- Overhaul of the TMLE tool to make it dramatically more accurate and user-friendly. +- Improved TMLE example documentation +- Much like with `scikit-learn`, there are now separate model classes used for predicting binary or continuous outcomes +- Updating documentation to reflect API changes +- Added more tests +- Linted with `pylint` (added `.pylintrc` file) + + Version 0.5.2 ------------- - Fixed bug that prevented `causal-curve` modules from being shown in Sphinx documentation diff --git a/docs/conf.py b/docs/conf.py index d83b66f..9c8461a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,7 +23,7 @@ author = "Roni Kobrosly" # The full version, including alpha/beta/rc tags -release = "0.5.2" +release = "1.0.0" # -- General configuration --------------------------------------------------- diff --git a/docs/full_example.rst b/docs/full_example.rst index a269203..ed7fe7c 100644 --- a/docs/full_example.rst +++ b/docs/full_example.rst @@ -63,7 +63,7 @@ features as potentially confounding "nuisance" variables: - Whether the child spent time in a neonatal intensive care unit as a baby - Whether the child is experiencing food insecurity (is food sometimes not available due to lack of resources?). -In our simulated "experiment", these above confounders will be controlled for. +In our "experiment", these above confounders will be controlled for. By using either the GPS or TMLE tools included in `causal-curve` one can generate the causal dose-response curves for BLLs in relation to the four outcomes: diff --git a/docs/index.rst b/docs/index.rst index 9b07857..7b2d454 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,10 +23,11 @@ Welcome to causal-curve's documentation! .. toctree:: :maxdepth: 1 :hidden: - :caption: Tutorials of Individual Tools + :caption: Package Tools - GPS_example - TMLE_example + GPS_Regressor + GPS_Classifier + TMLE_Regressor Mediation_example .. toctree:: @@ -52,7 +53,7 @@ Welcome to causal-curve's documentation! **causal-curve** is a Python package with tools to perform causal inference -using observational data when the treatment of interest is continuous. +when the treatment of interest is continuous. .. image:: ../imgs/welcome_plot.png @@ -60,6 +61,8 @@ using observational data when the treatment of interest is continuous. Summary ------- +(**Version 1.0.0 released in Jan 2021!**) + There are many available methods to perform causal inference when your intervention of interest is binary, but few methods exist to handle continuous treatments. This is unfortunate because there are many scenarios (in industry and research) where these methods would be useful. This library attempts to @@ -67,16 +70,19 @@ address this gap, providing tools to estimate causal curves (AKA causal dose-res Both continuous and binary outcomes can be modeled with this package. -Quick example (of the ``GPS`` tool) ------------------------------------ +Quick example (of the ``GPS_Regressor`` tool) +--------------------------------------------- **causal-curve** uses a sklearn-like API that should feel familiar to python machine learning users. +This includes ``_Regressor`` and ``_Classifier`` models, and ``fit()`` methods. 
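Before the main regressor example below, here is how the binary-outcome ``GPS_Classifier``
counterpart looks; a minimal sketch, assuming a pandas dataframe ``df`` whose ``Outcome``
column holds binary 0/1 integers:

>>> from causal_curve import GPS_Classifier

>>> clf = GPS_Classifier(treatment_grid_num = 200, random_seed = 512)
>>> clf.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome'])
>>> clf_results = clf.calculate_CDRC(ci = 0.95)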
+ The following example estimates the causal dose-response curve (CDRC) by calculating generalized propensity scores. ->>> from causal_curve import GPS +>>> from causal_curve import GPS_Regressor +>>> import numpy as np ->>> gps = GPS(treatment_grid_num = 200, random_seed = 512) +>>> gps = GPS_Regressor(treatment_grid_num = 200, random_seed = 512) >>> df # a pandas dataframe with your data X_1 X_2 Treatment Outcome @@ -88,10 +94,10 @@ generalized propensity scores. >>> gps.fit(T = df['Treatment'], X = df[['X_1', 'X_2']], y = df['Outcome']) >>> gps_results = gps.calculate_CDRC(ci = 0.95) ->>> gps_pred = gps.predict(0.0003) ->>> gps_pred_interval = gps.predict_interval(0.0003, ci = 0.95) +>>> gps_point = gps.point_estimate(np.array([0.0003])) +>>> gps_point_interval = gps.point_estimate_interval(np.array([0.0003]), ci = 0.95) -1. First we import our GPS class. +1. First we import the `GPS_Regressor` class. 2. Then we instantiate the class, providing any of the optional parameters. @@ -101,6 +107,6 @@ generalized propensity scores. 5. Estimate the points of the causal curve (along with 95% confidence interval bounds) with the ``.calculate_CDRC()`` method. -6. Estimate user-provided points along the causal curve with the ``.predict()``, ``.predict_interval()``, and ``.predict_log_odds()`` methods. +6. Generate point estimates along the causal curve with the ``.point_estimate()``, ``.point_estimate_interval()``, and ``.estimate_log_odds()`` methods. 7. Explore or plot your results! diff --git a/docs/intro.rst b/docs/intro.rst index 3d8b8af..773e60a 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -20,7 +20,7 @@ to analyzing observational data (e.g. confounding and selection bias) [@Hernán: As long as you have varying observational data on some treatment, your outcome of interest, and potentially confounding variables across your units of analysis (in addition to meeting the assumptions described below), -then you can essentially simulate a proper experiment and make causal claims. +then you can essentially estimate the results of a proper experiment and make causal claims. Interpreting the causal curve @@ -28,7 +28,7 @@ Interpreting the causal curve Two of the methods contained within this package produce causal curves for continuous treatments (see the GPS and TMLE methods). Both continuous and binary treatments can be modeled -(only the GPS tool can handle binary outcomes). +(only the `GPS_Classifier` tool can handle binary outcomes). **Continuous outcome:** @@ -51,7 +51,7 @@ generated through standard multivariable regression modeling in a few important .. image:: ../imgs/binary_OR_fig.png -In the case of binary outcome, the GPS tool can be used to estimate a curve of odds ratio. Every +In the case of binary outcome, the `GPS_Classifier` tool can be used to estimate a curve of odds ratios. Every point on the curve is relative to the lowest treatment value. The highest effect (relative to the lowest treatment value) is around a treatment value of -1.2. At this point in the treatment, the odds of a positive class occurring is 5.6 times higher compared with the lowest treatment value. This curve is always on @@ -80,16 +80,6 @@ None of the methods provided in causal-curve rely on inference via instrumental rely on the data from the observed treatment, confounders, and the outcome of interest (like the above GPS example). 
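A minimal sketch of the binary-outcome workflow described above, shown for orientation only: the synthetic dataframe is invented for illustration, while the ``GPS_Classifier`` constructor arguments and the ``fit()``, ``calculate_CDRC()``, and ``estimate_log_odds()`` calls follow the API exercised elsewhere in this release.

>>> import numpy as np
>>> import pandas as pd
>>> from causal_curve import GPS_Classifier

>>> np.random.seed(512)
>>> n = 500  # synthetic example data, invented for illustration
>>> x1 = np.random.normal(0, 1, n)
>>> x2 = np.random.normal(0, 1, n)
>>> treatment = np.random.normal(0, 1, n) + 0.5 * x1
>>> outcome = np.random.binomial(1, 1 / (1 + np.exp(-(0.8 * treatment + 0.3 * x2))))
>>> df = pd.DataFrame({'x1': x1, 'x2': x2, 'treatment': treatment, 'outcome': outcome})

>>> gps = GPS_Classifier(treatment_grid_num=100, random_seed=512)
>>> gps.fit(T=df['treatment'], X=df[['x1', 'x2']], y=df['outcome'])
>>> gps_results = gps.calculate_CDRC(ci=0.95)  # curve of odds ratios relative to the lowest treatment value
>>> log_odds = gps.estimate_log_odds(np.array([-1.2, 0.0, 1.2]))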
-Additional resources --------------------------------------------- - -There are a number of excellent blogs and books covering the math behind causal inference in more depth. -In addition to these, the following resources are recommended: - -- Miguel Hernan and Jamie Robin's book `"Causal Inference: What if" `_. -- Cloudera Fast Forward Labs Research's `book on causal inference `_. - - References ---------- diff --git a/examples/NHANES_BLL_example.ipynb b/examples/NHANES_BLL_example.ipynb index 8564059..349d58a 100644 --- a/examples/NHANES_BLL_example.ipynb +++ b/examples/NHANES_BLL_example.ipynb @@ -8,6 +8,13 @@ "All NHANES III data obtained here: https://wwwn.cdc.gov/nchs/nhanes/nhanes3/datafiles.aspx#core" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# *JANUARY 2021 NOTE* : This notebook was made using causal-curve's version 0.5.x API. Version 1.0.0 was released in Jan 2021 and this notebook will require slight tweaks to run" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -728,9 +735,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (chelsea)", + "display_name": "Python 3", "language": "python", - "name": "chelsea" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -742,7 +749,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.6.12" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index b956828..68a305f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="causal-curve", - version="0.5.2", + version="1.0.0", author="Roni Kobrosly", author_email="roni.kobrosly@gmail.com", description="A python library with tools to perform causal inference using \ diff --git a/tests/conftest.py b/tests/conftest.py index 5082ba4..4d64d9e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import pytest from scipy.stats import norm -from causal_curve import GPS +from causal_curve import GPS_Regressor, GPS_Classifier, TMLE_Regressor @pytest.fixture(scope="module") @@ -197,7 +197,7 @@ def GPS_fitted_model_continuous(): df = full_continuous_example_dataset() - fixture = GPS( + fixture = GPS_Regressor( treatment_grid_num=10, lower_grid_constraint=0.0, upper_grid_constraint=1.0, @@ -227,7 +227,7 @@ def GPS_fitted_model_binary(): df = full_binary_example_dataset() - fixture = GPS( + fixture = GPS_Classifier( gps_family="normal", treatment_grid_num=10, lower_grid_constraint=0.0, @@ -245,3 +245,27 @@ def GPS_fitted_model_binary(): ) return fixture + + +@pytest.fixture(scope="module") +def TMLE_fitted_model_continuous_fixture(): + """Returns a TMLE model that is already fit with data with a continuous outcome""" + return TMLE_fitted_model_continuous() + + +def TMLE_fitted_model_continuous(): + """Example GPS model that is fit with data including a continuous outcome""" + + df = full_continuous_example_dataset() + + fixture = TMLE_Regressor( + random_seed=100, + verbose=True, + ) + fixture.fit( + T=df["treatment"], + X=df[["x1", "x2"]], + y=df["outcome"], + ) + + return fixture diff --git a/tests/integration/test_gps.py b/tests/integration/test_gps.py index 925abed..6a97fbf 100644 --- a/tests/integration/test_gps.py +++ b/tests/integration/test_gps.py @@ -2,7 +2,7 @@ import pandas as pd -from causal_curve import GPS +from causal_curve import GPS_Regressor, GPS_Classifier def test_full_continuous_gps_flow(continuous_dataset_fixture): @@ -10,7 +10,7 @@ def test_full_continuous_gps_flow(continuous_dataset_fixture): Tests the full flow of 
the GPS tool when used with a continuous outcome """ - gps = GPS( + gps = GPS_Regressor( treatment_grid_num=10, lower_grid_constraint=0.0, upper_grid_constraint=1.0, @@ -37,12 +37,12 @@ def test_full_continuous_gps_flow(continuous_dataset_fixture): assert check.all() -def test_binary_continuous_gps_flow(binary_dataset_fixture): +def test_binary_gps_flow(binary_dataset_fixture): """ Tests the full flow of the GPS tool when used with a binary outcome """ - gps = GPS( + gps = GPS_Classifier( gps_family="normal", treatment_grid_num=10, lower_grid_constraint=0.0, diff --git a/tests/integration/test_tmle.py b/tests/integration/test_tmle.py index 2d02473..452a9b6 100644 --- a/tests/integration/test_tmle.py +++ b/tests/integration/test_tmle.py @@ -2,7 +2,7 @@ import pandas as pd -from causal_curve import TMLE +from causal_curve import TMLE_Regressor def test_full_tmle_flow(continuous_dataset_fixture): @@ -10,8 +10,7 @@ def test_full_tmle_flow(continuous_dataset_fixture): Tests the full flow of the TMLE tool """ - tmle = TMLE( - treatment_grid_bins=[22.1, 30, 40, 50, 60, 70, 80.1], + tmle = TMLE_Regressor( random_seed=100, verbose=True, ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py new file mode 100644 index 0000000..df08c68 --- /dev/null +++ b/tests/unit/test_core.py @@ -0,0 +1,71 @@ +""" General unit tests of the causal-curve package """ + +import numpy as np + +from causal_curve.core import Core + + +def test_get_params(): + """ + Tests the `get_params` method of the Core base class + """ + + core = Core() + core.a = 5 + core.b = 10 + + observed_results = core.get_params() + + assert observed_results == {"a": 5, "b": 10} + + +def test_if_verbose_print(capfd): + """ + Tests the `if_verbose_print` method of the Core base class + """ + + core = Core() + core.verbose = True + + core.if_verbose_print("This is a test") + out, err = capfd.readouterr() + + assert out == "This is a test\n" + + core.verbose = False + + core.if_verbose_print("This is a test") + out, err = capfd.readouterr() + + assert out == "" + + +def test_rand_seed_wrapper(): + """ + Tests the `rand_seed_wrapper` method of the Core base class + """ + + core = Core() + core.rand_seed_wrapper(123) + + assert np.random.get_state()[1][0] == 123 + + +def test_calculate_z_score(): + """ + Tests the `calculate_z_score` method of the Core base class + """ + + core = Core() + assert round(core.calculate_z_score(0.95), 3) == 1.960 + assert round(core.calculate_z_score(0.90), 3) == 1.645 + + +def test_clip_negatives(): + """ + Tests the `clip_negatives` method of the Core base class + """ + + core = Core() + assert core.clip_negatives(0.5) == 0.5 + assert core.clip_negatives(-1.5) == 0 diff --git a/tests/unit/test_general.py b/tests/unit/test_general.py deleted file mode 100644 index b600806..0000000 --- a/tests/unit/test_general.py +++ /dev/null @@ -1,17 +0,0 @@ -""" General unit tests of the causal-curve package """ - -from causal_curve.core import Core - - -def test_core(): - """ - Tests the `Core` base class - """ - - core = Core() - core.a = 5 - core.b = 10 - - observed_results = core.get_params() - - assert observed_results == {"a": 5, "b": 10} diff --git a/tests/unit/test_gps_classifier.py b/tests/unit/test_gps_classifier.py new file mode 100644 index 0000000..f049de5 --- /dev/null +++ b/tests/unit/test_gps_classifier.py @@ -0,0 +1,33 @@ +""" Unit tests for the GPS_Core class """ + +import numpy as np +from pygam import LinearGAM +import pytest + +from causal_curve.gps_core import GPS_Core +from tests.conftest import 
full_continuous_example_dataset + + +def test_predict_log_odds_method_good(GPS_fitted_model_binary_fixture): + """ + Tests the GPS `estimate_log_odds` method using appropriate data (with a binary outcome) + """ + observed_result = GPS_fitted_model_binary_fixture.estimate_log_odds(np.array([0.5])) + assert isinstance(observed_result[0][0], float) + assert len(observed_result[0]) == 1 + + observed_result = GPS_fitted_model_binary_fixture.estimate_log_odds( + np.array([0.5, 0.6, 0.7]) + ) + assert isinstance(observed_result[0][0], float) + assert len(observed_result[0]) == 3 + + +def test_predict_log_odds_method_bad(GPS_fitted_model_continuous_fixture): + """ + Tests the GPS `estimate_log_odds` method using appropriate data (with a continuous outcome) + """ + with pytest.raises(Exception) as bad: + observed_result = GPS_fitted_model_continuous_fixture.estimate_log_odds( + np.array([50]) + ) diff --git a/tests/unit/test_gps.py b/tests/unit/test_gps_core.py similarity index 60% rename from tests/unit/test_gps.py rename to tests/unit/test_gps_core.py index e550dd4..5fc97df 100644 --- a/tests/unit/test_gps.py +++ b/tests/unit/test_gps_core.py @@ -1,10 +1,10 @@ -""" Unit tests of the gps.py module """ +""" Unit tests for the GPS_Core class """ import numpy as np from pygam import LinearGAM import pytest -from causal_curve import GPS +from causal_curve.gps_core import GPS_Core from tests.conftest import full_continuous_example_dataset @@ -19,10 +19,10 @@ ) def test_gps_fit(df_fixture, family): """ - Tests the fit method of the GPS tool + Tests the fit method of the GPS_Core tool """ - gps = GPS( + gps = GPS_Core( gps_family=family, treatment_grid_num=10, lower_grid_constraint=0.0, @@ -119,7 +119,7 @@ def test_bad_gps_instantiation( Tests for exceptions when the GPS class if call with bad inputs. 
""" with pytest.raises(Exception) as bad: - GPS( + GPS_Core( gps_family=gps_family, treatment_grid_num=treatment_grid_num, lower_grid_constraint=lower_grid_constraint, @@ -133,87 +133,17 @@ def test_bad_gps_instantiation( ) -def test_calculate_z_score(): +def test_bad_param_calculate_CDRC(GPS_fitted_model_continuous_fixture): """ - Tests that that `_calculate_z_score` methods returns expected z-scores + Tests the GPS `calculate_CDRC` when the `ci` param is bad """ - gps = GPS() - assert round(gps._calculate_z_score(0.99), 2) == 2.58 - assert round(gps._calculate_z_score(0.95), 2) == 1.96 - assert round(gps._calculate_z_score(0.90), 2) == 1.64 - assert round(gps._calculate_z_score(0.80), 2) == 1.28 - -def test_predict_method_good(GPS_fitted_model_continuous_fixture): - """ - Tests the GPS `predict` method using appropriate data (with a continuous outcome) - """ - observed_result = GPS_fitted_model_continuous_fixture.predict(np.array([50])) - assert isinstance(observed_result[0][0], float) - assert len(observed_result[0]) == 1 - - observed_result = GPS_fitted_model_continuous_fixture.predict( - np.array([40, 50, 60]) - ) - assert isinstance(observed_result[0][0], float) - assert len(observed_result[0]) == 3 - - -def test_predict_method_bad(GPS_fitted_model_binary_fixture): - """ - Tests the GPS `predict` method using inappropriate data (with a binary outcome) - """ - with pytest.raises(Exception) as bad: - observed_result = GPS_fitted_model_binary_fixture.predict(np.array([50])) - - -def test_predict_interval_method_good(GPS_fitted_model_continuous_fixture): - """ - Tests the GPS `predict_interval` method using appropriate data (with a continuous outcome) - """ - observed_result = GPS_fitted_model_continuous_fixture.predict_interval( - np.array([50]) - ) - assert isinstance(observed_result[0][0], float) - assert observed_result.shape == (1, 2) - - observed_result = GPS_fitted_model_continuous_fixture.predict_interval( - np.array([40, 50, 60]) - ) - assert isinstance(observed_result[0][0], float) - assert observed_result.shape == (3, 2) - - -def test_predict_interval_method_bad(GPS_fitted_model_binary_fixture): - """ - Tests the GPS `predict_interval` method using appropriate data (with a continuous outcome) - """ with pytest.raises(Exception) as bad: - observed_result = GPS_fitted_model_binary_fixture.predict_interval( - np.array([50]) + observed_result = GPS_fitted_model_continuous_fixture.calculate_CDRC( + np.array([50]), ci={"param": 0.95} ) - -def test_predict_log_odds_method_good(GPS_fitted_model_binary_fixture): - """ - Tests the GPS `predict_log_odds` method using appropriate data (with a binary outcome) - """ - observed_result = GPS_fitted_model_binary_fixture.predict_log_odds(np.array([0.5])) - assert isinstance(observed_result[0][0], float) - assert len(observed_result[0]) == 1 - - observed_result = GPS_fitted_model_binary_fixture.predict_log_odds( - np.array([0.5, 0.6, 0.7]) - ) - assert isinstance(observed_result[0][0], float) - assert len(observed_result[0]) == 3 - - -def test_predict_log_odds_method_bad(GPS_fitted_model_continuous_fixture): - """ - Tests the GPS `predict_log_odds` method using appropriate data (with a continuous outcome) - """ with pytest.raises(Exception) as bad: - observed_result = GPS_fitted_model_continuous_fixture.predict_log_odds( - np.array([50]) + observed_result = GPS_fitted_model_continuous_fixture.calculate_CDRC( + np.array([50]), ci=1.05 ) diff --git a/tests/unit/test_gps_regressor.py b/tests/unit/test_gps_regressor.py new file mode 100644 index 
0000000..700a18a --- /dev/null +++ b/tests/unit/test_gps_regressor.py @@ -0,0 +1,66 @@ +""" Unit tests for the GPS_Core class """ + +import numpy as np +from pygam import LinearGAM +import pytest + +from causal_curve import GPS_Regressor + + +def test_point_estimate_method_good(GPS_fitted_model_continuous_fixture): + """ + Tests the GPS `point_estimate` method using appropriate data (with a continuous outcome) + """ + + observed_result = GPS_fitted_model_continuous_fixture.point_estimate(np.array([50])) + assert isinstance(observed_result[0][0], float) + assert len(observed_result[0]) == 1 + + observed_result = GPS_fitted_model_continuous_fixture.point_estimate( + np.array([40, 50, 60]) + ) + assert isinstance(observed_result[0][0], float) + assert len(observed_result[0]) == 3 + + +def test_point_estimate_interval_method_good(GPS_fitted_model_continuous_fixture): + """ + Tests the GPS `point_estimate_interval` method using appropriate data (with a continuous outcome) + """ + observed_result = GPS_fitted_model_continuous_fixture.point_estimate_interval( + np.array([50]) + ) + assert isinstance(observed_result[0][0], float) + assert observed_result.shape == (1, 2) + + observed_result = GPS_fitted_model_continuous_fixture.point_estimate_interval( + np.array([40, 50, 60]) + ) + assert isinstance(observed_result[0][0], float) + assert observed_result.shape == (3, 2) + + +def test_point_estimate_method_bad(GPS_fitted_model_continuous_fixture): + """ + Tests the GPS `point_estimate` method using appropriate data (with a continuous outcome) + """ + + GPS_fitted_model_continuous_fixture.outcome_type = "binary" + + with pytest.raises(Exception) as bad: + observed_result = GPS_fitted_model_continuous_fixture.point_estimate( + np.array([50]) + ) + + +def test_point_estimate_interval_method_bad(GPS_fitted_model_continuous_fixture): + """ + Tests the GPS `point_estimate_interval` method using appropriate data (with a continuous outcome) + """ + + GPS_fitted_model_continuous_fixture.outcome_type = "binary" + + with pytest.raises(Exception) as bad: + observed_result = GPS_fitted_model_continuous_fixture.point_estimate_interval( + np.array([50]) + ) diff --git a/tests/unit/test_tmle.py b/tests/unit/test_tmle.py deleted file mode 100644 index b9671c1..0000000 --- a/tests/unit/test_tmle.py +++ /dev/null @@ -1,84 +0,0 @@ -""" Unit tests of the tmle.py module """ - -import pytest - -from causal_curve import TMLE - - -def test_tmle_fit(continuous_dataset_fixture): - """ - Tests the fit method GPS tool - """ - - tmle = TMLE( - treatment_grid_bins=[22.1, 30, 40, 50, 60, 70, 80.1], - random_seed=100, - verbose=True, - ) - tmle.fit( - T=continuous_dataset_fixture["treatment"], - X=continuous_dataset_fixture[["x1", "x2"]], - y=continuous_dataset_fixture["outcome"], - ) - - assert tmle.n_obs == 72 - assert len(tmle.psi_list) == 5 - assert len(tmle.std_error_ic_list) == 5 - - -@pytest.mark.parametrize( - ( - "treatment_grid_bins", - "n_estimators", - "learning_rate", - "max_depth", - "random_seed", - "verbose", - ), - [ - ([0, 1, "2", 3, 4], 100, 0.1, 5, None, False), - (5, 100, 0.1, 5, None, False), - ("5", 100, 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], "100", 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 1, 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, "0.1", 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 1000000, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, "5", None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, -5, None, 
False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5, "None", False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5, -10, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5, None, "False"), - ({"a": 5, "b": 6}, 100, 0.1, 5, None, False), - (["a", "b", "c"], 100, 0.1, 5, None, False), - ([1.0], 100, 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100.0, 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 1.0, 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 10000000, 0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, "hehe", 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, -0.1, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 10000000, 5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, "hehe", None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5.0, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, -5, None, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5, "hehe", False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5, -10, False), - ([22.1, 30, 40, 50, 60, 70, 80.1], 100, 0.1, 5, None, "thirty two"), - ], -) -def test_bad_tmle_instantiation( - treatment_grid_bins, - n_estimators, - learning_rate, - max_depth, - random_seed, - verbose, -): - with pytest.raises(Exception) as bad: - TMLE( - treatment_grid_bins=treatment_grid_bins, - n_estimators=n_estimators, - learning_rate=learning_rate, - max_depth=max_depth, - random_seed=random_seed, - verbose=verbose, - ) diff --git a/tests/unit/test_tmle_core.py b/tests/unit/test_tmle_core.py new file mode 100644 index 0000000..815b9f4 --- /dev/null +++ b/tests/unit/test_tmle_core.py @@ -0,0 +1,112 @@ +""" Unit tests of the tmle.py module """ + +import pytest + +from causal_curve.tmle_core import TMLE_Core + + +def test_tmle_fit(continuous_dataset_fixture): + """ + Tests the fit method GPS tool + """ + + tmle = TMLE_Core( + random_seed=100, + verbose=True, + ) + tmle.fit( + T=continuous_dataset_fixture["treatment"], + X=continuous_dataset_fixture[["x1", "x2"]], + y=continuous_dataset_fixture["outcome"], + ) + + assert tmle.num_rows == 500 + assert tmle.fully_expanded_t_and_x.shape == (50500, 3) + + +@pytest.mark.parametrize( + ( + "treatment_grid_num", + "lower_grid_constraint", + "upper_grid_constraint", + "n_estimators", + "learning_rate", + "max_depth", + "bandwidth", + "random_seed", + "verbose", + ), + [ + # treatment_grid_num + (100.0, 0.01, 0.99, 200, 0.01, 3, 0.5, None, False), + ("100.0", 0.01, 0.99, 200, 0.01, 3, 0.5, None, False), + (2, 0.01, 0.99, 200, 0.01, 3, 0.5, None, False), + (500000, 0.01, 0.99, 200, 0.01, 3, 0.5, None, False), + # lower_grid_constraint + (100, {0.01: "a"}, 0.99, 200, 0.01, 3, 0.5, None, False), + (100, -0.01, 0.99, 200, 0.01, 3, 0.5, None, False), + (100, 6.05, 0.99, 200, 0.01, 3, 0.5, None, False), + # upper_grid_constraint + (100, 0.01, [1, 2, 3], 200, 0.01, 3, 0.5, None, False), + (100, 0.01, -0.05, 200, 0.01, 3, 0.5, None, False), + (100, 0.01, 5.99, 200, 0.01, 3, 0.5, None, False), + (100, 0.9, 0.2, 200, 0.01, 3, 0.5, None, False), + # n_estimators + (100, 0.01, 0.99, "3.0", 0.01, 3, 0.5, None, False), + (100, 0.01, 0.99, -5, 0.01, 3, 0.5, None, False), + (100, 0.01, 0.99, 10000000, 0.01, 3, 0.5, None, False), + # learning_rate + (100, 0.01, 0.99, 200, ["a", "b"], 3, 0.5, None, False), + (100, 0.01, 0.99, 200, 5000000, 3, 0.5, None, False), + # max_depth + (100, 0.01, 0.99, 200, 0.01, "a", 0.5, None, False), + (100, 0.01, 0.99, 200, 0.01, -6, 0.5, None, False), + # bandwidth + (100, 0.01, 0.99, 
200, 0.01, 3, "b", None, False), + (100, 0.01, 0.99, 200, 0.01, 3, -10, None, False), + # random seed + (100, 0.01, 0.99, 200, 0.01, 3, 0.5, "b", False), + (100, 0.01, 0.99, 200, 0.01, 3, 0.5, -10, False), + # verbose + (100, 0.01, 0.99, 200, 0.01, 3, 0.5, None, "Verbose"), + ], +) +def test_bad_tmle_instantiation( + treatment_grid_num, + lower_grid_constraint, + upper_grid_constraint, + n_estimators, + learning_rate, + max_depth, + bandwidth, + random_seed, + verbose, +): + with pytest.raises(Exception) as bad: + TMLE_Core( + treatment_grid_num=treatment_grid_num, + lower_grid_constraint=lower_grid_constraint, + upper_grid_constraint=upper_grid_constraint, + n_estimators=n_estimators, + learning_rate=learning_rate, + max_depth=max_depth, + bandwidth=bandwidth, + random_seed=random_seed, + verbose=verbose, + ) + + +def test_bad_param_calculate_CDRC_TMLE(TMLE_fitted_model_continuous_fixture): + """ + Tests the TMLE `calculate_CDRC` when the `ci` param is bad + """ + + with pytest.raises(Exception) as bad: + observed_result = TMLE_fitted_model_continuous_fixture.calculate_CDRC( + np.array([50]), ci={"param": 0.95} + ) + + with pytest.raises(Exception) as bad: + observed_result = TMLE_fitted_model_continuous_fixture.calculate_CDRC( + np.array([50]), ci=1.05 + ) diff --git a/tests/unit/test_tmle_regressor.py b/tests/unit/test_tmle_regressor.py new file mode 100644 index 0000000..39c6e72 --- /dev/null +++ b/tests/unit/test_tmle_regressor.py @@ -0,0 +1,41 @@ +""" Unit tests of the tmle_regressor.py module """ + +import numpy as np +import pytest + +from causal_curve import TMLE_Regressor + + +def test_point_estimate_method_good(TMLE_fitted_model_continuous_fixture): + """ + Tests the TMLE `point_estimate` method using appropriate data (with a continuous outcome) + """ + + observed_result = TMLE_fitted_model_continuous_fixture.point_estimate( + np.array([50]) + ) + assert isinstance(observed_result[0][0], float) + assert len(observed_result[0]) == 1 + + observed_result = TMLE_fitted_model_continuous_fixture.point_estimate( + np.array([40, 50, 60]) + ) + assert isinstance(observed_result[0][0], float) + assert len(observed_result[0]) == 3 + + +def test_point_estimate_interval_method_good(TMLE_fitted_model_continuous_fixture): + """ + Tests the TMLE `point_estimate_interval` method using appropriate data (with a continuous outcome) + """ + observed_result = TMLE_fitted_model_continuous_fixture.point_estimate_interval( + np.array([50]) + ) + assert isinstance(observed_result[0][0], float) + assert observed_result.shape == (1, 2) + + observed_result = TMLE_fitted_model_continuous_fixture.point_estimate_interval( + np.array([40, 50, 60]) + ) + assert isinstance(observed_result[0][0], float) + assert observed_result.shape == (3, 2)
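For orientation, a minimal end-to-end sketch of the ``TMLE_Regressor`` workflow that the tests above exercise. The synthetic data is invented for illustration; the ``fit()``, ``calculate_CDRC()``, ``point_estimate()``, and ``point_estimate_interval()`` calls mirror those used in the tests and documentation of this release.

>>> import numpy as np
>>> import pandas as pd
>>> from causal_curve import TMLE_Regressor

>>> np.random.seed(100)
>>> n = 500  # synthetic example data, invented for illustration
>>> x1 = np.random.uniform(0, 1, n)
>>> x2 = np.random.uniform(0, 1, n)
>>> treatment = 20 + 60 * np.random.uniform(0, 1, n) + 5 * x1
>>> outcome = 0.5 * treatment + 10 * x2 + np.random.normal(0, 3, n)
>>> df = pd.DataFrame({'x1': x1, 'x2': x2, 'treatment': treatment, 'outcome': outcome})

>>> tmle = TMLE_Regressor(random_seed=100, verbose=False)
>>> tmle.fit(T=df['treatment'], X=df[['x1', 'x2']], y=df['outcome'])
>>> tmle_results = tmle.calculate_CDRC(0.95)  # causal dose-response curve with 95% confidence intervals
>>> point = tmle.point_estimate(np.array([50]))
>>> interval = tmle.point_estimate_interval(np.array([50]))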