diff --git a/DESCRIPTION b/DESCRIPTION index 017ad88..b95d6a4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,19 +1,17 @@ Package: designmatch Type: Package -Title: Construction of Optimally Matched Samples for Randomized - Experiments and Observational Studies that are Balanced and - Representative by Design -Version: 0.2.0 -Date: 2016-08-09 -Author: Jose R. Zubizarreta , Cinar Kilcioglu +Title: Matched Samples that are Balanced and Representative by Design +Version: 0.3.0 +Date: 2017-05-17 +Author: Jose R. Zubizarreta , Cinar Kilcioglu , Juan P. Vielma Maintainer: Jose R. Zubizarreta Depends: R (>= 3.2), lattice, MASS, slam, Rglpk Suggests: gurobi, Rcplex, Rsymphony SystemRequirements: GLPK library package (e.g., libglpk-dev on Debian/Ubuntu) License: GPL-2 | GPL-3 -Description: Includes functions for the construction of matched samples that are balanced and representative by design. Among others, these functions can be used for matching in observational studies with treated and control units, with cases and controls, in related settings with instrumental variables, and in discontinuity designs. Also, they can be used for the design of randomized experiments, for example, for matching before randomization. By default, 'designmatch' uses the 'GLPK' optimization solver, but its performance is greatly enhanced by the 'Gurobi' optimization solver and its associated R interface. For their installation, please follow the instructions at http://user.gurobi.com/download/gurobi-optimizer and http://www.gurobi.com/documentation/6.5/refman/r_api_overview.html. We have also included directions in the gurobi_installation file in the inst folder. +Description: Includes functions for the construction of matched samples that are balanced and representative by design. Among others, these functions can be used for matching in observational studies with treated and control units, with cases and controls, in related settings with instrumental variables, and in discontinuity designs. Also, they can be used for the design of randomized experiments, for example, for matching before randomization. By default, 'designmatch' uses the 'GLPK' optimization solver, but its performance is greatly enhanced by the 'Gurobi' optimization solver and its associated R interface. For their installation, please follow the instructions at and . We have also included directions in the gurobi_installation file in the inst folder. NeedsCompilation: no -Packaged: 2016-08-09 18:44:53 UTC; jrz +Packaged: 2017-05-18 01:39:38 UTC; jrz Repository: CRAN -Date/Publication: 2016-08-11 13:01:07 +Date/Publication: 2017-05-18 03:48:36 UTC diff --git a/MD5 b/MD5 index b32fef0..c055239 100644 --- a/MD5 +++ b/MD5 @@ -1,29 +1,34 @@ -3b16e7c71b53bcfd510a0b4c5467db3d *DESCRIPTION +e6ccf902847f31ee3b7a20750bbc8815 *DESCRIPTION a84e461889448bab6f00dc036a173876 *NAMESPACE -4035589e4087aa8c0dfc97e90db340f8 *R/absstddif.R +4035589e4087aa8c0dfc97e90db340f8 *R/absstddif.r 7d1a85944bcd2594e3ab13ffdd80bd1e *R/addcalip.r 06d2cede573309b27980bbb8869ad01e *R/bmatch.r +932c4f0f192b4f4901220dcb169c4d50 *R/cardmatch.r 1ccbbc3d8e654fcd4366a8c93350178b *R/constraintmatrix.r -ddd36bec26100d3064a1bf106c36740c *R/distmat.R -f562ca6fc19a2951c9fcaaa01cae81dd *R/ecdfplot.R +ddd36bec26100d3064a1bf106c36740c *R/distmat.r +692b745a194a270376faf4842b958a02 *R/distmatch.r +f562ca6fc19a2951c9fcaaa01cae81dd *R/ecdfplot.r d41d8cd98f00b204e9800998ecf8427e *R/errorhandling.r -a7c14294e78b350eca6b9b036f9f0817 *R/finetab.R -c98b06b08fd9add6edc0464b9d8d3443 *R/loveplot.R -d3df5386a7329cd2c93dedac69a83db4 *R/meantab.R +a7c14294e78b350eca6b9b036f9f0817 *R/finetab.r +c98b06b08fd9add6edc0464b9d8d3443 *R/loveplot.r +d3df5386a7329cd2c93dedac69a83db4 *R/meantab.r 4a4e30c24c6373c9118b32245b3f29fc *R/nmatch.r 98f291ae461a0b1c18ea5d8858b3eb22 *R/pairsplot.r 36d380e4bd2fb8bbd551af173cc12056 *R/problemparameters.r +3fc3b874e31fb6965591687cfc0481c9 *R/problemparameters_cardmatch.r faaeb63e221a183bf338d8005151bb03 *R/relaxation_b.r 1ee8331d65f0dab43a41bd5a4740fcdf *R/relaxation_n.r 8a6974d5a5f551af3ca1e1be7d555be6 *R/smahal.r 0187616874e4a366b4ba006ded775e2d *data/germancities.rda 7c8f4dadefa6fd57a06a23067605dda0 *data/lalonde.rda -433defcdb5377aa4ebb28a0b41f492d7 *inst/gurobi_installation.txt +fca1c5c2ab15ec2207f7f77919cca494 *inst/gurobi_installation.txt 8b5ab702a3ca62c67934d34220c09ab6 *inst/symphony_installation.txt a3432aa9ad196973c884751e3a7e0fb4 *man/absstddif.Rd e19dd3ffc9a2b389b9ad4c0a7a082600 *man/bmatch.Rd +2f242dd6c23bbb2cf7b56ab900a74572 *man/cardmatch.Rd 70d7c69f620facbd7955281a7d6bbf1b *man/designmatch-package.Rd 9d510e34095ba20a26ae6ed0e3b7807a *man/distmat.Rd +90779867557e9b66310864ca76f825f5 *man/distmatch.Rd 60560ddae0cc24fb6cedc7785379fd3d *man/ecdfplot.Rd 093455696515de415adc6c394e839715 *man/finetab.Rd 3e98c15e211274459bbe5239f7342766 *man/germancities.Rd diff --git a/R/absstddif.R b/R/absstddif.r similarity index 100% rename from R/absstddif.R rename to R/absstddif.r diff --git a/R/cardmatch.r b/R/cardmatch.r new file mode 100644 index 0000000..e46d0e6 --- /dev/null +++ b/R/cardmatch.r @@ -0,0 +1,249 @@ +#! cardmatch +cardmatch = function(t_ind, mom = NULL, fine = NULL, solver = NULL) { + + if (is.null(mom)) { + mom_covs = NULL + mom_tols = NULL + mom_targets = NULL + } else { + mom_covs = mom$covs + mom_tols = mom$tols + mom_targets = mom$targets + } + if (is.null(fine)) { + fine_covs = NULL + } else { + fine_covs = fine$covs + } + + if (is.null(solver)) { + solver = 'glpk' + t_max = 60 * 15 + approximate = 1 + } else { + t_max = solver$t_max + approximate = solver$approximate + trace = solver$trace + round_cplex = solver$round_cplex + solver = solver$name + } + + #! CALL ERROR HANDLING + + #! Generate the parameters + cat(format(" Building the matching problem..."), "\n") + prmtrs = .problemparameters_cardmatch(t_ind, mom_covs, mom_tols, mom_targets, fine_covs) + n_t = prmtrs$n_t + n_c = prmtrs$n_c + n_dec_vars = prmtrs$n_dec_vars + cvec = prmtrs$cvec + Amat = prmtrs$Amat + bvec = prmtrs$bvec + sense = prmtrs$sense + vtype = prmtrs$vtype + + #! Find matches and calculate the elapsed time + #! Gurobi + if (solver == "gurobi") { + #library(gurobi) + if (requireNamespace('gurobi', quietly=TRUE)) { + cat(format(" Gurobi optimizer is open..."), "\n") + model = list() + model$modelsense = 'max' + model$obj = cvec + model$A = Amat + model$sense = rep(NA, length(sense)) + model$sense[sense=="E"] = '=' + model$sense[sense=="L"] = '<=' + model$sense[sense=="G"] = '>=' + model$rhs = bvec + model$vtypes = vtype + + t_lim = list(TimeLimit = t_max, OutputFlag = trace) + + cat(format(" Finding the optimal matches..."), "\n") + ptm = proc.time() + out = gurobi::gurobi(model, t_lim) + time = (proc.time()-ptm)[3] + + if (out$status == "INFEASIBLE") { + cat(format(" Error: problem infeasible!"), "\n") + obj_total = NA + obj_dist_mat = NA + t_id = NA + c_id = NA + group_id = NA + time = NA + } + + if (out$status == "OPTIMAL" || out$status == "TIME_LIMIT") { + if (out$status == "OPTIMAL") { + cat(format(" Optimal matches found"), "\n") + } + + else { + cat(format(" Time limit reached, best suboptimal solution given"), "\n") + } + + #! Matched units indexes + t_id = (1:n_dec_vars)[t_ind==1 & out$x==1] + c_id = (1:n_dec_vars)[t_ind==0 & out$x==1] + + #! Group (or pair) identifier + group_id = c(1:(length(t_id)), 1:(length(c_id))) + + #! Optimal value of the objective function + obj_total = out$objval + } + } else { + stop('Required solver not installed') + } + + } + + #! CPLEX + else if (solver == "cplex") { + #library(Rcplex) + if (requireNamespace('Rcplex', quietly=TRUE)) { + cat(format(" CPLEX optimizer is open..."), "\n") + cat(format(" Finding the optimal matches..."), "\n") + ptm = proc.time() + out = Rcplex::Rcplex(objsense = 'max', cvec, Amat, bvec, sense = sense, vtype = vtype, n = 1, + control = list(trace = trace, round = round_cplex, tilim = t_max)) + time = (proc.time()-ptm)[3] + + if (out$status==108) { + cat(format(" Error: time limit exceeded, no integer solution!"), "\n") + obj_total = NA + obj_dist_mat = NA + t_id = NA + c_id = NA + group_id = NA + time = NA + } else if (is.na(out$obj)) { + cat(format(" Error: problem infeasible!"), "\n") + obj_total = NA + obj_dist_mat = NA + t_id = NA + c_id = NA + group_id = NA + time = NA + } + + if (!is.na(out$obj)) { + cat(format(" Optimal matches found"), "\n") + + #! Matched units indexes + t_id = (1:n_dec_vars)[t_ind==1 & out$xopt==1] + c_id = (1:n_dec_vars)[t_ind==0 & out$xopt==1] + + #! Group (or pair) identifier + group_id = c(1:(length(t_id)), 1:(length(c_id))) + + #! Optimal value of the objective function + obj_total = out$obj + + } + } else { + stop('Required solver not installed') + } + + } + + #! GLPK + else if (solver == "glpk") { + #library(Rglpk) + cat(format(" GLPK optimizer is open..."), "\n") + dir = rep(NA, length(prmtrs$sense)) + dir[prmtrs$sense=="E"] = '==' + dir[prmtrs$sense=="L"] = '<=' + dir[prmtrs$sense=="G"] = '>=' + + cat(format(" Finding the optimal matches..."), "\n") + ptm = proc.time() + out= Rglpk_solve_LP(cvec, Amat, dir, bvec, types = vtype, max = TRUE) + time = (proc.time()-ptm)[3] + + if (out$status!=0) { + cat(format(" Error: problem infeasible!"), "\n") + obj_total = NA + obj_dist_mat = NA + t_id = NA + c_id = NA + group_id = NA + time = NA + } + + if (out$status==0) { + cat(format(" Optimal matches found"), "\n") + + #! Matched units indexes + t_id = (1:n_dec_vars)[t_ind==1 & out$solution==1] + c_id = (1:n_dec_vars)[t_ind==0 & out$solution==1] + + #! Group (or pair) identifier + group_id = c(1:(length(t_id)), 1:(length(c_id))) + + #! Optimal value of the objective function + obj_total = out$optimum + + } + } + + #! Symphony + else { + #library(Rsymphony) + if (requireNamespace('Rsymphony', quietly=TRUE)) { + cat(format(" Symphony optimizer is open..."), "\n") + + dir = rep(NA, length(prmtrs$sense)) + dir[prmtrs$sense=="E"] = '==' + dir[prmtrs$sense=="L"] = '<=' + dir[prmtrs$sense=="G"] = '>=' + + cat(format(" Finding the optimal matches..."), "\n") + ptm = proc.time() + out= Rsymphony::Rsymphony_solve_LP(cvec, Amat, dir, bvec, types = vtype, max = TRUE, time_limit = t_max) + time = (proc.time()-ptm)[3] + + if (out$status==228) { + cat(format(" Error: problem exceeded the time limit and no feasible solution is found!"), "\n") + obj_total = NA + obj_dist_mat = NA + t_id = NA + c_id = NA + group_id = NA + time = NA + } + else if (out$status!=0) { + cat(format(" Error: problem infeasible!"), "\n") + obj_total = NA + obj_dist_mat = NA + t_id = NA + c_id = NA + group_id = NA + time = NA + } + + if (out$status==0) { + cat(format(" Optimal matches found"), "\n") + + #! Matched units indexes + t_id = (1:n_dec_vars)[t_ind==1 & out$solution==1] + c_id = (1:n_dec_vars)[t_ind==0 & out$solution==1] + + #! Group (or pair) identifier + group_id = c(1:(length(t_id)), 1:(length(c_id))) + + #! Optimal value of the objective function + obj_total = out$objval + + } + } else { + stop('Required solver not installed') + } + + } + #! Output + return(list(obj_total = obj_total, t_id = t_id, c_id = c_id, group_id = group_id, time = time)) +} \ No newline at end of file diff --git a/R/distmat.R b/R/distmat.r similarity index 100% rename from R/distmat.R rename to R/distmat.r diff --git a/R/distmatch.r b/R/distmatch.r new file mode 100644 index 0000000..43d98f4 --- /dev/null +++ b/R/distmatch.r @@ -0,0 +1,17 @@ +#! distmatch +distmatch = function(t_ind, dist_mat = NULL, solver = NULL) { + + # Subset matching weight + subset_weight = 0 + + # Total number of matched pairs + total_groups = sum(t_ind) + + # Match + out = bmatch(t_ind = t_ind, dist_mat = dist_mat, subset_weight = subset_weight, total_groups = total_groups, solver = solver) + + #! Output + return(list(obj_total = out$obj_total, obj_dist_mat = out$obj_dist_mat, + t_id = out$t_id, c_id = out$c_id, group_id = out$group_id, time = out$time, status = out$status)) + +} \ No newline at end of file diff --git a/R/ecdfplot.R b/R/ecdfplot.r similarity index 100% rename from R/ecdfplot.R rename to R/ecdfplot.r diff --git a/R/finetab.R b/R/finetab.r similarity index 100% rename from R/finetab.R rename to R/finetab.r diff --git a/R/loveplot.R b/R/loveplot.r similarity index 100% rename from R/loveplot.R rename to R/loveplot.r diff --git a/R/meantab.R b/R/meantab.r similarity index 100% rename from R/meantab.R rename to R/meantab.r diff --git a/R/problemparameters_cardmatch.r b/R/problemparameters_cardmatch.r new file mode 100644 index 0000000..df6e41d --- /dev/null +++ b/R/problemparameters_cardmatch.r @@ -0,0 +1,142 @@ +#! Generate the parameters for cardmatch +.problemparameters_cardmatch = function(t_ind, mom_covs, mom_tols, mom_targets, fine_covs) { + + #! Number of treated units and controls + n_t = sum(t_ind) + n_c = length(t_ind)-n_t + + #! Number of dec. vars. + n_dec_vars = n_t+n_c + + #! Coeffs. of the obj. fun., cvec + cvec = c(rep(1, n_t), rep(0, n_c)) + + #! Constraint matrix, Amat + row_ind_cur = 0 + #! Mom balance + if (!is.null(mom_covs)) { + rows_mom = NULL + cols_mom = NULL + vals_mom = NULL + n_mom_covs = ncol(mom_covs) + k = 1 + for (i in 1:n_mom_covs) { + #! Treated + rows_mom_plus = rep(row_ind_cur+k, n_t) + rows_mom_minus = rep(row_ind_cur+k+1, n_t) + rows_mom = c(rows_mom, rows_mom_plus, rows_mom_minus) + cols_mom = c(cols_mom, rep(1:n_t, 2)) + vals_plus = c(mom_covs[t_ind==1, i]-mom_targets[i]-mom_tols[i]) + vals_minus = c(mom_covs[t_ind==1, i]-mom_targets[i]+mom_tols[i]) + vals_mom = c(vals_mom, c(vals_plus, vals_minus)) + #! Controls + rows_mom_plus = rep(row_ind_cur+k+2, n_c) + rows_mom_minus = rep(row_ind_cur+k+3, n_c) + rows_mom = c(rows_mom, rows_mom_plus, rows_mom_minus) + cols_mom = c(cols_mom, rep(n_t+(1:n_c), 2)) + vals_plus = c(mom_covs[t_ind==0, i]-mom_targets[i]-mom_tols[i]) + vals_minus = c(mom_covs[t_ind==0, i]-mom_targets[i]+mom_tols[i]) + vals_mom = c(vals_mom, c(vals_plus, vals_minus)) + k = k+4 + } + row_ind_cur = max(rows_mom) + } + #! Fine balance + if (!is.null(fine_covs)) { + rows_fine = NULL + cols_fine = NULL + vals_fine = NULL + n_fine_covs = ncol(fine_covs) + k = 1 + for (i in 1:n_fine_covs) { + fine_covs_cats = unique(fine_covs[, i]) + for (j in fine_covs_cats) { + cols_fine_aux = which(fine_covs[, i]==j) + rows_fine = c(rows_fine, rep(row_ind_cur+k, length(cols_fine_aux))) + cols_fine = c(cols_fine, cols_fine_aux) + vals_fine = c(vals_fine, c(rep(1, sum(cols_fine_aux<=n_t)), rep(-1, sum(cols_fine_aux>n_t)))) + k = k+1 + } + } + row_ind_cur = max(rows_fine) + } + #! Equal number of matched controls and matched treated units + if (!is.null(mom_covs) & is.null(fine_covs)) { + rows_equal = rep(row_ind_cur+1, n_dec_vars) + cols_equal = 1:n_dec_vars + vals_equal = c(rep(1, n_t), rep(-1, n_c)) + } + #! Put together + rows_ind = NULL + cols_ind = NULL + vals = NULL + #! Mom balance + if (!is.null(mom_covs)) { + rows_ind = c(rows_ind, rows_mom) + cols_ind = c(cols_ind, cols_mom) + vals = c(vals, vals_mom) + } + #! Fine balance + if (!is.null(fine_covs)) { + rows_ind = c(rows_ind, rows_fine) + cols_ind = c(cols_ind, cols_fine) + vals = c(vals, vals_fine) + } + #! Equal number of matched controls and matched treated units + if (!is.null(mom_covs) & is.null(fine_covs)) { + rows_ind = c(rows_ind, rows_equal) + cols_ind = c(cols_ind, cols_equal) + vals = c(vals, vals_equal) + } + #! + aux = cbind(rows_ind, cols_ind, vals)[order(cols_ind), ] + Amat = simple_triplet_matrix(i = aux[, 1], j = aux[, 2], v = aux[, 3]) + + #! Constraint vector, bvec + bvec = NULL + #! Mom balance + if (!is.null(mom_covs)) { + bvec_mom = rep(0, length(unique(rows_mom))) + bvec = c(bvec, bvec_mom) + } + #! Fine balance + if (!is.null(fine_covs)) { + bvec_fine = rep(0, length(unique(rows_fine))) + bvec = c(bvec, bvec_fine) + } + #! Equal number of matched controls and matched treated units + if (!is.null(mom_covs) & is.null(fine_covs)) { + bvec = c(bvec, 0) + } + + #! Sense, sense + sense = NULL + #! Mom balance + if (!is.null(mom_covs)) { + sense_covs = rep(c("L", "G", "L", "G"), length(unique(rows_mom))/4) + sense = c(sense, sense_covs) + } + #! Fine balance + if (!is.null(fine_covs)) { + sense_fine = rep("E", length(unique(rows_fine))) + sense = c(sense, sense_fine) + } + if (!is.null(mom_covs) & is.null(fine_covs)) { + sense_equal = c("E") + sense = c(sense, sense_equal) + } + + #! Variable types, vtype + vtype = rep("B", n_dec_vars) + + # Output + return(list(n_t = n_t, + n_c = n_c, + n_dec_vars = n_dec_vars, + cvec = cvec, + Amat = Amat, + bvec = bvec, + sense = sense, + vtype = vtype)) + +} \ No newline at end of file diff --git a/inst/gurobi_installation.txt b/inst/gurobi_installation.txt index 2585d48..05d0798 100644 --- a/inst/gurobi_installation.txt +++ b/inst/gurobi_installation.txt @@ -1,23 +1,23 @@ For an exact solution, we strongly recommend running designmatch either with CPLEX or Gurobi. Between these two solvers, the R interface of Gurobi is considerably easier to install. Here we provide general instructions for manually installing Gurobi and its R interface in Mac and Windows machines. 1. Create a free academic license - Follow the instructions in: http://www.gurobi.com/documentation/6.0/quickstart_windows/creating_a_new_academic_li.html + Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/creating_a_new_academic_li.html 2. Install the software 2.1. In http://www.gurobi.com/index, go to Downloads > Gurobi Software 2.2. Choose your operating system and press download 3. Retrieve and set up your Gurobi license - 2.1. Follow the instructions in: http://www.gurobi.com/documentation/6.0/quickstart_windows/retrieving_and_setting_up_.html - 2.2. Then follow the instructions in: http://www.gurobi.com/documentation/6.0/quickstart_windows/retrieving_a_free_academic.html + 2.1. Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/retrieving_and_setting_up_.html + 2.2. Then follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/retrieving_a_free_academic.html 4. Test your license - Follow the instructions in: http://www.gurobi.com/documentation/6.0/quickstart_windows/testing_your_license.html + Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/testing_your_license.html 5. Install the R interface of Gurobi - Follow the instructions in: http://www.gurobi.com/documentation/6.0/quickstart_windows/r_installing_the_r_package.html - * In Windows, in R run the command install.packages("PATH\\gurobi_6.0-5.zip", repos=NULL) where path leads to the file gurobi_6.0-5.zip (for example PATH=C:\\gurobi605\\win64\\R; note that the path may be different in your computer) - * In MAC, in R run the command install.packages('PATH/gurobi_6.0-5.tgz', repos=NULL) where path leads to the file gurobi_6.0-5.tgz (for example PATH=/Library/gurobi605/mac64/R; note that the path may be different in your computer) + Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/r_installing_the_r_package.html + * In Windows, in R run the command install.packages("PATH\\gurobi_7.0-2.zip", repos=NULL) where path leads to the file gurobi_7.0-2.zip (for example PATH=C:\\gurobi702\\win64\\R; note that the path may be different in your computer) + * In MAC, in R run the command install.packages('PATH/gurobi_7.0-2.tgz', repos=NULL) where path leads to the file gurobi_7.0-2.tgz (for example PATH=/Library/gurobi605/mac64/R; note that the path may be different in your computer) 6. Test the installation Load the library and run the examples therein diff --git a/man/cardmatch.Rd b/man/cardmatch.Rd new file mode 100644 index 0000000..51490b2 --- /dev/null +++ b/man/cardmatch.Rd @@ -0,0 +1,187 @@ +\name{cardmatch} + +\alias{cardmatch} + +\title{Optimal cardinality matching in observational studies} + +\description{ + Function for optimal cardinality matching in observational studies. \code{cardmatch} finds the largest matched sample that is balanced and representative by design. The formulation of \code{cardmatch} has been simplified to handle larger data than \code{bmatch} or \code{nmatch}. Similar to \code{bmatch} or \code{nmatch}, the performance of \code{cardmatch} is greatly enhanced by using the \code{solver} options \code{cplex} or \code{gurobi}. +} + +\usage{ + cardmatch(t_ind, mom = NULL, fine = NULL, solver = NULL) +} + +\arguments{ + +\item{t_ind}{treatment indicator: a vector of zeros and ones indicating treatment (1 = treated; 0 = control).} + +\item{mom}{moment balance parameters: a list with three arguments, + +\code{mom = list(covs = mom_covs, tols = mom_tols, targets = mom_targets)}. + +\code{mom_covs} is a matrix where each column is a covariate whose mean is to be balanced. \code{mom_tols} is a vector of tolerances for the maximum difference in means for the covariates in \code{mom_covs}. \code{mom_targets} is a vector of target moments (e.g., means) of a distribution to be approximated by matched sampling. \code{mom_targets} is optional, but if \code{mom_covs} is specified then \code{mom_tols} needs to be specified too. If \code{mom_targets} is \code{NULL}, then \code{bmatch} will match treated and control units so that covariates in \code{mom_covs} differ at most by \code{mom_tols}. If \code{mom_targets} is specified, then \code{bmatch} will match treated and control units so that each matched group differs at most by \code{mom_tols} units from the respective moments in \code{mom_targets}. As a result, the matched groups will differ at most \code{mom_tols * 2} from each other. Under certain assumptions, \code{mom_targets} can be used for constructing a representative matched sample. The lengths of \code{mom_tols} and \code{mom_target} have to be equal to the number of columns of \code{mom_covs}. Note that the columns of \code{mom_covs} can be transformations of the original covariates to balance higher order single-dimensional moments like variances and skewness, and multidimensional moments such as correlations (Zubizarreta 2012).} + +\item{fine}{Fine balance parameters: a list with one argument, + +\code{fine = list(covs = fine_covs)}, + +where \code{fine_covs} is a matrix where each column is a nominal covariate for fine balance. Fine balance enforces exact distributional balance on nominal covariates, but without constraining treated and control units to be matched within each category of each nominal covariate as in exact matching. See chapter 10 of Rosenbaum (2010) for details.} + +\item{solver}{ +Optimization solver parameters: a list with four objects, + +\code{solver = list(name = name, t_max = t_max, approximate = 1, round_cplex = 0,}\cr +\code{ trace_cplex = 0)}. + +\code{solver} is a string that determines the optimization solver to be used. The options are: \code{cplex}, \code{glpk}, \code{gurobi} and \code{symphony}. The default solver is \code{glpk} with \code{approximate = 1}, so that by default an approximate solution is found (see \code{approximate} below). For an exact solution, we strongly recommend using \code{cplex} or \code{gurobi} as they are much faster than the other solvers, but they do require a license (free for academics, but not for people outside universities). Between \code{cplex} and \code{gurobi}, note that installing the R interface for \code{gurobi} is much simpler. + +\code{t_max} is a scalar with the maximum time limit for finding the matches. This option is specific to \code{cplex} and \code{gurobi}. If the optimal matches are not found within this time limit, a partial, suboptimal solution is given. + +\code{approximate} is a scalar that determines the method of solution. If \code{approximate = 1} (the default), an approximate solution is found via a relaxation of the original integer program. This method of solution is faster than \code{approximate = 0}, but some balancing constraints may be violated to some extent. This option works only with \code{n_controls = 1}, i.e. pair matching. + +\code{round_cplex} is binary specific to \code{cplex}. \code{round_cplex = 1} ensures that the solution found is integral by rounding and all the constraints are exactly statisfied; \code{round_cplex = 0} (the default) encodes there is no rounding which may return slightly infeasible integer solutions. + +\code{trace} is a binary specific to \code{cplex} and \code{gurobi}. \code{trace = 1} turns the optimizer output on. The default is \code{trace = 0}. +} +} + +\value{ + A list containing the optimal solution, with the following objects: + + \item{obj_total}{value of the objective function at the optimum;} + + \item{obj_dist_mat}{value of the total sum of distances term of the objective function at the optimum;} + + \item{t_id}{indexes of the matched controls at the optimum;} + + \item{c_id}{indexes of the matched controls at the optimum;} + + \item{group_id}{matched pairs or groups at the optimum;} + + \item{time}{time elapsed to find the optimal solution.} +} + +\references{ + Bennett, M., Vielma, J. P., and Zubizarreta, J. R. (2017), "Building a Representative Matched Sample with Treatment Doses," working paper. + + Visconti, G., and Zubizarreta, J. R. (2017), "Finding the Largest Matched Sample that is Balanced by Design: A Case Study of the Effect of an Earthquake on Electoral Outcomes," working paper. + + Zubizarreta, J. R. (2012), "Using Mixed Integer Programming for Matching in an Observational Study of Kidney Failure after Surgery," \emph{Journal of the American Statistical Association}, 107, 1360-1371. + + Zubizarreta, J. R., Paredes, R. D., and Rosenbaum, P. R. (2014), "Matching for Balance, Pairing for Heterogeneity in an Observational Study of the Effectiveness of For-profit and Not-for-profit High Schools in Chile," \emph{Annals of Applied Statistics}, 8, 204-231. +} + +\author{ + Jose R. Zubizarreta , Cinar Kilcioglu , Juan P. Vielma . +} + +\seealso{ + \pkg{sensitivitymv}, \pkg{sensitivitymw}. +} + +\examples{ + +# Load and attach data +data(lalonde) +attach(lalonde) + +################################# +# Step 1: use cardinality matching to find the largest sample of matched pairs for which +# all the covariates are finely balanced. +################################# + +# Discretize covariates +quantiles = function(covar, n_q) { + p_q = seq(0, 1, 1/n_q) + val_q = quantile(covar, probs = p_q, na.rm = TRUE) + covar_out = rep(NA, length(covar)) + for (i in 1:n_q) { + if (i==1) {covar_out[covar1 & i=val_q[i] & covar=val_q[i] & covar<=val_q[i+1]] = i}} + covar_out +} +age_5 = quantiles(age, 5) +education_5 = quantiles(education, 5) +re74_5 = quantiles(re74, 5) +re75_5 = quantiles(re75, 5) + +# Treatment indicator +t_ind = treatment + +# Fine balance +fine_covs = cbind(black, hispanic, married, nodegree, age_5, education_5, re74_5, re75_5) +fine = list(covs = fine_covs) + +# Solver options +t_max = 60*5 +solver = "glpk" +approximate = 0 +solver = list(name = solver, t_max = t_max, approximate = approximate, +round_cplex = 0, trace = 0) + +# Match +out_1 = cardmatch(t_ind, fine = fine, solver = solver) + +# Indices of the treated units and matched controls +t_id_1 = out_1$t_id +c_id_1 = out_1$c_id + +# Mean balance +covs = cbind(age, education, black, hispanic, married, nodegree, re74, re75) +meantab(covs, t_ind, t_id_1, c_id_1) + +# Fine balance (note here we are getting an approximate solution) +for (i in 1:ncol(fine_covs)) { + print(finetab(fine_covs[, i], t_id_1, c_id_1)) +} + +################################# +# Step 2: use optimal matching (minimum distance matching) to find the (re)pairing of +# treated and control that minimizes the total sum of covariate distances between matched +# pairs. For this, use the function 'distmatch' which is a wrapper for 'bmatch'. +################################# + +# New treatment indicator +t_ind_2 = t_ind[c(t_id_1, c_id_1)] +table(t_ind_2) + +# To build the distance matrix, the idea is to use strong predictors of the outcome +dist_mat_2 = abs(outer(re74[t_id_1], re74[c_id_1], "-")) +dim(dist_mat_2) + +# Match +out_2 = distmatch(t_ind_2, dist_mat_2, solver) + +# Indices of the treated units and matched controls +t_id_2 = t_id_1[out_2$t_id] +c_id_2 = c_id_1[out_2$c_id-length(out_2$c_id)] + +# Covariate balance is preserved... +meantab(covs, t_ind, t_id_2, c_id_2) +for (i in 1:ncol(fine_covs)) { + print(finetab(fine_covs[, i], t_id_2, c_id_2)) +} + +# ... but covariate distances are reduced +distances_step_1 = sum(diag(dist_mat_2)) +distances_step_2 = sum(diag(dist_mat_2[out_2$t_id, out_2$c_id-length(out_2$c_id)])) +distances_step_1 +distances_step_2 + +# The mean difference in outcomes is the same... +mean(re78[t_id_1]-re78[c_id_1]) +mean(re78[t_id_2]-re78[c_id_2]) + +# ... but their standard deviation is reduced +sd(re78[t_id_1]-re78[c_id_1]) +sd(re78[t_id_2]-re78[c_id_2]) + +} + +\keyword{Causal inference} +\keyword{Instrumental variable} +\keyword{Matching} +\keyword{Regression discontinuity} +\keyword{Observational study} diff --git a/man/distmatch.Rd b/man/distmatch.Rd new file mode 100644 index 0000000..d3a2421 --- /dev/null +++ b/man/distmatch.Rd @@ -0,0 +1,172 @@ +\name{distmatch} + +\alias{distmatch} + +\title{Optimal distance matching in observational studies} + +\description{ + Function for optimal distance matching in observational studies. \code{distmatch} minimizes the total sum of covariate distances between matches. \code{distmatch} is a wrapper to \code{bmatch}. +} + +\usage{ + distmatch(t_ind, dist_mat = NULL, solver = NULL) +} + +\arguments{ + +\item{t_ind}{treatment indicator: a vector of zeros and ones indicating treatment (1 = treated; 0 = control).} + +\item{dist_mat}{distance matrix: a matrix of positive distances between treated units (rows) and controls (columns). If \code{dist_mat = NULL} and \code{subset_weight = 1}, then bmatch will solve the cardinality matching problem in Zubizarreta et al. (2014).} + + +\item{solver}{ +Optimization solver parameters: a list with four objects, + +\code{solver = list(name = name, t_max = t_max, approximate = 1, round_cplex = 0,}\cr +\code{ trace_cplex = 0)}. + +\code{solver} is a string that determines the optimization solver to be used. The options are: \code{cplex}, \code{glpk}, \code{gurobi} and \code{symphony}. The default solver is \code{glpk} with \code{approximate = 1}, so that by default an approximate solution is found (see \code{approximate} below). For an exact solution, we strongly recommend using \code{cplex} or \code{gurobi} as they are much faster than the other solvers, but they do require a license (free for academics, but not for people outside universities). Between \code{cplex} and \code{gurobi}, note that installing the R interface for \code{gurobi} is much simpler. + +\code{t_max} is a scalar with the maximum time limit for finding the matches. This option is specific to \code{cplex} and \code{gurobi}. If the optimal matches are not found within this time limit, a partial, suboptimal solution is given. + +\code{approximate} is a scalar that determines the method of solution. If \code{approximate = 1} (the default), an approximate solution is found via a relaxation of the original integer program. This method of solution is faster than \code{approximate = 0}, but some balancing constraints may be violated to some extent. This option works only with \code{n_controls = 1}, i.e. pair matching. + +\code{round_cplex} is binary specific to \code{cplex}. \code{round_cplex = 1} ensures that the solution found is integral by rounding and all the constraints are exactly statisfied; \code{round_cplex = 0} (the default) encodes there is no rounding which may return slightly infeasible integer solutions. + +\code{trace} is a binary specific to \code{cplex} and \code{gurobi}. \code{trace = 1} turns the optimizer output on. The default is \code{trace = 0}. +} +} + +\value{ + A list containing the optimal solution, with the following objects: + + \item{obj_total}{value of the objective function at the optimum;} + + \item{obj_dist_mat}{value of the total sum of distances term of the objective function at the optimum;} + + \item{t_id}{indexes of the matched controls at the optimum;} + + \item{c_id}{indexes of the matched controls at the optimum;} + + \item{group_id}{matched pairs or groups at the optimum;} + + \item{time}{time elapsed to find the optimal solution.} +} + +\references{ + Rosenbaum, P. R. (2010), \emph{Design of Observational Studies}, Springer. +} + +\author{ + Jose R. Zubizarreta , Cinar Kilcioglu . +} + +\seealso{ + \pkg{sensitivitymv}, \pkg{sensitivitymw}. +} + +\examples{ + +# Load and attach data +data(lalonde) +attach(lalonde) + +################################# +# Step 1: use cardinality matching to find the largest sample of matched pairs for which +# all the covariates are finely balanced. +################################# + +# Discretize covariates +quantiles = function(covar, n_q) { + p_q = seq(0, 1, 1/n_q) + val_q = quantile(covar, probs = p_q, na.rm = TRUE) + covar_out = rep(NA, length(covar)) + for (i in 1:n_q) { + if (i==1) {covar_out[covar1 & i=val_q[i] & covar=val_q[i] & covar<=val_q[i+1]] = i}} + covar_out +} +age_5 = quantiles(age, 5) +education_5 = quantiles(education, 5) +re74_5 = quantiles(re74, 5) +re75_5 = quantiles(re75, 5) + +# Treatment indicator +t_ind = treatment + +# Fine balance +fine_covs = cbind(black, hispanic, married, nodegree, age_5, education_5, re74_5, re75_5) +fine = list(covs = fine_covs) + +# Solver options +t_max = 60*5 +solver = "glpk" +approximate = 0 +solver = list(name = solver, t_max = t_max, approximate = approximate, +round_cplex = 0, trace = 0) + +# Match +out_1 = cardmatch(t_ind, fine = fine, solver = solver) + +# Indices of the treated units and matched controls +t_id_1 = out_1$t_id +c_id_1 = out_1$c_id + +# Mean balance +covs = cbind(age, education, black, hispanic, married, nodegree, re74, re75) +meantab(covs, t_ind, t_id_1, c_id_1) + +# Fine balance (note here we are getting an approximate solution) +for (i in 1:ncol(fine_covs)) { + print(finetab(fine_covs[, i], t_id_1, c_id_1)) +} + +################################# +# Step 2: use optimal matching (minimum distance matching) to find the (re)pairing of +# treated and control that minimizes the total sum of covariate distances between matched +# pairs. For this, use the function 'distmatch' which is a wrapper for 'bmatch'. +################################# + +# New treatment indicator +t_ind_2 = t_ind[c(t_id_1, c_id_1)] +table(t_ind_2) + +# To build the distance matrix, the idea is to use strong predictors of the outcome +dist_mat_2 = abs(outer(re74[t_id_1], re74[c_id_1], "-")) +dim(dist_mat_2) + +# Match +out_2 = distmatch(t_ind_2, dist_mat_2, solver) + +# Indices of the treated units and matched controls +t_id_2 = t_id_1[out_2$t_id] +c_id_2 = c_id_1[out_2$c_id-length(out_2$c_id)] + +# Covariate balance is preserved... +meantab(covs, t_ind, t_id_2, c_id_2) +for (i in 1:ncol(fine_covs)) { + print(finetab(fine_covs[, i], t_id_2, c_id_2)) +} + +# ... but covariate distances are reduced +distances_step_1 = sum(diag(dist_mat_2)) +distances_step_2 = sum(diag(dist_mat_2[out_2$t_id, out_2$c_id-length(out_2$c_id)])) +distances_step_1 +distances_step_2 + +# The mean difference in outcomes is the same... +mean(re78[t_id_1]-re78[c_id_1]) +mean(re78[t_id_2]-re78[c_id_2]) + +# ... but their standard deviation is reduced +sd(re78[t_id_1]-re78[c_id_1]) +sd(re78[t_id_2]-re78[c_id_2]) + +} + +\keyword{Causal inference} +\keyword{Instrumental variable} +\keyword{Matching} +\keyword{Regression discontinuity} +\keyword{Observational study}