privacytoolsproject · MeganFantes · Jun 4, 2019 · Jun 4, 2019 · Jun 4, 2019 · Jun 5, 2019
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
-.Rproj.user
+*.Rproj
 .Rhistory
 .RData
 *.Ruserdata
@@ -10,3 +10,4 @@ vignettes/*.pdf
 *.aux
 *.log
 *.synctex.gz
+.Rproj.user
diff --git a/R/mechanism-bootstrap.R b/R/mechanism-bootstrap.R
@@ -5,25 +5,68 @@
 #' @param sensitivity Sensitivity of the function
 #' @param epsilon Numeric differential privacy parameter
 #' @param fun Function to evaluate
+#' @param inputObject the Bootstrap mechanism object on which the input function will be evaluated
 #' @return Value of the function applied to one bootstrap sample
 #' @import stats
 #' @export
+# There are 2 options for handling empty partitions:
 
-bootstrapReplication <- function(x, n, sensitivity, epsilon, fun) {
+# 1: skip it entirely, and say the total number of partitions is just the number of partitions that are not empty
+
+bootstrapReplication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) {
     partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n))
-    maxAppearances <- max(partition)
-    probs <- sapply(1:maxAppearances, dbinom, size=n, prob=(1 / n))
-    statPartitions <- vector('list', maxAppearances)
-    for (i in 1:maxAppearances) {
-        iVariance <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon)
-        iStat <- fun(x[partition == i])
-        iNoise <- dpNoise(n=length(iStat), scale=sqrt(iVariance), dist='gaussian')
-        statPartitions[[i]] <- i * iStat + iNoise
+    # make a sorted vector of the partitions of the data
+    # because it is not guaranteed that every partition from 1:max.appearances will have a value in it
+    # so we need to loop through only the partitions that have data
+    validPartitions <- sort(unique(partition[,1]))
+    # we do not want the 0 partition, so we remove it from the list
+    validPartitions <- validPartitions[2:length(validPartitions)]
+    # print the unique values of the partition, to track which entries may result in NaN
+    #print(validPartitions)
+    probs <- sapply(1:length(validPartitions), dbinom, size=n, prob=(1 / n))
+    stat.partitions <- vector('list', length(validPartitions))
+    for (i in 1:length(validPartitions)) {
+        currentPartition <- validPartitions[i]
+        variance.currentPartition <- (currentPartition * probs[i] * (sensitivity^2)) / (2 * epsilon)
+        stat.currentPartition <- inputObject$bootStatEval(x[partition == currentPartition], fun, ...)
+        noise.currentPartition <- dpNoise(n=length(stat.currentPartition), scale=sqrt(variance.currentPartition), dist='gaussian')
+        stat.partitions[[i]] <- currentPartition * stat.currentPartition + noise.currentPartition
     }
-    statOut <- do.call(rbind, statPartitions)
-    return(apply(statOut, 2, sum))
+    stat.out <- do.call(rbind, stat.partitions)
+    # return(apply(stat.out, 2, sum))
+    returnedBootstrappedResult <- apply(stat.out, 2, sum)
+    return(returnedBootstrappedResult)
 }
 
+# 2: treat it as a partition with a statistic of value 0 and keep it in the calculation, adding noise and adding it to the final calculation
+
+# bootstrapReplication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) {
+#     partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n))
+#     # make a sorted vector of the partitions of the data
+#     # because it is not guaranteed that every partition from 1:max.appearances will have a value in it
+#     validPartitions <- validPartitions <- sort(unique(partition[,1]))
+#     # print the unique values of the partition, to track which entries may result in NaN
+#     print(validPartitions)
+#     max.appearances <- max(partition)
+#     probs <- sapply(1:max.appearances, dbinom, size=n, prob=(1 / n))
+#     stat.partitions <- vector('list', max.appearances)
+#     for (i in 1:max.appearances) {
+#         variance.i <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon)
+#         if (i %in% validPartitions) {
+#             currentPartition <- validPartitions[i]
+#             stat.i <- inputObject$bootStatEval(x[partition == currentPartition], fun, ...)
+#             noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian')
+#             stat.partitions[[i]] <- i * stat.i + noise.i
+#         } else {
+#             stat.i <- 0
+#             noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian')
+#             stat.partitions[[i]] <- i * stat.i + noise.i
+#         }
+#     }
+#     stat.out <- do.call(rbind, stat.partitions)
+#     return(apply(stat.out, 2, mean))
+# }
+
 
 #' Bootstrap mechanism
 #'
@@ -39,9 +82,9 @@ mechanismBootstrap <- setRefClass(
 )
 
 mechanismBootstrap$methods(
-    bootStatEval = function(xi,...) {
+    bootStatEval = function(xi, fun,...) {
         funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-        inputVals = c(list(x=x), funArgs)
+        inputVals = c(list(x=xi), funArgs)
         stat <- do.call(bootFun, inputVals)
         return(stat)
 })
@@ -58,11 +101,11 @@ mechanismBootstrap$methods(
 })
 
 mechanismBootstrap$methods(
-    evaluate = function(fun, x, sens, postFun) {
-        x <- censorData(x, .self$varType, .self$rng)
+    evaluate = function(fun, x, sens, postFun, ...) {
+        x <- censorData(x, .self$varType, .self$rng, rngFormat=.self$rngFormat)
         x <- fillMissing(x, .self$varType, .self$imputeRng[0], .self$imputeRng[1])
         epsilonPart <- epsilon / .self$nBoot
-        release <- replicate(.self$nBoot, bootstrapReplication(x, n, sens, epsilonPart, fun=.self$bootStatEval))
+        release <- replicate(.self$nBoot, bootstrapReplication(x, .self$n, sens, epsilonPart, fun, .self))
         stdError <- .self$bootSE(release, .self$nBoot, sens)
         out <- list('release' = release, 'stdError' = stdError)
         out <- postFun(out)

diff --git a/R/mechanism-laplace.R b/R/mechanism-laplace.R
@@ -17,7 +17,6 @@ mechanismLaplace$methods(
   #' Differentially private evaluation of input function "fun" with sensitivity "sens" on input data 
   #' "x" using the Laplace mechanism.
   #' 
-  #' @name Laplace Mechanism
   #' @references C. Dwork, A. Roth The Algorithmic Foundations of Differential Privacy, Chapter 3.3 The Laplace Mechanism p.30-37. August 2014.
   #'
   #' @param fun function of input x to add Laplace noise to.
@@ -59,8 +58,8 @@ mechanismLaplace$methods(
   evaluate = function(fun, x, sens, postFun, ...) {
     x <- censorData(x, .self$varType, .self$rng, .self$bins, .self$rngFormat)
     x <- fillMissing(x, .self$varType, imputeRng=.self$rng, categories=.self$imputeBins)
-    fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-    inputVals = c(list(x=x), fun.args)
+    funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
+    inputVals = c(list(x=x), funArgs)
     trueVal <- do.call(fun, inputVals)  # Concern: are we confident that the environment this is happening in is getting erased.
     scale <- sens / .self$epsilon
     release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='laplace')

diff --git a/R/statistic-mean.R b/R/statistic-mean.R
@@ -81,7 +81,13 @@ dpMean$methods(
     #' the \code{dpMean$release} function. 
     release = function(data, ...) {
         x <- data[, variable]
-        .self$result <- export(mechanism)$evaluate(mean, x, .self$sens, .self$postProcess, ...)
+        if (mechanism=='mechanismLaplace'){
+          .self$result <- export(mechanism)$evaluate(mean, x, .self$sens, .self$postProcess, ...)
+        }
+        else if (mechanism=='mechanismBootstrap'){
+          .self$result <- export(mechanism)$evaluate(bootMean, x, .self$sens, .self$postProcess, .self$n)
+        }
+
 })
 
 dpMean$methods(

diff --git a/man/Laplace-Mechanism.Rd b/man/Laplace-Mechanism.Rd
diff --git a/man/bootstrapReplication.Rd b/man/bootstrapReplication.Rd
diff --git a/man/dLap.Rd b/man/dLap.Rd
diff --git a/man/pLap.Rd b/man/pLap.Rd
diff --git a/man/qLap.Rd b/man/qLap.Rd
diff --git a/man/rLap.Rd b/man/rLap.Rd
diff --git a/tests/testthat/test-bootstrap.R b/tests/testthat/test-bootstrap.R
@@ -0,0 +1,13 @@
+context("bootstrap")
+
+test_that('bootstrap did not run, then three NaNs, now produces result that is way too big', {
+  data(PUMS5extract10000, package = "PSIlence")
+
+  n.boot <- 25
+  boot_mean <- dpMean$new(mechanism='mechanismBootstrap', varType='numeric', 
+                          variable='income', n=10000, epsilon=0.1, rng=c(0, 750000), 
+                          n.boot=n.boot)
+  boot_mean$release(PUMS5extract10000)
+  print(boot_mean$result)
+  print(mean(boot_mean$result$release))
+})
diff --git a/vignettes/dp-mean.Rmd b/vignettes/dp-mean.Rmd
@@ -37,7 +37,7 @@ Syntax
 
 ```{r, eval = FALSE}
 x1 <- c(3, 12, 20, 42, 33, 65, 70, 54, 33, 45)
-x2 <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE)
+x2 <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE)
 data <- data.frame(x1, x2)
 
 dpMeanExample <- dpMean$new(mechanism='mechanismLaplace', varType='numeric',