Fix bug in Issue #49

Also fix a bug that becomes apparent after fixing the original bug: Not all partitions in the bootstrap replication are necessarily be filled, yielding NaN values when the statistic is calculated. Add data validation to ensure only calculating statistic on partitions that contain values.
privacytoolsproject · Jun 7, 2019 · bfc6b85 · bfc6b85
1 parent a94cd77
commit bfc6b85
Showing 1 changed file with 23 additions and 14 deletions.
diff --git a/R/mechanism-bootstrap.R b/R/mechanism-bootstrap.R
@@ -5,20 +5,29 @@
 #' @param sensitivity Sensitivity of the function
 #' @param epsilon Numeric differential privacy parameter
 #' @param fun Function to evaluate
+#' @param inputObject the Bootstrap mechanism object on which the input function will be evaluated
 #' @return Value of the function applied to one bootstrap sample
 #' @import stats
 #' @export
 
-bootstrap.replication <- function(x, n, sensitivity, epsilon, fun) {
+bootstrap.replication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) {
     partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n))
-    max.appearances <- max(partition)
-    probs <- sapply(1:max.appearances, dbinom, size=n, prob=(1 / n))
-    stat.partitions <- vector('list', max.appearances)
-    for (i in 1:max.appearances) {
-        variance.i <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon)
-        stat.i <- fun(x[partition == i])
-        noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian')
-        stat.partitions[[i]] <- i * stat.i + noise.i
+    # make a sorted vector of the partitions of the data
+    # because it is not guaranteed that every partition from 1:max.appearances will have a value in it
+    # so we need to loop through only the partitions that have data
+    validPartitions <- sort(unique(partition[,1]))
+    # we do not want the 0 partition, so we remove it from the list
+    validPartitions <- validPartitions[2:length(validPartitions)]
+    # print the unique values of the partition, to track which entries may result in NaN
+    print(validPartitions)
+    probs <- sapply(1:length(validPartitions), dbinom, size=n, prob=(1 / n))
+    stat.partitions <- vector('list', length(validPartitions))
+    for (i in 1:length(validPartitions)) {
+        currentPartition <- validPartitions[i]
+        variance.currentPartition <- (currentPartition * probs[i] * (sensitivity^2)) / (2 * epsilon)
+        stat.currentPartition <- inputObject$bootStatEval(x[partition == currentPartition], fun, ...)
+        noise.currentPartition <- dpNoise(n=length(stat.currentPartition), scale=sqrt(variance.currentPartition), dist='gaussian')
+        stat.partitions[[i]] <- currentPartition * stat.currentPartition + noise.currentPartition
     }
     stat.out <- do.call(rbind, stat.partitions)
     return(apply(stat.out, 2, sum))
@@ -39,10 +48,10 @@ mechanismBootstrap <- setRefClass(
 )
 
 mechanismBootstrap$methods(
-    bootStatEval = function(xi) {
+    bootStatEval = function(xi, fun, ...) {
         fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
-        input.vals = c(list(x=x), fun.args)
-        stat <- do.call(boot.fun, input.vals)
+        input.vals = c(list(x=xi), fun.args)
+        stat <- do.call(fun, input.vals)
         return(stat)
 })
 
@@ -58,11 +67,11 @@ mechanismBootstrap$methods(
 })
 
 mechanismBootstrap$methods(
-    evaluate = function(fun, x, sens, postFun) {
+    evaluate = function(fun, x, sens, postFun, ...) {
         x <- censordata(x, .self$var.type, .self$rng)
         x <- fillMissing(x, .self$var.type, .self$impute.rng[0], .self$impute.rng[1])
         epsilon.part <- epsilon / .self$n.boot
-        release <- replicate(.self$n.boot, bootstrap.replication(x, n, sens, epsilon.part, fun=.self$bootStatEval))
+        release <- replicate(.self$n.boot, bootstrap.replication(x, n, sens, epsilon.part, fun=fun, inputObject = .self, ...))
         std.error <- .self$bootSE(release, .self$n.boot, sens)
         out <- list('release' = release, 'std.error' = std.error)
         out <- postFun(out)