Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP - James to look at fix for empty partitions in bootstrap replication (MF bootstrap mechanism) #68

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.Rproj.user
*.Rproj
.Rhistory
.RData
*.Ruserdata
Expand All @@ -10,3 +10,4 @@ vignettes/*.pdf
*.aux
*.log
*.synctex.gz
.Rproj.user
75 changes: 59 additions & 16 deletions R/mechanism-bootstrap.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,68 @@
#' @param sensitivity Sensitivity of the function
#' @param epsilon Numeric differential privacy parameter
#' @param fun Function to evaluate
#' @param inputObject the Bootstrap mechanism object on which the input function will be evaluated
#' @return Value of the function applied to one bootstrap sample
#' @import stats
#' @export
# There are 2 options for handling empty partitions:

# 1: skip it entirely, and say the total number of partitions is just the number of partitions that are not empty

# Evaluate `fun` on one bootstrap sample of `x` and add Gaussian noise.
#
# The bootstrap sample is represented by a multinomial draw giving how many
# times each of the `n` observations appears. Observations are grouped by
# appearance count; for each count that actually occurs, the statistic is
# evaluated on that group via `inputObject$bootStatEval`, scaled by the count,
# and perturbed with noise from `dpNoise`. The per-group results are summed.
#
# Arguments:
#   x            data vector to resample
#   n            number of observations (size of the multinomial draw)
#   sensitivity  sensitivity of the function
#   epsilon      numeric differential privacy parameter
#   fun          function forwarded to `inputObject$bootStatEval`
#   inputObject  object exposing `bootStatEval(xi, fun, ...)`
#   ...          further arguments forwarded to `bootStatEval`
# Returns: column sums of the noisy per-group statistics.
bootstrapReplication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) {
    # Number of times each observation appears in this bootstrap sample.
    partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n))
    counts <- partition[, 1]

    # Not every count in 1:max(counts) is guaranteed to occur, so loop only over
    # the counts that do. The 0 group (observations left out of the sample) is
    # removed by value, not by position, because it is not guaranteed to exist.
    validPartitions <- sort(unique(counts[counts > 0]))

    # Binomial(n, 1/n) probability of each occurring count. It is evaluated at
    # the count itself so that probs[i] lines up with validPartitions[i] even
    # when the occurring counts are not contiguous (e.g. 1, 2, 3, 5).
    probs <- dbinom(validPartitions, size=n, prob=(1 / n))

    statPartitions <- vector('list', length(validPartitions))
    for (i in seq_along(validPartitions)) {
        currentPartition <- validPartitions[i]
        currentVariance <- (currentPartition * probs[i] * (sensitivity^2)) / (2 * epsilon)
        currentStat <- inputObject$bootStatEval(x[counts == currentPartition], fun, ...)
        currentNoise <- dpNoise(n=length(currentStat), scale=sqrt(currentVariance), dist='gaussian')
        statPartitions[[i]] <- currentPartition * currentStat + currentNoise
    }

    # One row per occurring count; sum down the rows for the final release.
    statOut <- do.call(rbind, statPartitions)
    return(apply(statOut, 2, sum))
}

# 2: treat it as a partition with a statistic of value 0 and keep it in the calculation, adding noise and adding it to the final calculation

# bootstrapReplication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) {
# partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n))
# # make a sorted vector of the partitions of the data
# # because it is not guaranteed that every partition from 1:max.appearances will have a value in it
# validPartitions <- validPartitions <- sort(unique(partition[,1]))
# # print the unique values of the partition, to track which entries may result in NaN
# print(validPartitions)
# max.appearances <- max(partition)
# probs <- sapply(1:max.appearances, dbinom, size=n, prob=(1 / n))
# stat.partitions <- vector('list', max.appearances)
# for (i in 1:max.appearances) {
# variance.i <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon)
# if (i %in% validPartitions) {
# currentPartition <- validPartitions[i]
# stat.i <- inputObject$bootStatEval(x[partition == currentPartition], fun, ...)
# noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian')
# stat.partitions[[i]] <- i * stat.i + noise.i
# } else {
# stat.i <- 0
# noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian')
# stat.partitions[[i]] <- i * stat.i + noise.i
# }
# }
# stat.out <- do.call(rbind, stat.partitions)
# return(apply(stat.out, 2, mean))
# }


#' Bootstrap mechanism
#'
Expand All @@ -39,9 +82,9 @@ mechanismBootstrap <- setRefClass(
)

# Evaluate the bootstrap statistic on one partition of the data.
#
# Arguments:
#   xi   the subset of the data belonging to the current partition
#   fun  function whose extra arguments are looked up with getFuncArgs
#   ...  additional named inputs offered to getFuncArgs
# Returns: the value of the statistic on `xi`.
mechanismBootstrap$methods(
    bootStatEval = function(xi, fun, ...) {
        funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
        # Pass the partition `xi` as the data argument, not the full data.
        inputVals <- c(list(x=xi), funArgs)
        # NOTE(review): arguments are collected for `fun` but the call goes to
        # `bootFun`, which is not defined in this block -- presumably a field or
        # function declared elsewhere; confirm it is meant to differ from `fun`.
        stat <- do.call(bootFun, inputVals)
        return(stat)
})
Expand All @@ -58,11 +101,11 @@ mechanismBootstrap$methods(
})

mechanismBootstrap$methods(
evaluate = function(fun, x, sens, postFun) {
x <- censorData(x, .self$varType, .self$rng)
evaluate = function(fun, x, sens, postFun, ...) {
x <- censorData(x, .self$varType, .self$rng, rngFormat=.self$rngFormat)
x <- fillMissing(x, .self$varType, .self$imputeRng[0], .self$imputeRng[1])
epsilonPart <- epsilon / .self$nBoot
release <- replicate(.self$nBoot, bootstrapReplication(x, n, sens, epsilonPart, fun=.self$bootStatEval))
release <- replicate(.self$nBoot, bootstrapReplication(x, .self$n, sens, epsilonPart, fun, .self))
stdError <- .self$bootSE(release, .self$nBoot, sens)
out <- list('release' = release, 'stdError' = stdError)
out <- postFun(out)
Expand Down
5 changes: 2 additions & 3 deletions R/mechanism-laplace.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ mechanismLaplace$methods(
#' Differentially private evaluation of input function "fun" with sensitivity "sens" on input data
#' "x" using the Laplace mechanism.
#'
#' @name Laplace Mechanism
#' @references C. Dwork, A. Roth The Algorithmic Foundations of Differential Privacy, Chapter 3.3 The Laplace Mechanism p.30-37. August 2014.
#'
#' @param fun function of input x to add Laplace noise to.
Expand Down Expand Up @@ -59,8 +58,8 @@ mechanismLaplace$methods(
evaluate = function(fun, x, sens, postFun, ...) {
x <- censorData(x, .self$varType, .self$rng, .self$bins, .self$rngFormat)
x <- fillMissing(x, .self$varType, imputeRng=.self$rng, categories=.self$imputeBins)
fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
inputVals = c(list(x=x), fun.args)
funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self)
inputVals = c(list(x=x), funArgs)
trueVal <- do.call(fun, inputVals) # Concern: are we confident that the environment this is happening in is getting erased.
scale <- sens / .self$epsilon
release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='laplace')
Expand Down
8 changes: 7 additions & 1 deletion R/statistic-mean.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,13 @@ dpMean$methods(
#' the \code{dpMean$release} function.
release = function(data, ...) {
    # Extract the column this statistic is computed on.
    x <- data[, variable]
    if (mechanism == 'mechanismBootstrap') {
        # The bootstrap mechanism is given bootMean instead of mean.
        .self$result <- export(mechanism)$evaluate(bootMean, x, .self$sens, .self$postProcess, .self$n)
    } else {
        # Default path (includes 'mechanismLaplace'): same call as before the
        # bootstrap branch was added, so no mechanism is silently skipped and
        # left without a result.
        .self$result <- export(mechanism)$evaluate(mean, x, .self$sens, .self$postProcess, ...)
    }
})

dpMean$methods(
Expand Down
55 changes: 0 additions & 55 deletions man/Laplace-Mechanism.Rd

This file was deleted.

4 changes: 3 additions & 1 deletion man/bootstrapReplication.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/dLap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions man/pLap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions man/qLap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions man/rLap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions tests/testthat/test-bootstrap.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
context("bootstrap")

# Regression test for the bootstrap mechanism: the release must come back as
# numeric values with no NaN entries (empty partitions used to produce NaN).
test_that('bootstrap mean release is numeric and contains no NaN values', {
    data(PUMS5extract10000, package = "PSIlence")

    n.boot <- 25
    boot_mean <- dpMean$new(mechanism='mechanismBootstrap', varType='numeric',
                            variable='income', n=10000, epsilon=0.1, rng=c(0, 750000),
                            n.boot=n.boot)
    boot_mean$release(PUMS5extract10000)

    release <- unlist(boot_mean$result$release)
    expect_true(is.numeric(release))
    expect_true(length(release) > 0)
    expect_false(anyNA(release))
    # TODO(review): the original test title also reported results that were
    # "way too big"; add a magnitude expectation once the intended scale of the
    # release is agreed.
})
2 changes: 1 addition & 1 deletion vignettes/dp-mean.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Syntax

```{r, eval = FALSE}
x1 <- c(3, 12, 20, 42, 33, 65, 70, 54, 33, 45)
x2 <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE)
x2 <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE)
data <- data.frame(x1, x2)

dpMeanExample <- dpMean$new(mechanism='mechanismLaplace', varType='numeric',
Expand Down