Skip to content

Commit

Permalink
More accurate RDP computation and other improvements to Accounting
Browse files Browse the repository at this point in the history
Python Accounting:
- [PLD] Add a standalone method for computing hockey stick divergence for a single `epsilon`. This is done to address the commonly occurring use case of computing `delta` for a single `epsilon`.
- [RDP] More stable loop termination criterion in fractional RDP order computation. Previously the loop sometimes terminated too early, resulting in underestimates of the RDP at some orders. Now it will run to convergence in most cases, and in case of too many iterations, it will return an RDP of inf at that order, guaranteeing that the resulting epsilon is a true upper bound.
- [Requirements] Change `attrs` version from `>=22` to `>=22,<24`.

Java:
- Change maxContributions from Integer to int in ApproximateBounds

Privacy on Beam:
- Formatting changes in mean.go & mean_test.go

Change-Id: I7322d5f5438ce5fe45670180470054664c297c3c
GitOrigin-RevId: afd7fcc5d03cce8632305943c37ba70fd9a9bb23
  • Loading branch information
Differential Privacy Team authored and miracvbasaran committed Sep 24, 2024
1 parent 156c8fb commit 6f7deb3
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 74 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ public abstract static class Params {

abstract InputType inputType();

abstract Integer maxContributions();
abstract int maxContributions();

public abstract Builder toBuilder();

Expand All @@ -378,7 +378,7 @@ public abstract static class Builder {
public abstract Builder inputType(InputType inputType);

/** The maximum number of contributions each privacy unit can make to the dataset. */
public abstract Builder maxContributions(Integer value);
public abstract Builder maxContributions(int value);

abstract Params autoBuild();

Expand Down
4 changes: 1 addition & 3 deletions privacy-on-beam/pbeam/mean.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,7 @@ func MeanPerKey(s beam.Scope, pcol PrivatePCollection, params MeanParams) beam.P

// Combine all values for <id, partition> into a slice.
// Result is PCollection<kv.Pair{ID,K},[]float64>.
combined := beam.CombinePerKey(s,
&expandFloat64ValuesCombineFn{},
converted)
combined := beam.CombinePerKey(s, &expandFloat64ValuesCombineFn{}, converted)

// Result is PCollection<ID, pairArrayFloat64>.
rekeyed := beam.ParDo(s, rekeyArrayFloat64, combined)
Expand Down
131 changes: 89 additions & 42 deletions privacy-on-beam/pbeam/mean_test.go

Large diffs are not rendered by default.

55 changes: 36 additions & 19 deletions python/dp_accounting/dp_accounting/pld/pld_pmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,36 @@
_MAX_PMF_SPARSE_SIZE = 1000


def _get_delta_for_epsilon(infinity_mass: float,
losses: Sequence[float],
probs: Sequence[float],
epsilon: float) -> float:
"""Computes the epsilon-hockey stick divergence.
Args:
infinity_mass: The probability of the infinite loss.
losses: The privacy losses, assumed to be sorted in ascending order.
probs: The probabilities corresponding to losses.
epsilon: The epsilon in the epsilon-hockey stick divergence.
Returns:
The epsilon-hockey stick divergence.
"""
# delta is inf_mass + sum_{loss} max(0, 1 - exp(epsilon - loss)) * prob
losses = np.asarray(losses)
probs = np.asarray(probs)
indices = losses > epsilon
return (
infinity_mass +
np.dot(-np.expm1(epsilon - losses[indices]), probs[indices])
)


def _get_delta_for_epsilon_vectorized(infinity_mass: float,
losses: Sequence[float],
probs: Sequence[float],
epsilons: Sequence[float]) -> np.ndarray:
"""Computes the epsilon-hockey stick divergence.
"""Computes the epsilon-hockey stick divergence for multiple epsilons.
Args:
infinity_mass: the probability of the infinite loss.
Expand Down Expand Up @@ -346,15 +371,11 @@ def get_delta_for_epsilon(
"""Computes the epsilon-hockey stick divergence."""
losses = (np.arange(self.size) + self._lower_loss) * self._discretization

is_scalar = isinstance(epsilon, numbers.Number)
if is_scalar:
epsilon = [epsilon]

delta = _get_delta_for_epsilon_vectorized(self._infinity_mass, losses,
self._probs, epsilon)
if is_scalar:
delta = delta[0]
return delta
if isinstance(epsilon, numbers.Number):
return _get_delta_for_epsilon(self._infinity_mass, losses,
self._probs, epsilon)
return _get_delta_for_epsilon_vectorized(self._infinity_mass, losses,
self._probs, epsilon)

def get_epsilon_for_delta(self, delta: float) -> float:
"""Computes epsilon for which hockey stick divergence is at most delta."""
Expand Down Expand Up @@ -499,15 +520,11 @@ def get_delta_for_epsilon(
self, epsilon: Union[float, Sequence[float]]) -> Union[float, np.ndarray]:
"""Computes the epsilon-hockey stick divergence."""
losses, probs = self._get_losses_probs()
is_scalar = isinstance(epsilon, numbers.Number)
if is_scalar:
epsilon = [epsilon]

delta = _get_delta_for_epsilon_vectorized(self._infinity_mass, losses,
probs, epsilon)
if is_scalar:
delta = delta[0]
return delta
if isinstance(epsilon, numbers.Number):
return _get_delta_for_epsilon(self._infinity_mass, losses, probs, epsilon)

return _get_delta_for_epsilon_vectorized(self._infinity_mass, losses,
probs, epsilon)

def get_epsilon_for_delta(self, delta: float) -> float:
"""Computes epsilon for which hockey stick divergence is at most delta."""
Expand Down
38 changes: 31 additions & 7 deletions python/dp_accounting/dp_accounting/rdp/rdp_privacy_accountant.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,21 @@ def _compute_log_a_int(q: float, sigma: float, alpha: int) -> float:
return log_a


_MAX_STEPS_LOG_A_FRAC = 1000


def _compute_log_a_frac(q: float, sigma: float, alpha: float) -> float:
"""Computes log(A_alpha) for fractional alpha, 0 < q < 1."""
# Computation derived in Sec 3.3 of https://arxiv.org/pdf/1908.10530.
# The two parts of A_alpha, integrals over (-inf,z0] and [z0, +inf), are
# initialized to 0 in the log space:
log_a0, log_a1 = -np.inf, -np.inf
z0 = sigma**2 * math.log(1 / q - 1) + .5
log1mq = math.log1p(-q)

i = 0
while True: # do ... until loop
last_s0 = last_s1 = -np.inf

for i in range(_MAX_STEPS_LOG_A_FRAC):
log_coef = _log_comb(alpha, i)
j = alpha - i

Expand All @@ -100,11 +105,30 @@ def _compute_log_a_frac(q: float, sigma: float, alpha: float) -> float:
log_a0 = _log_add(log_a0, log_s0)
log_a1 = _log_add(log_a1, log_s1)

i += 1
if max(log_s0, log_s1) < -30:
break

return _log_add(log_a0, log_a1)
total = _log_add(log_a0, log_a1)

# Terminate when both s0 and s1 are decreasing and sufficiently small
# relative to total.
if (
log_s0 < last_s0
and log_s1 < last_s1
and max(log_s0, log_s1) < total - 30
):
return total

last_s0 = log_s0
last_s1 = log_s1

logging.warning(
'_compute_log_a_frac failed to converge after %d iterations with q=%f'
', sigma=%f, alpha=%f. Excluding this order from the epsilon '
'computation.',
_MAX_STEPS_LOG_A_FRAC,
q,
sigma,
alpha,
)
return np.inf


def _log_erfc(x: float) -> float:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,22 @@ def test_repeat_and_select_gaussian_poisson(self, sigma, mean):
lb = min(rdp[j] for j in range(len(orders)) if orders[j] >= order)
self.assertLessEqual(lb, accountant_rdp)

def test_log_a_frac_positive(self):
  # Regression test: these (q, sigma, alpha) combinations previously
  # produced a negative log_a_frac.
  orders = np.linspace(58.5, 59.5, num=21)
  for alpha in orders:
    log_a = rdp_privacy_accountant._compute_log_a_frac(0.4, 12, alpha)
    self.assertGreater(log_a, 0)

def test_log_a_frac_early_termination(self):
  # This event is known not to converge at small orders; composing it should
  # emit a convergence warning and record an RDP of inf at those orders.
  gaussian = dp_event.GaussianDpEvent(1.0)
  sampled = dp_event.PoissonSampledDpEvent(0.1, gaussian)
  accountant = rdp_privacy_accountant.RdpAccountant()
  with self.assertLogs(level='WARNING') as log:
    accountant.compose(sampled)
  warnings = [msg for msg in log.output if 'failed to converge' in msg]
  self.assertNotEmpty(warnings)
  self.assertIn(np.inf, accountant._rdp)


if __name__ == '__main__':
absltest.main()
2 changes: 1 addition & 1 deletion python/dp_accounting/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# the dependencies from `../learning/requirements.txt`.

absl-py~=1.0
attrs>=22
attrs>=22,<24
dm-tree~=0.1.8
mpmath~=1.2
numpy~=1.21
Expand Down

0 comments on commit 6f7deb3

Please sign in to comment.