Skip to content

Commit

Permalink
Add dataset generators in DP Auditorium
Browse files Browse the repository at this point in the history
DP Auditorium:
- Add dataset generator for classifications tasks
- Add dataset generator for Pipeline DP tests
- Remove duplicated `VizierGeneratorConfig`

Change-Id: Id7e54f8c9c248192f7da035afa86b9ac2f1ecec0
GitOrigin-RevId: 8865b02e81bd6141c9511a38cd5145a598fb46ca
  • Loading branch information
Differential Privacy Team authored and miracvbasaran committed Feb 27, 2024
1 parent dc6456b commit bd6d07c
Show file tree
Hide file tree
Showing 10 changed files with 524 additions and 32 deletions.
1 change: 1 addition & 0 deletions python/dp_auditorium/dp_auditorium/configs/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ py_library(
"privacy_test_runner_config.py",
"property_tester_config.py",
],
deps = [requirement("google-vizier")],
)
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

import dataclasses
import enum
from vizier.service import pyvizier as vz



class DataType(enum.Enum):
Expand Down Expand Up @@ -50,5 +52,43 @@ class VizierDatasetGeneratorConfig:
data_type: DataType
min_value: float
max_value: float
search_algorithm: str
search_algorithm: vz.Algorithm
metric_name: str


@dataclasses.dataclass(frozen=True)
class ClassificationDatasetGeneratorConfig:
"""Configuration for classification dataset generators supported by Vizier.
Configuration to generate pairs of add/remove-neighboring datasets, where each
record has the form `(features, label)`. `features` is a float array and label
an integer indicating a class.
Attributes:
study_name: String passed to Vizier to identify the study.
study_owner: String determining the owner of the study. A Vizier client
organizes studies by `owner` and `study_id`. For details see the class
`Study` in `vizier/_src/service/clients.py`
(https://github.com/google/vizier/tree/main).
num_vizier_parameters: Number of parameters created by vizier.
data_type: Data type of the Vizier parameters.
min_value: Minimum value of the Vizier parameters.
max_value: Maximum value of the Vizier parameters.
search_algorithm: Search algorithm used by Vizier to generate parameters.
See `vizier/_src/pyvizier/oss/study_config.py` for possible values.
metric_name: User-specifed key, which is passed to Vizier for the purpose of
storing metric values to be optimized. For instance, in a test using the
`RenyiPropertyTester`, this can be denoted as `renyi_divergence`.
sample_dim: dimension of the feature space.
num_samples: Number of samples in the largest dataset.
num_classes: Number of classes in the classification task.
"""
study_name: str
study_owner: str
min_value: float
max_value: float
search_algorithm: vz.Algorithm
metric_name: str
sample_dim: int
num_samples: int
num_classes: int
49 changes: 49 additions & 0 deletions python/dp_auditorium/dp_auditorium/generators/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ py_library(
srcs = ["__init__.py"],
deps = [
":constant_dataset_generator",
":pipeline_dp_vizier_dataset_generator",
":vizier_dataset_generator",
],
)
Expand Down Expand Up @@ -87,3 +88,51 @@ py_test(
"//dp_auditorium:interfaces",
],
)

py_library(
name = "pipeline_dp_vizier_dataset_generator",
srcs = ["pipeline_dp_vizier_dataset_generator.py"],
deps = [
requirement("numpy"),
":vizier_dataset_generator",
"//dp_auditorium:interfaces",
"//dp_auditorium/configs",
],
)

py_test(
name = "pipeline_dp_vizier_dataset_generator_test",
srcs = ["pipeline_dp_vizier_dataset_generator_test.py"],
deps = [
# Needed to fix strange dependency bugs in Vizier. Order is important!
requirement("absl-py"),
requirement("google-vizier"),
requirement("tensorflow"),
":pipeline_dp_vizier_dataset_generator",
"//dp_auditorium/configs",
],
)

py_library(
name = "classification_dataset_generator",
srcs = ["classification_dataset_generator.py"],
deps = [
requirement("google-vizier"),
requirement("numpy"),
":vizier_dataset_generator",
"//dp_auditorium:interfaces",
"//dp_auditorium/configs",
],
)

py_test(
name = "classification_dataset_generator_test",
srcs = ["classification_dataset_generator_test.py"],
deps = [
requirement("absl-py"),
requirement("google-vizier"),
requirement("tensorflow"),
":classification_dataset_generator",
"//dp_auditorium/configs",
],
)
2 changes: 2 additions & 0 deletions python/dp_auditorium/dp_auditorium/generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,7 @@

"""Differential Privacy Data Generators."""

from dp_auditorium.generators.classification_dataset_generator import ClassificationDatasetGenerator
from dp_auditorium.generators.constant_dataset_generator import ConstantDatasetGenerator
from dp_auditorium.generators.pipeline_dp_vizier_dataset_generator import PipelineDpDatasetGenerator
from dp_auditorium.generators.vizier_dataset_generator import VizierDatasetGenerator
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Copyright 2024 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""JAX-based dataset generator for binary classification problems."""

import numpy as np
from vizier.service import clients
from vizier.service import pyvizier as vz

from dp_auditorium import interfaces
from dp_auditorium.configs import dataset_generator_config
from dp_auditorium.generators import vizier_dataset_generator


def _initialize_vizier_problem_and_params_lookup_table(
num_float_params: int, num_int_params: int, features_min_value: float,
features_max_value: float, num_classes: int
) -> tuple[vz.ProblemStatement, dict[str, int]]:
"""Returns Vizier problem and parameters lookup table.
This function initializes a Vizier problem and adds named parameters to it.
`VizierDatasetGenerator` retrieves Vizier parameters (names and values) and
organizes them into a one-dimensional array called `vizier_params`, which will
serve as the input for the function
`get_neighboring_datasets_from_vizier_params`. This function creates mappings
between parameter names and their corresponding indices in `vizier_params`
to ensure consistency between `_extract_params_from_trial` and
`get_neighboring_datasets_from_vizier_params`, and that features and labels
are correctly utilized by classification mechanisms.
Args:
num_float_params: Number of Vizier parameters of type float. These will be
used as features.
num_int_params: Number of Vizier parameters of type int. These will be
used as labels.
features_min_value: Minimum value for feature parameters.
features_max_value: Maximum value for feature parameters.
num_classes: Number of class labels.
Returns:
A tuple where the first element is a Vizier problem and the second element a
dictionary where keys are the problem's parameters names and values are the
corresponding index in a flattened array `vizier_params`.
"""
idx_to_str = {idx: f'float{idx}' for idx in range(num_float_params)}
str_to_idx = {
name: int(name.split('float', 2)[1]) for name in idx_to_str.values()
}
idx_to_str_int = {
idx: f'int{idx}'
for idx in range(num_float_params, num_float_params + num_int_params)
}
str_to_idx_int = {
name: int(name.split('int', 2)[1]) for name in idx_to_str_int.values()
}
idx_to_str.update(idx_to_str_int)
str_to_idx.update(str_to_idx_int)

# Define problem parameters
problem = vz.ProblemStatement()

for i in range(num_float_params):
problem.search_space.root.add_float_param(
idx_to_str[i],
min_value=features_min_value,
max_value=features_max_value,
)
for i in range(num_float_params, num_float_params+num_int_params):
problem.search_space.root.add_int_param(
idx_to_str[i],
min_value=0,
max_value=num_classes,
)
return problem, str_to_idx


class ClassificationDatasetGenerator(
vizier_dataset_generator.VizierDatasetGenerator
):
"""Classification dataset generators supported by Vizier.
This class generate pairs of add/remove-neighboring datasets, where each
record has the form `(features, label)`. `features` is a float array and label
an integer indicating a class. Both features and labels are generated by
Vizier using the search algorithm specified in the configuration.
"""

def __init__(
self,
config: dataset_generator_config.ClassificationDatasetGeneratorConfig,
):
"""Initializes a dataset generator where records have features and labels.
Args:
config: A configuration proto for classification dataset generator.
"""

self._num_samples = config.num_samples
self._sample_dim = config.sample_dim
self._num_float_params = config.sample_dim * config.num_samples
self._num_int_params = config.num_samples
self._num_vizier_params = self._num_float_params + self._num_int_params

# Get indices to parameter names mapping assigned in Vizier and its inverse.
problem, str_to_idx = _initialize_vizier_problem_and_params_lookup_table(
num_float_params=self._num_float_params,
num_int_params=self._num_int_params,
features_min_value=config.min_value,
features_max_value=config.max_value,
num_classes=config.num_classes,
)

self._str_to_idx = str_to_idx

# Define metric.
self._metric_name = config.metric_name
problem.metric_information = [
vz.MetricInformation(
name=config.metric_name, goal=vz.ObjectiveMetricGoal.MAXIMIZE
)
]

study_config = vz.StudyConfig.from_problem(problem)
study_config.algorithm = config.search_algorithm

self._study_client = clients.Study.from_study_config(
study_config, owner=config.study_owner, study_id=config.study_name
)

self._trial_loaded = False
self._last_trial = None

def get_neighboring_datasets_from_vizier_params(
self, vizier_params: np.ndarray
) -> interfaces.NeighboringDatasetsType:
"""Transforms a one-dimensional numpy array to neighboring datasets.
Generates neighboring datasets where `data = (features, labels)`. `features`
is an array with shape (self._num_samples, self._sample_dim) and labels an
array with shape (self._num_samples,).
Args:
vizier_params: array with parameters generated by Vizier.
Returns:
Pair `(data1, data2)` of neighboring datasets under the add/remove
definition. `data2` will have one record less than `data1`.
"""

features = vizier_params[: self._num_float_params]
features = features.reshape(self._num_samples, self._sample_dim)
labels = vizier_params[self._num_float_params :]

data1 = features, labels
data2 = data1[0][:-1], data1[1][:-1]
return data1, data2
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2024 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for classification_dataset_generator."""
from absl.testing import absltest
import tensorflow as tf
from vizier.service import clients

from dp_auditorium.configs import dataset_generator_config
from dp_auditorium.generators import classification_dataset_generator


clients.environment_variables.servicer_use_sql_ram()


class ClassificationDatasetGeneratorTest(tf.test.TestCase):

def setUp(self):
super().setUp()
self.sample_dim = 5
self.num_samples = 4
self.min_value = 13.7
self.max_value = 15.9
self.num_classes = 3

self.generator_config = (
dataset_generator_config.ClassificationDatasetGeneratorConfig(
sample_dim=self.sample_dim,
num_samples=self.num_samples,
num_classes=self.num_classes,
min_value=self.min_value,
max_value=self.max_value,
study_name='stub_study',
study_owner='stub_owner',
metric_name='stub_metric',
search_algorithm='RANDOM_SEARCH',
)
)
self.generator = (
classification_dataset_generator.ClassificationDatasetGenerator(
config=self.generator_config,
)
)

def test_get_neighboring_datasets_from_vizier_params_produces_correct_pair(
self,
):
"""Tests datasets have correct shapes and are adjacent."""

data1, data2 = self.generator(None)
# Check output shape
with self.subTest('data1-images-have-correct-shape'):
self.assertEqual(data1[0].shape, (self.num_samples, self.sample_dim))
with self.subTest('data1-labels-have-correct-shape'):
self.assertEqual(data1[1].shape, (self.num_samples,))
with self.subTest('data2-images-have-correct-shape'):
self.assertEqual(data2[0].shape, (self.num_samples - 1, self.sample_dim))
with self.subTest('data2-labels-have-correct-shape'):
self.assertEqual(data2[1].shape, (self.num_samples - 1,))

# Check output values range.
with self.subTest('data1-labels-in-range'):
self.assertAllInRange(data1[1], 0, self.num_classes)
with self.subTest('data2-labels-in-range'):
self.assertAllInRange(data2[1], 0, self.num_classes)
with self.subTest('data1-features-in-range'):
self.assertAllInRange(data1[0], self.min_value, self.max_value)
with self.subTest('data2-features-in-range'):
self.assertAllInRange(data2[0], self.min_value, self.max_value)


if __name__ == '__main__':
absltest.main()
Loading

0 comments on commit bd6d07c

Please sign in to comment.