From bd6d07cd0f7dc9ae029182de6bba331e85628235 Mon Sep 17 00:00:00 2001 From: Differential Privacy Team Date: Thu, 22 Feb 2024 09:20:46 -0800 Subject: [PATCH] Add dataset generators in DP Auditorium DP Auditorium: - Add dataset generator for classifications tasks - Add dataset generator for Pipeline DP tests - Remove duplicated `VizierGeneratorConfig` Change-Id: Id7e54f8c9c248192f7da035afa86b9ac2f1ecec0 GitOrigin-RevId: 8865b02e81bd6141c9511a38cd5145a598fb46ca --- .../dp_auditorium/configs/BUILD.bazel | 1 + .../configs/dataset_generator_config.py | 42 ++++- .../dp_auditorium/generators/BUILD.bazel | 49 ++++++ .../dp_auditorium/generators/__init__.py | 2 + .../classification_dataset_generator.py | 166 ++++++++++++++++++ .../classification_dataset_generator_test.py | 83 +++++++++ .../pipeline_dp_vizier_dataset_generator.py | 84 +++++++++ ...peline_dp_vizier_dataset_generator_test.py | 85 +++++++++ .../generators/vizier_dataset_generator.py | 40 ++--- .../vizier_dataset_generator_test.py | 4 +- 10 files changed, 524 insertions(+), 32 deletions(-) create mode 100644 python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator.py create mode 100644 python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator_test.py create mode 100644 python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator.py create mode 100644 python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator_test.py diff --git a/python/dp_auditorium/dp_auditorium/configs/BUILD.bazel b/python/dp_auditorium/dp_auditorium/configs/BUILD.bazel index d92abda3..0813d2b4 100644 --- a/python/dp_auditorium/dp_auditorium/configs/BUILD.bazel +++ b/python/dp_auditorium/dp_auditorium/configs/BUILD.bazel @@ -30,4 +30,5 @@ py_library( "privacy_test_runner_config.py", "property_tester_config.py", ], + deps = [requirement("google-vizier")], ) diff --git a/python/dp_auditorium/dp_auditorium/configs/dataset_generator_config.py 
b/python/dp_auditorium/dp_auditorium/configs/dataset_generator_config.py index 92d5e941..677e3eb4 100644 --- a/python/dp_auditorium/dp_auditorium/configs/dataset_generator_config.py +++ b/python/dp_auditorium/dp_auditorium/configs/dataset_generator_config.py @@ -15,6 +15,8 @@ import dataclasses import enum +from vizier.service import pyvizier as vz + class DataType(enum.Enum): @@ -50,5 +52,43 @@ class VizierDatasetGeneratorConfig: data_type: DataType min_value: float max_value: float - search_algorithm: str + search_algorithm: vz.Algorithm + metric_name: str + + +@dataclasses.dataclass(frozen=True) +class ClassificationDatasetGeneratorConfig: + """Configuration for classification dataset generators supported by Vizier. + + Configuration to generate pairs of add/remove-neighboring datasets, where each + record has the form `(features, label)`. `features` is a float array and label + an integer indicating a class. + + Attributes: + study_name: String passed to Vizier to identify the study. + study_owner: String determining the owner of the study. A Vizier client + organizes studies by `owner` and `study_id`. For details see the class + `Study` in `vizier/_src/service/clients.py` + (https://github.com/google/vizier/tree/main). + num_vizier_parameters: Number of parameters created by vizier. + data_type: Data type of the Vizier parameters. + min_value: Minimum value of the Vizier parameters. + max_value: Maximum value of the Vizier parameters. + search_algorithm: Search algorithm used by Vizier to generate parameters. + See `vizier/_src/pyvizier/oss/study_config.py` for possible values. + metric_name: User-specified key, which is passed to Vizier for the purpose of + storing metric values to be optimized. For instance, in a test using the + `RenyiPropertyTester`, this can be denoted as `renyi_divergence`. + sample_dim: dimension of the feature space. + num_samples: Number of samples in the largest dataset. + num_classes: Number of classes in the classification task. 
+ """ + study_name: str + study_owner: str + min_value: float + max_value: float + search_algorithm: vz.Algorithm metric_name: str + sample_dim: int + num_samples: int + num_classes: int diff --git a/python/dp_auditorium/dp_auditorium/generators/BUILD.bazel b/python/dp_auditorium/dp_auditorium/generators/BUILD.bazel index dcda3946..9981ac60 100644 --- a/python/dp_auditorium/dp_auditorium/generators/BUILD.bazel +++ b/python/dp_auditorium/dp_auditorium/generators/BUILD.bazel @@ -26,6 +26,7 @@ py_library( srcs = ["__init__.py"], deps = [ ":constant_dataset_generator", + ":pipeline_dp_vizier_dataset_generator", ":vizier_dataset_generator", ], ) @@ -87,3 +88,51 @@ py_test( "//dp_auditorium:interfaces", ], ) + +py_library( + name = "pipeline_dp_vizier_dataset_generator", + srcs = ["pipeline_dp_vizier_dataset_generator.py"], + deps = [ + requirement("numpy"), + ":vizier_dataset_generator", + "//dp_auditorium:interfaces", + "//dp_auditorium/configs", + ], +) + +py_test( + name = "pipeline_dp_vizier_dataset_generator_test", + srcs = ["pipeline_dp_vizier_dataset_generator_test.py"], + deps = [ + # Needed to fix strange dependency bugs in Vizier. Order is important! 
+ requirement("absl-py"), + requirement("google-vizier"), + requirement("tensorflow"), + ":pipeline_dp_vizier_dataset_generator", + "//dp_auditorium/configs", + ], +) + +py_library( + name = "classification_dataset_generator", + srcs = ["classification_dataset_generator.py"], + deps = [ + requirement("google-vizier"), + requirement("numpy"), + ":vizier_dataset_generator", + "//dp_auditorium:interfaces", + "//dp_auditorium/configs", + ], +) + +py_test( + name = "classification_dataset_generator_test", + srcs = ["classification_dataset_generator_test.py"], + deps = [ + requirement("absl-py"), + requirement("google-vizier"), + requirement("tensorflow"), + ":classification_dataset_generator", + "//dp_auditorium/configs", + ], +) diff --git a/python/dp_auditorium/dp_auditorium/generators/__init__.py b/python/dp_auditorium/dp_auditorium/generators/__init__.py index d2947a48..03c48ffd 100644 --- a/python/dp_auditorium/dp_auditorium/generators/__init__.py +++ b/python/dp_auditorium/dp_auditorium/generators/__init__.py @@ -14,5 +14,7 @@ """Differential Privacy Data Generators.""" +from dp_auditorium.generators.classification_dataset_generator import ClassificationDatasetGenerator from dp_auditorium.generators.constant_dataset_generator import ConstantDatasetGenerator +from dp_auditorium.generators.pipeline_dp_vizier_dataset_generator import PipelineDpDatasetGenerator from dp_auditorium.generators.vizier_dataset_generator import VizierDatasetGenerator diff --git a/python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator.py b/python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator.py new file mode 100644 index 00000000..5036617e --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator.py @@ -0,0 +1,166 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""JAX-based dataset generator for binary classification problems.""" + +import numpy as np +from vizier.service import clients +from vizier.service import pyvizier as vz + +from dp_auditorium import interfaces +from dp_auditorium.configs import dataset_generator_config +from dp_auditorium.generators import vizier_dataset_generator + + +def _initialize_vizier_problem_and_params_lookup_table( + num_float_params: int, num_int_params: int, features_min_value: float, + features_max_value: float, num_classes: int +) -> tuple[vz.ProblemStatement, dict[str, int]]: + """Returns Vizier problem and parameters lookup table. + + This function initializes a Vizier problem and adds named parameters to it. + `VizierDatasetGenerator` retrieves Vizier parameters (names and values) and + organizes them into a one-dimensional array called `vizier_params`, which will + serve as the input for the function + `get_neighboring_datasets_from_vizier_params`. This function creates mappings + between parameter names and their corresponding indices in `vizier_params` + to ensure consistency between `_extract_params_from_trial` and + `get_neighboring_datasets_from_vizier_params`, and that features and labels + are correctly utilized by classification mechanisms. + + Args: + num_float_params: Number of Vizier parameters of type float. These will be + used as features. + num_int_params: Number of Vizier parameters of type int. These will be + used as labels. + features_min_value: Minimum value for feature parameters. + features_max_value: Maximum value for feature parameters. 
+ num_classes: Number of class labels. + + Returns: + A tuple where the first element is a Vizier problem and the second element a + dictionary where keys are the problem's parameter names and values are the + corresponding index in a flattened array `vizier_params`. + """ + idx_to_str = {idx: f'float{idx}' for idx in range(num_float_params)} + str_to_idx = { + name: int(name.split('float', 2)[1]) for name in idx_to_str.values() + } + idx_to_str_int = { + idx: f'int{idx}' + for idx in range(num_float_params, num_float_params + num_int_params) + } + str_to_idx_int = { + name: int(name.split('int', 2)[1]) for name in idx_to_str_int.values() + } + idx_to_str.update(idx_to_str_int) + str_to_idx.update(str_to_idx_int) + + # Define problem parameters + problem = vz.ProblemStatement() + + for i in range(num_float_params): + problem.search_space.root.add_float_param( + idx_to_str[i], + min_value=features_min_value, + max_value=features_max_value, + ) + for i in range(num_float_params, num_float_params+num_int_params): + problem.search_space.root.add_int_param( + idx_to_str[i], + min_value=0, + max_value=num_classes, + ) + return problem, str_to_idx + + +class ClassificationDatasetGenerator( + vizier_dataset_generator.VizierDatasetGenerator +): + """Classification dataset generators supported by Vizier. + + This class generates pairs of add/remove-neighboring datasets, where each + record has the form `(features, label)`. `features` is a float array and label + an integer indicating a class. Both features and labels are generated by + Vizier using the search algorithm specified in the configuration. + """ + + def __init__( + self, + config: dataset_generator_config.ClassificationDatasetGeneratorConfig, + ): + """Initializes a dataset generator where records have features and labels. + + Args: + config: A configuration proto for classification dataset generator. 
+ """ + + self._num_samples = config.num_samples + self._sample_dim = config.sample_dim + self._num_float_params = config.sample_dim * config.num_samples + self._num_int_params = config.num_samples + self._num_vizier_params = self._num_float_params + self._num_int_params + + # Get indices to parameter names mapping assigned in Vizier and its inverse. + problem, str_to_idx = _initialize_vizier_problem_and_params_lookup_table( + num_float_params=self._num_float_params, + num_int_params=self._num_int_params, + features_min_value=config.min_value, + features_max_value=config.max_value, + num_classes=config.num_classes, + ) + + self._str_to_idx = str_to_idx + + # Define metric. + self._metric_name = config.metric_name + problem.metric_information = [ + vz.MetricInformation( + name=config.metric_name, goal=vz.ObjectiveMetricGoal.MAXIMIZE + ) + ] + + study_config = vz.StudyConfig.from_problem(problem) + study_config.algorithm = config.search_algorithm + + self._study_client = clients.Study.from_study_config( + study_config, owner=config.study_owner, study_id=config.study_name + ) + + self._trial_loaded = False + self._last_trial = None + + def get_neighboring_datasets_from_vizier_params( + self, vizier_params: np.ndarray + ) -> interfaces.NeighboringDatasetsType: + """Transforms a one-dimensional numpy array to neighboring datasets. + + Generates neighboring datasets where `data = (features, labels)`. `features` + is an array with shape (self._num_samples, self._sample_dim) and labels an + array with shape (self._num_samples,). + + Args: + vizier_params: array with parameters generated by Vizier. + + Returns: + Pair `(data1, data2)` of neighboring datasets under the add/remove + definition. `data2` will have one record less than `data1`. 
+ """ + + features = vizier_params[: self._num_float_params] + features = features.reshape(self._num_samples, self._sample_dim) + labels = vizier_params[self._num_float_params :] + + data1 = features, labels + data2 = data1[0][:-1], data1[1][:-1] + return data1, data2 diff --git a/python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator_test.py b/python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator_test.py new file mode 100644 index 00000000..effbf6c3 --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/generators/classification_dataset_generator_test.py @@ -0,0 +1,83 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for classification_dataset_generator.""" +from absl.testing import absltest +import tensorflow as tf +from vizier.service import clients + +from dp_auditorium.configs import dataset_generator_config +from dp_auditorium.generators import classification_dataset_generator + + +clients.environment_variables.servicer_use_sql_ram() + + +class ClassificationDatasetGeneratorTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.sample_dim = 5 + self.num_samples = 4 + self.min_value = 13.7 + self.max_value = 15.9 + self.num_classes = 3 + + self.generator_config = ( + dataset_generator_config.ClassificationDatasetGeneratorConfig( + sample_dim=self.sample_dim, + num_samples=self.num_samples, + num_classes=self.num_classes, + min_value=self.min_value, + max_value=self.max_value, + study_name='stub_study', + study_owner='stub_owner', + metric_name='stub_metric', + search_algorithm='RANDOM_SEARCH', + ) + ) + self.generator = ( + classification_dataset_generator.ClassificationDatasetGenerator( + config=self.generator_config, + ) + ) + + def test_get_neighboring_datasets_from_vizier_params_produces_correct_pair( + self, + ): + """Tests datasets have correct shapes and are adjacent.""" + + data1, data2 = self.generator(None) + # Check output shape + with self.subTest('data1-images-have-correct-shape'): + self.assertEqual(data1[0].shape, (self.num_samples, self.sample_dim)) + with self.subTest('data1-labels-have-correct-shape'): + self.assertEqual(data1[1].shape, (self.num_samples,)) + with self.subTest('data2-images-have-correct-shape'): + self.assertEqual(data2[0].shape, (self.num_samples - 1, self.sample_dim)) + with self.subTest('data2-labels-have-correct-shape'): + self.assertEqual(data2[1].shape, (self.num_samples - 1,)) + + # Check output values range. 
+ with self.subTest('data1-labels-in-range'): + self.assertAllInRange(data1[1], 0, self.num_classes) + with self.subTest('data2-labels-in-range'): + self.assertAllInRange(data2[1], 0, self.num_classes) + with self.subTest('data1-features-in-range'): + self.assertAllInRange(data1[0], self.min_value, self.max_value) + with self.subTest('data2-features-in-range'): + self.assertAllInRange(data2[0], self.min_value, self.max_value) + + +if __name__ == '__main__': + absltest.main() diff --git a/python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator.py b/python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator.py new file mode 100644 index 00000000..06a13b01 --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator.py @@ -0,0 +1,84 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Pipeline DP dataset generator.""" + +import dataclasses +from typing import Any + +import numpy as np + +from dp_auditorium import interfaces +from dp_auditorium.configs import dataset_generator_config +from dp_auditorium.generators import vizier_dataset_generator + + +@dataclasses.dataclass +class PipelineDpDatasetGeneratorConfig: + max_num_privacy_ids: int + max_num_partitions: int + + +class PipelineDpDatasetGenerator( + vizier_dataset_generator.VizierDatasetGenerator +): + """Generates adjacent datasets from Vizier parameters.""" + + def __init__( + self, + config: dataset_generator_config.VizierDatasetGeneratorConfig, + pipeline_dp_generator_config: PipelineDpDatasetGeneratorConfig, + ) -> Any: + # Overwrites number of Vizier parameters to ensure all pairs of partition-id + # and privacy-id are populated. + config.num_vizier_parameters = ( + pipeline_dp_generator_config.max_num_privacy_ids + * pipeline_dp_generator_config.max_num_partitions + ) + super().__init__(config) + self._max_num_privacy_ids = pipeline_dp_generator_config.max_num_privacy_ids + self._max_num_partitions = pipeline_dp_generator_config.max_num_partitions + + def _generate_sparse_matrix_from_dense( + self, data: np.ndarray + ) -> list[tuple[int, int, float]]: + """Generates pd.Dataframe sparse representation of dense array `data`.""" + return [(x[0][0], x[0][1], x[1]) for x in np.ndenumerate(data)] + + def get_neighboring_datasets_from_vizier_params( + self, vizier_params: np.ndarray + ) -> interfaces.NeighboringDatasetsType: + """Transforms a one-dimensional numpy array to neighboring datasets. + + Pipeline-DP mechanisms receive input data in the form of a list of records + `[x1, x2,...]` where each record `xi` is a tuple + `xi=(partition_id, privacy_id, value)`. `vizier_params` contains values for + all pairs (partition_id, privacy_id). 
This function receives those values, + reshapes them into a dense matrix `all_data` where each row corresponds + to a privacy-id and each column to a partition-id. Finally, it generates + neighboring datasets `(data1, data2)` under the add/remove definition by + converting to sparse representation the first `max_num_privacy_ids-1` rows + of `all_data` as `data1` and `all_data` as `data2`. + + Args: + vizier_params: array with parameters generated by Vizier. + + Returns: + Pair of neighboring datasets under the add/remove definition. + """ + all_data = vizier_params.reshape( + self._max_num_privacy_ids, self._max_num_partitions + ) + data1 = self._generate_sparse_matrix_from_dense(all_data[:-1, :]) + data2 = self._generate_sparse_matrix_from_dense(all_data) + return data1, data2 diff --git a/python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator_test.py b/python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator_test.py new file mode 100644 index 00000000..3e10b885 --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/generators/pipeline_dp_vizier_dataset_generator_test.py @@ -0,0 +1,85 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for pipeline_dp_dataset_generator.""" +import time + +from absl.testing import absltest +import tensorflow as tf +from vizier.service import clients + +from dp_auditorium.configs import dataset_generator_config +from dp_auditorium.generators import pipeline_dp_vizier_dataset_generator + + +clients.environment_variables.servicer_use_sql_ram() + + +class PipelineDpDatasetGeneratorTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.num_params = 5 + self.min_value = 0.1 + self.max_value = 1.5 + self.max_num_partitions = 3 + self.max_num_privacy_ids = 2 + self.vizier_config = dataset_generator_config.VizierDatasetGeneratorConfig( + study_name='stub_study' + str(time.time()), + study_owner='owner', + num_vizier_parameters=self.num_params, + data_type=dataset_generator_config.DataType.DATA_TYPE_FLOAT, + min_value=self.min_value, + max_value=self.max_value, + search_algorithm='GRID_SEARCH', + metric_name='divergence_estimator', + ) + self.pipeline_dp_config = ( + pipeline_dp_vizier_dataset_generator.PipelineDpDatasetGeneratorConfig( + max_num_partitions=self.max_num_partitions, + max_num_privacy_ids=self.max_num_privacy_ids, + ) + ) + self.generator = ( + pipeline_dp_vizier_dataset_generator.PipelineDpDatasetGenerator( + config=self.vizier_config, + pipeline_dp_generator_config=self.pipeline_dp_config, + ) + ) + + def test_get_neighboring_datasets_from_vizier_params(self): + """Tests datasets have correct shapes and are adjacent.""" + num_entries_data1 = (self.max_num_privacy_ids - 1) * self.max_num_partitions + num_entries_data2 = self.max_num_privacy_ids * self.max_num_partitions + + data1, data2 = self.generator(None) + with self.subTest('data1-has-correct-shape'): + self.assertLen(data1, num_entries_data1) + + with self.subTest('data2-has-correct-shape'): + self.assertLen(data2, num_entries_data2) + + # data1 should contain ids 0 through `self.max_num_privacy_ids - 2` + # and data2 should contain ids 0 through `self.max_num_privacy_ids - 1` 
+ with self.subTest('data1-has-correct-ids'): + self.assertAllInRange( + [x[0] for x in data1], 0, self.max_num_privacy_ids - 2 + ) + with self.subTest('data2-has-correct-ids'): + self.assertAllInRange( + [x[0] for x in data2], 0, self.max_num_privacy_ids - 1 + ) + + +if __name__ == '__main__': + absltest.main() diff --git a/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator.py b/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator.py index d61643a5..574a90f0 100644 --- a/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator.py +++ b/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator.py @@ -14,8 +14,7 @@ """DatasetGenerator that connects with Vizier to suggest trials.""" import abc -import dataclasses -from typing import Optional, Union +from typing import Optional from absl import logging import numpy as np @@ -26,14 +25,6 @@ from dp_auditorium.configs import dataset_generator_config -@dataclasses.dataclass(frozen=True) -class VizierGeneratorConfig: - num_params: int - min_value: Union[float, int] - max_value: Union[float, int] - data_type: dataset_generator_config.DataType - - def _get_trial( study_client: clients.Study, ) -> tuple[Optional[clients.Trial], bool]: @@ -66,14 +57,14 @@ def _get_params_name_mapping( def _add_params_to_vizier_problem( vizier_problem: vz.ProblemStatement, idx_to_str: dict[int, str], - input_config: VizierGeneratorConfig, + input_config: dataset_generator_config.VizierDatasetGeneratorConfig, ) -> vz.ProblemStatement: """Adds variables to optimize be optimized to a vizier problem.""" if ( input_config.data_type == dataset_generator_config.DataType.DATA_TYPE_FLOAT ): - for i in range(input_config.num_params): + for i in range(input_config.num_vizier_parameters): vizier_problem.search_space.root.add_float_param( idx_to_str[i], min_value=input_config.min_value, @@ -83,11 +74,15 @@ def _add_params_to_vizier_problem( input_config.data_type == 
dataset_generator_config.DataType.DATA_TYPE_INT32 ): - for i in range(input_config.num_params): + for i in range(input_config.num_vizier_parameters): + logging.info( + 'When using `DATA_TYPE_INT32`, `min_value` and `max_value` of type' + ' float will be converted to `int` type.' + ) vizier_problem.search_space.root.add_int_param( idx_to_str[i], - min_value=input_config.min_value, - max_value=input_config.max_value, + min_value=int(input_config.min_value), + max_value=int(input_config.max_value), ) else: raise NotImplementedError( @@ -117,14 +112,7 @@ def __init__( Args: config: A configuration proto for Vizier dataset generator. - - Raises: - ValueError: Unsupported Vizier algorithm. """ - vizier_algorithms = [algorithm.value for algorithm in vz.Algorithm] - if config.search_algorithm not in vizier_algorithms: - raise ValueError('Unsupported algorithm: %s' % config.search_algorithm) - # Get indices to parameter names mapping assigned in Vizier and its inverse. idx_to_str, str_to_idx = _get_params_name_mapping( config.num_vizier_parameters @@ -135,12 +123,7 @@ def __init__( problem = _add_params_to_vizier_problem( vizier_problem=problem, idx_to_str=idx_to_str, - input_config=VizierGeneratorConfig( - num_params=config.num_vizier_parameters, - min_value=config.min_value, - max_value=config.max_value, - data_type=config.data_type, - ), + input_config=config, ) # Define metric. 
@@ -162,7 +145,6 @@ def __init__( self._trial_loaded = False self._last_trial = None - logging.info('Finished initializing dataset generator.') def _load_trial(self) -> None: if not self._trial_loaded: diff --git a/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator_test.py b/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator_test.py index b49a09e5..e93556e4 100644 --- a/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator_test.py +++ b/python/dp_auditorium/dp_auditorium/generators/vizier_dataset_generator_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tests for vizier_dataset_generator.""" - +import time from absl.testing import absltest import numpy as np import tensorflow as tf @@ -45,7 +45,7 @@ def setUp(self): self.min_value = 0.0 self.max_value = 1.0 self.config = dataset_generator_config.VizierDatasetGeneratorConfig( - study_name='stub_study', + study_name='stub_study'+str(time.time()), study_owner='owner', num_vizier_parameters=self.num_params, data_type=dataset_generator_config.DataType.DATA_TYPE_FLOAT,