From a8f75bb1aa876789db2bfce1eaffd4c06c894e2d Mon Sep 17 00:00:00 2001
From: saksena
Date: Wed, 15 Nov 2017 15:40:05 -0800
Subject: [PATCH 01/32] Fixing the delete cluster operations for Redshift and
 permitting spinning on the delete status while the service is asynchronously
 decommissioned. This avoids running the delete command multiple times and
 correctly tracks the delete time and status of the resource.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=175892551
---
 perfkitbenchmarker/providers/aws/redshift.py | 13 ++++++++-
 perfkitbenchmarker/resource.py               | 28 ++++++++++++++++++--
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/perfkitbenchmarker/providers/aws/redshift.py b/perfkitbenchmarker/providers/aws/redshift.py
index 3b396a492a..b87a4d9381 100644
--- a/perfkitbenchmarker/providers/aws/redshift.py
+++ b/perfkitbenchmarker/providers/aws/redshift.py
@@ -30,7 +30,8 @@

 FLAGS = flags.FLAGS

-VALID_EXIST_STATUSES = ['creating', 'available', 'deleting']
+VALID_EXIST_STATUSES = ['creating', 'available']
+DELETION_STATUSES = ['deleting']
 READY_STATUSES = ['available']

@@ -163,6 +164,7 @@ def __init__(self, edw_service_spec):
     self.db = ''
     self.user = ''
     self.password = ''
+    self.supports_wait_on_delete = True

   def _Create(self):
     """Create a new redshift cluster."""
@@ -300,6 +302,15 @@ def _Delete(self):
            '--skip-final-cluster-snapshot']
     vm_util.IssueCommand(cmd)

+  def _IsDeleting(self):
+    """Method to check if the cluster is being deleted."""
+    stdout, _, _ = self.__DescribeCluster()
+    if not stdout:
+      return False
+    else:
+      return (json.loads(stdout)['Clusters'][0]['ClusterStatus'] in
+              DELETION_STATUSES)
+
   def _DeleteDependencies(self):
     """Delete dependencies of a redshift cluster."""
     self.cluster_subnet_group._Delete()
diff --git a/perfkitbenchmarker/resource.py b/perfkitbenchmarker/resource.py
index 9b7014e38c..97583b7950 100644
--- a/perfkitbenchmarker/resource.py
+++ b/perfkitbenchmarker/resource.py
@@ -91,6 +91,20 @@ def _IsReady(self):
     """
     return True

+  def _IsDeleting(self):
+    """Return true if the underlying resource is getting deleted.
+
+    Supplying this method is optional. It is potentially useful when the
+    resource has an asynchronous deletion operation, to avoid rerunning the
+    deletion command and to track the deletion time correctly. If the
+    subclass does not implement it then it just returns false.
+
+    Returns:
+      True if the resource was being deleted, False if the resource was in a
+      non-deleting state.
+    """
+    return False
+
   def _PostCreate(self):
     """Method that will be called once after _CreateReource is called.

@@ -150,8 +164,6 @@ def _DeleteResource(self):
                           'Deletion of %s failed.' % type(self).__name__)
     except NotImplementedError:
       pass
-    self.deleted = True
-    self.delete_end_time = time.time()

   def Create(self):
     """Creates a resource and its dependencies."""
@@ -176,7 +188,19 @@ def WaitUntilReady():

   def Delete(self):
     """Deletes a resource and its dependencies."""
+
+    # Retryable method which allows waiting for deletion of the resource.
+ @vm_util.Retry(poll_interval=5, fuzz=0, timeout=3600, + retryable_exceptions=( + errors.Resource.RetryableDeletionError,)) + def WaitUntilDeleted(): + if self._IsDeleting(): + raise errors.Resource.RetryableDeletionError('Not yet deleted') + if self.user_managed: return self._DeleteResource() + WaitUntilDeleted() + self.deleted = True + self.delete_end_time = time.time() self._DeleteDependencies() From 3a2705615e1eba71b08a1797b1486ef9ff448918 Mon Sep 17 00:00:00 2001 From: dlott Date: Wed, 15 Nov 2017 16:34:59 -0800 Subject: [PATCH 02/32] Import latest changes from GitHub (https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/commit/903eb82d4e7ee5ed2ac2953cf6ce1b80459497ed) ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=175900388 --- perfkitbenchmarker/linux_benchmarks/edw_benchmark.py | 2 +- perfkitbenchmarker/providers/aws/redshift.py | 3 +-- tox.ini | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py b/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py index 7ed713bd4e..513661c42b 100644 --- a/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py @@ -43,7 +43,7 @@ vm_spec: *default_single_core """ flags.DEFINE_list('edw_benchmark_scripts', 'sample.sql', 'Comma separated ' - 'list of scripts.') + 'list of scripts.') FLAGS = flags.FLAGS diff --git a/perfkitbenchmarker/providers/aws/redshift.py b/perfkitbenchmarker/providers/aws/redshift.py index b87a4d9381..f2810ab21d 100644 --- a/perfkitbenchmarker/providers/aws/redshift.py +++ b/perfkitbenchmarker/providers/aws/redshift.py @@ -242,8 +242,7 @@ def Restore(self, snapshot_identifier, cluster_identifier): '--cluster-parameter-group-name', self.cluster_subnet_group.name, '--publicly-accessible', - '--automated-snapshot-retention-period=0' - ] + '--automated-snapshot-retention-period=0'] stdout, stderr, _ = vm_util.IssueCommand(cmd) if not stdout: raise errors.Resource.CreationError('Cluster creation failure: ' diff --git a/tox.ini b/tox.ini index 12a26bf9ef..23cf80a3b8 100644 --- a/tox.ini +++ b/tox.ini @@ -41,5 +41,5 @@ setenv = PYTHONPATH = {toxinidir}/perfkitbenchmarker/scripts:{toxinidir}/perfkitbenchmarker/scripts/object_storage_api_test_scripts:{env:PYTHONPATH:} [flake8] -ignore = E111,E129,E303 +ignore = E111,E125,E129,E303,E501 max-line-length = 80 From 3713424cc1e53a251ac5f2311c081dbdaa00678c Mon Sep 17 00:00:00 2001 From: ehankland Date: Thu, 16 Nov 2017 11:03:58 -0800 Subject: [PATCH 03/32] Delete resources before attempting to publish ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=175988443 --- perfkitbenchmarker/pkb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/perfkitbenchmarker/pkb.py b/perfkitbenchmarker/pkb.py index 9ff27625c7..ce7d7de0e0 100755 --- a/perfkitbenchmarker/pkb.py +++ b/perfkitbenchmarker/pkb.py @@ -615,10 +615,12 @@ def RunBenchmark(spec, collector): DoCleanupPhase(spec, detailed_timer) raise finally: - if FLAGS.publish_after_run: - collector.PublishSamples() + # Deleting resources should happen first so any errors with publishing + # don't prevent teardown. if stages.TEARDOWN in FLAGS.run_stage: spec.Delete() + if FLAGS.publish_after_run: + collector.PublishSamples() events.benchmark_end.send(benchmark_spec=spec) # Pickle spec to save final resource state. 
spec.Pickle() From 358714f23af620089eba950d5e8fead3c0968f92 Mon Sep 17 00:00:00 2001 From: saksena Date: Thu, 16 Nov 2017 14:28:26 -0800 Subject: [PATCH 04/32] Edw benchmark improvements to driver and sql script handling. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176019861 --- .../linux_benchmarks/edw_benchmark.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py b/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py index 513661c42b..1854a53a31 100644 --- a/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/edw_benchmark.py @@ -20,6 +20,7 @@ import copy +import os from perfkitbenchmarker import configs from perfkitbenchmarker import data @@ -59,15 +60,16 @@ def Prepare(benchmark_spec): def Run(benchmark_spec): """Run phase executes the sql scripts on edw cluster and collects duration.""" - driver_name = '{}_driver.sh'.format(benchmark_spec.edw_service.SERVICE_TYPE) - driver_path = data.ResourcePath(driver_name) - - scripts_name = '{}_sql'.format(benchmark_spec.edw_service.SERVICE_TYPE) - scripts_path = data.ResourcePath(scripts_name) - vm = benchmark_spec.vms[0] + driver_name = '{}_driver.sh'.format(benchmark_spec.edw_service.SERVICE_TYPE) + driver_path = data.ResourcePath(os.path.join('edw', driver_name)) vm.PushFile(driver_path) - vm.PushFile(scripts_path) + + scripts_dir = '{}_sql'.format(benchmark_spec.edw_service.SERVICE_TYPE) + scripts_list = FLAGS.edw_benchmark_scripts + for script in scripts_list: + script_path = data.ResourcePath(os.path.join('edw', scripts_dir, script)) + vm.PushFile(script_path) driver_perms_update_cmd = 'chmod 755 {}'.format(driver_name) vm.RemoteCommand(driver_perms_update_cmd) From a12bfe60a75d9f535e8a18aeda2837211787ccf7 Mon Sep 17 00:00:00 2001 From: Steven Deitz Date: Thu, 16 Nov 2017 15:46:28 -0800 Subject: [PATCH 05/32] Add a flag to enable AOF mode in Redis server. 
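For illustration, a sketch of a run with the new flag (the invocation is
hypothetical; 'redis' names the existing Redis benchmark, and the flag is
defined in this change):

    ./pkb.py --benchmarks=redis --redis_enable_aof=true

With the flag set, the sed edits below are intended to leave redis.conf
with the following persistence settings:

    appendonly yes
    # appendfsync everysec
    appendfsync always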
------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176033925 --- perfkitbenchmarker/linux_packages/redis_server.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/perfkitbenchmarker/linux_packages/redis_server.py b/perfkitbenchmarker/linux_packages/redis_server.py index eac06068d4..eeac8d4b50 100644 --- a/perfkitbenchmarker/linux_packages/redis_server.py +++ b/perfkitbenchmarker/linux_packages/redis_server.py @@ -22,6 +22,8 @@ flags.DEFINE_integer('redis_total_num_processes', 1, 'Total number of redis server processes.', lower_bound=1) +flags.DEFINE_boolean('redis_enable_aof', False, + 'Enable append-only file (AOF) with appendfsync always.') REDIS_VERSION = '2.8.9' @@ -59,6 +61,18 @@ def Configure(vm): sed_cmd = (r"sed -i -e '/^save /d' -e 's/# *save \"\"/save \"\"/' " "{0}/redis.conf").format(REDIS_DIR) vm.RemoteCommand(sed_cmd) + if FLAGS.redis_enable_aof: + vm.RemoteCommand( + (r'sed -i -e "s/appendonly no/appendonly yes/g" {0}/redis.conf' + ).format(REDIS_DIR)) + vm.RemoteCommand(( + r'sed -i -e "s/appendfsync everysec/# appendfsync everysec/g" ' + r'{0}/redis.conf' + ).format(REDIS_DIR)) + vm.RemoteCommand(( + r'sed -i -e "s/# appendfsync always/appendfsync always/g" ' + r'{0}/redis.conf' + ).format(REDIS_DIR)) for i in range(FLAGS.redis_total_num_processes): port = REDIS_FIRST_PORT + i vm.RemoteCommand( From f6f656ddd40ad8a413558d2142fb98f22f206002 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Thu, 16 Nov 2017 16:17:53 -0800 Subject: [PATCH 06/32] Enable support for high-availability Google CloudSQL Postgres databases (https://cloud.google.com/sql/docs/postgres/high-availability). High availability instances can be created by using the --managed_db_high_availability=true flag. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176038709 --- .../gcp/gcp_managed_relational_db.py | 35 +++- .../gcp/gcp_managed_relational_db_test.py | 175 ++++++++++-------- 2 files changed, 121 insertions(+), 89 deletions(-) diff --git a/perfkitbenchmarker/providers/gcp/gcp_managed_relational_db.py b/perfkitbenchmarker/providers/gcp/gcp_managed_relational_db.py index 75b07d0470..ad83edb78d 100644 --- a/perfkitbenchmarker/providers/gcp/gcp_managed_relational_db.py +++ b/perfkitbenchmarker/providers/gcp/gcp_managed_relational_db.py @@ -51,6 +51,10 @@ IS_READY_TIMEOUT = 600 # 10 minutes +class UnsupportedDatabaseEngineException(Exception): + pass + + class GCPManagedRelationalDb(managed_relational_db.BaseManagedRelationalDb): """A GCP CloudSQL database resource. @@ -67,10 +71,6 @@ def __init__(self, managed_relational_db_spec): self.spec = managed_relational_db_spec self.project = FLAGS.project or util.GetDefaultProject() self.instance_id = 'pkb-db-instance-' + FLAGS.run_uri - if (self.spec.engine == managed_relational_db.POSTGRES and - self.spec.high_availability): - raise Exception(('High availability configuration is not supported on ' - 'CloudSQL PostgreSQL')) def _Create(self): """Creates the Cloud SQL instance and authorizes traffic from anywhere.""" @@ -116,12 +116,8 @@ def _Create(self): cmd_string.append('--cpu={}'.format(cpus)) cmd_string.append('--memory={}MiB'.format(memory)) - # High availability only supported on MySQL. A check is done in - # __init__ to ensure that a Postgres HA configuration is not requested. 
- if (self.spec.high_availability and - self.spec.engine == managed_relational_db.MYSQL): - ha_flag = '--failover-replica-name=replica-' + self.instance_id - cmd_string.append(ha_flag) + if self.spec.high_availability: + cmd_string.append(self._GetHighAvailabilityFlag()) if self.spec.backup_enabled: cmd_string.append('--backup') @@ -134,6 +130,25 @@ def _Create(self): _, _, _ = cmd.Issue() + def _GetHighAvailabilityFlag(self): + """Returns a flag that enables high-availability for the specified engine. + + Returns: + Flag (as string) to be appended to the gcloud sql create command. + + Raises: + UnsupportedDatabaseEngineException: + if engine does not support high availability. + """ + if self.spec.engine == managed_relational_db.MYSQL: + return '--failover-replica-name=replica-' + self.instance_id + elif self.spec.engine == managed_relational_db.POSTGRES: + return '--availability-type=REGIONAL' + else: + raise UnsupportedDatabaseEngineException( + 'High availability not supported on engine {0}'.format( + self.spec.engine)) + def _ValidateSpec(self): """Validates PostgreSQL spec for CPU and memory. diff --git a/tests/providers/gcp/gcp_managed_relational_db_test.py b/tests/providers/gcp/gcp_managed_relational_db_test.py index 18eaef4604..894720b959 100644 --- a/tests/providers/gcp/gcp_managed_relational_db_test.py +++ b/tests/providers/gcp/gcp_managed_relational_db_test.py @@ -29,7 +29,6 @@ from perfkitbenchmarker.providers.gcp import gce_virtual_machine from perfkitbenchmarker.providers.gcp import util from perfkitbenchmarker import disk -from perfkitbenchmarker import data _BENCHMARK_NAME = 'name' _BENCHMARK_UID = 'benchmark_uid' @@ -37,7 +36,28 @@ _FLAGS = None -class GcpManagedRelationalDbTestCase(unittest.TestCase): +def CreateManagedDbFromSpec(spec_dict): + mock_db_spec = mock.Mock( + spec=benchmark_config_spec._ManagedRelationalDbSpec) + mock_db_spec.configure_mock(**spec_dict) + db_class = gcp_managed_relational_db.GCPManagedRelationalDb(mock_db_spec) + return db_class + + +@contextlib.contextmanager +def PatchCriticalObjects(stdout='', stderr='', return_code=0): + """A context manager that patches a few critical objects with mocks.""" + retval = (stdout, stderr, return_code) + with mock.patch(vm_util.__name__ + '.IssueCommand', + return_value=retval) as issue_command, \ + mock.patch('__builtin__.open'), \ + mock.patch(vm_util.__name__ + '.NamedTemporaryFile'), \ + mock.patch(util.__name__ + '.GetDefaultProject', + return_value='fakeproject'): + yield issue_command + + +class GcpMysqlManagedRelationalDbTestCase(unittest.TestCase): def createMySQLSpecDict(self): vm_spec = virtual_machine.BaseVmSpec('NAME', @@ -59,33 +79,6 @@ def createMySQLSpecDict(self): 'backup_start_time': '07:00', } - def createPostgresSpecDict(self): - machine_type = { - 'machine_type': {'cpus': 1, 'memory': '3840MiB'}, - 'zone': 'us-west1-b', - } - vm_spec = gce_virtual_machine.GceVmSpec('NAME', **machine_type) - disk_spec = disk.BaseDiskSpec('NAME', **{'disk_size': 50}) - return { - 'engine': POSTGRES, - 'engine_version': '5.7', - 'run_uri': '123', - 'database_name': 'fakedbname', - 'database_password': 'fakepassword', - 'vm_spec': vm_spec, - 'disk_spec': disk_spec, - 'high_availability': False, - 'backup_enabled': True, - 'backup_start_time': '07:00' - } - - def createManagedDbFromSpec(self, spec_dict): - mock_db_spec = mock.Mock( - spec=benchmark_config_spec._ManagedRelationalDbSpec) - mock_db_spec.configure_mock(**spec_dict) - db_class = gcp_managed_relational_db.GCPManagedRelationalDb(mock_db_spec) - 
return db_class - def setUp(self): flag_values = {'run_uri': '123', 'project': None} @@ -99,32 +92,20 @@ def setUp(self): self.mock_db_spec.configure_mock(**mock_db_spec_attrs) def testNoHighAvailability(self): - with self._PatchCriticalObjects() as issue_command: - db = self.createManagedDbFromSpec(self.createMySQLSpecDict()) + with PatchCriticalObjects() as issue_command: + db = CreateManagedDbFromSpec(self.createMySQLSpecDict()) db._Create() - self.assertEquals(issue_command.call_count, 1) + self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertNotIn('--failover-replica-name', command_string) self.assertNotIn('replica-pkb-db-instance-123', command_string) - @contextlib.contextmanager - def _PatchCriticalObjects(self, stdout='', stderr='', return_code=0): - """A context manager that patches a few critical objects with mocks.""" - retval = (stdout, stderr, return_code) - with mock.patch(vm_util.__name__ + '.IssueCommand', - return_value=retval) as issue_command, \ - mock.patch('__builtin__.open'), \ - mock.patch(vm_util.__name__ + '.NamedTemporaryFile'), \ - mock.patch(util.__name__ + '.GetDefaultProject', - return_value='fakeproject'): - yield issue_command - def testCreate(self): - with self._PatchCriticalObjects() as issue_command: + with PatchCriticalObjects() as issue_command: vm = gcp_managed_relational_db.GCPManagedRelationalDb(self.mock_db_spec) vm._Create() - self.assertEquals(issue_command.call_count, 1) + self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertTrue( @@ -139,26 +120,13 @@ def testCreate(self): self.assertIn('--gce-zone=us-west1-b', command_string) self.assertIn('--region=us-west1', command_string) - def testCreatePostgres(self): - with self._PatchCriticalObjects() as issue_command: - spec = self.createPostgresSpecDict() - spec['engine'] = 'postgres' - spec['engine_version'] = '9.6' - db = self.createManagedDbFromSpec(spec) - db._Create() - self.assertEquals(issue_command.call_count, 1) - command_string = ' '.join(issue_command.call_args[0][0]) - self.assertIn('database-version=POSTGRES_9_6', command_string) - self.assertIn('--cpu=1', command_string) - self.assertIn('--memory=3840MiB', command_string) - def testCreateWithBackupDisabled(self): - with self._PatchCriticalObjects() as issue_command: + with PatchCriticalObjects() as issue_command: spec = self.mock_db_spec spec.backup_enabled = False vm = gcp_managed_relational_db.GCPManagedRelationalDb(self.mock_db_spec) vm._Create() - self.assertEquals(issue_command.call_count, 1) + self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertTrue( @@ -171,10 +139,10 @@ def testCreateWithBackupDisabled(self): self.assertNotIn('--backup-start-time=07:00', command_string) def testDelete(self): - with self._PatchCriticalObjects() as issue_command: + with PatchCriticalObjects() as issue_command: vm = gcp_managed_relational_db.GCPManagedRelationalDb(self.mock_db_spec) vm._Delete() - self.assertEquals(issue_command.call_count, 1) + self.assertEqual(issue_command.call_count, 1) command_string = ' '.join(issue_command.call_args[0][0]) self.assertTrue( command_string.startswith( @@ -187,8 +155,8 @@ def testIsReady(self): with open(path) as fp: test_output = fp.read() - with self._PatchCriticalObjects(stdout=test_output): - db = self.createManagedDbFromSpec(self.createMySQLSpecDict()) + with PatchCriticalObjects(stdout=test_output): + db = 
CreateManagedDbFromSpec(self.createMySQLSpecDict())
       self.assertEqual(True, db._IsReady())

   def testExists(self):
@@ -198,17 +166,17 @@
     with open(path) as fp:
       test_output = fp.read()

-    with self._PatchCriticalObjects(stdout=test_output):
-      db = self.createManagedDbFromSpec(self.createMySQLSpecDict())
+    with PatchCriticalObjects(stdout=test_output):
+      db = CreateManagedDbFromSpec(self.createMySQLSpecDict())
       self.assertEqual(True, db._Exists())

   def testHighAvailability(self):
-    with self._PatchCriticalObjects() as issue_command:
+    with PatchCriticalObjects() as issue_command:
       spec = self.createMySQLSpecDict()
       spec['high_availability'] = True
-      db = self.createManagedDbFromSpec(spec)
+      db = CreateManagedDbFromSpec(spec)
       db._Create()
-      self.assertEquals(issue_command.call_count, 1)
+      self.assertEqual(issue_command.call_count, 1)
       command_string = ' '.join(issue_command.call_args[0][0])

       self.assertIn('--failover-replica-name', command_string)
@@ -221,22 +189,43 @@ def testParseEndpoint(self):
     with open(path) as fp:
       test_output = fp.read()

-    with self._PatchCriticalObjects():
-      db = self.createManagedDbFromSpec(self.createMySQLSpecDict())
-      self.assertEquals('', db._ParseEndpoint(None))
+    with PatchCriticalObjects():
+      db = CreateManagedDbFromSpec(self.createMySQLSpecDict())
+      self.assertEqual('', db._ParseEndpoint(None))
       self.assertIn('10.10.0.35',
                     db._ParseEndpoint(json.loads(test_output)))

+
+class GcpPostgresManagedRelationalDbTestCase(unittest.TestCase):
+
+  def createPostgresSpecDict(self):
+    machine_type = {
+        'machine_type': {'cpus': 1, 'memory': '3840MiB'},
+        'zone': 'us-west1-b',
+    }
+    vm_spec = gce_virtual_machine.GceVmSpec('NAME', **machine_type)
+    disk_spec = disk.BaseDiskSpec('NAME', **{'disk_size': 50})
+    return {
+        'engine': POSTGRES,
+        'engine_version': '5.7',
+        'run_uri': '123',
+        'database_name': 'fakedbname',
+        'database_password': 'fakepassword',
+        'vm_spec': vm_spec,
+        'disk_spec': disk_spec,
+        'high_availability': False,
+        'backup_enabled': True,
+        'backup_start_time': '07:00'
+    }
+
   def testValidateSpec(self):
-    with self._PatchCriticalObjects():
-      db_sql = self.createManagedDbFromSpec(self.createMySQLSpecDict())
-      self.assertRaises(data.ResourceNotFound, db_sql._ValidateSpec)
-      db_postgres = self.createManagedDbFromSpec(self.createPostgresSpecDict())
+    with PatchCriticalObjects():
+      db_postgres = CreateManagedDbFromSpec(self.createPostgresSpecDict())
       db_postgres._ValidateSpec()

   def testValidateMachineType(self):
-    with self._PatchCriticalObjects():
-      db = self.createManagedDbFromSpec(self.createPostgresSpecDict())
+    with PatchCriticalObjects():
+      db = CreateManagedDbFromSpec(self.createPostgresSpecDict())
       self.assertRaises(ValueError, db._ValidateMachineType, 0, 0)
       self.assertRaises(ValueError, db._ValidateMachineType, 3840, 0)
       self.assertRaises(ValueError, db._ValidateMachineType, 255, 1)
@@ -245,5 +234,33 @@ def testValidateMachineType(self):
       db._ValidateMachineType(db.spec.vm_spec.memory,
                               db.spec.vm_spec.cpus)

+  def testCreateNonHighAvailability(self):
+    with PatchCriticalObjects() as issue_command:
+      spec = self.createPostgresSpecDict()
+      spec['engine'] = 'postgres'
+      spec['engine_version'] = '9.6'
+      db = CreateManagedDbFromSpec(spec)
+      db._Create()
+      self.assertEqual(issue_command.call_count, 1)
+      command_string = ' '.join(issue_command.call_args[0][0])
+      self.assertIn('database-version=POSTGRES_9_6', command_string)
+      self.assertIn('--cpu=1', command_string)
+      self.assertIn('--memory=3840MiB', command_string)
self.assertNotIn('--availability-type=REGIONAL', command_string) + + def testCreateHighAvailability(self): + with PatchCriticalObjects() as issue_command: + spec = self.createPostgresSpecDict() + spec['high_availability'] = True + spec['engine'] = 'postgres' + spec['engine_version'] = '9.6' + db = CreateManagedDbFromSpec(spec) + db._Create() + self.assertEqual(issue_command.call_count, 1) + command_string = ' '.join(issue_command.call_args[0][0]) + + self.assertIn('--availability-type=REGIONAL', command_string) + + if __name__ == '__main__': unittest.main() From c9dd03bc628ba56c11a02b52eda6030e640e8047 Mon Sep 17 00:00:00 2001 From: saksena Date: Fri, 17 Nov 2017 12:02:00 -0800 Subject: [PATCH 07/32] Refactoring the sql execution driver script to reference sql queries in the current directory. Increasing the default timeout for resource creation to handle multi TB clusters. Adding the snapshot details to the metadata. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176141018 --- perfkitbenchmarker/data/edw/redshift_driver.sh | 2 +- perfkitbenchmarker/providers/aws/redshift.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/perfkitbenchmarker/data/edw/redshift_driver.sh b/perfkitbenchmarker/data/edw/redshift_driver.sh index fb453d9a77..8aaeefa33d 100755 --- a/perfkitbenchmarker/data/edw/redshift_driver.sh +++ b/perfkitbenchmarker/data/edw/redshift_driver.sh @@ -30,7 +30,7 @@ START_TIME=$SECONDS for REDSHIFT_SCRIPT in "${REDSHIFT_SCRIPT_LIST[@]}" do - PGPASSWORD=$REDSHIFT_PASSWORD psql -h $REDSHIFT_HOST -p 5439 -d $REDSHIFT_DB -U $REDSHIFT_USER -f redshift_sql/$REDSHIFT_SCRIPT > /dev/null & + PGPASSWORD=$REDSHIFT_PASSWORD psql -h $REDSHIFT_HOST -p 5439 -d $REDSHIFT_DB -U $REDSHIFT_USER -f $REDSHIFT_SCRIPT > /dev/null & pid=$! pids="$pids $pid" done diff --git a/perfkitbenchmarker/providers/aws/redshift.py b/perfkitbenchmarker/providers/aws/redshift.py index f2810ab21d..99a772bc5f 100644 --- a/perfkitbenchmarker/providers/aws/redshift.py +++ b/perfkitbenchmarker/providers/aws/redshift.py @@ -54,6 +54,14 @@ def AddTags(resource_arn, region, **kwargs): vm_util.IssueCommand(tag_cmd) +def GetDefaultRegion(): + """Get the default region for the aws account.""" + cmd_prefix = util.AWS_PREFIX + default_region_cmd = cmd_prefix + ['configure', 'get', 'region'] + stdout, _, _ = vm_util.IssueCommand(default_region_cmd) + return stdout + + class RedshiftClusterSubnetGroup(object): """Cluster Subnet Group associated with a Redshift cluster launched in a vpc. 
@@ -154,7 +162,9 @@ def __init__(self, edw_service_spec): if FLAGS.zones: self.zone = FLAGS.zones[0] self.region = util.GetRegionFromZone(self.zone) - self.cmd_prefix += ['--region', self.region] + else: + self.region = GetDefaultRegion() + self.cmd_prefix += ['--region', self.region] self.arn = '' self.cluster_identifier = 'pkb-' + FLAGS.run_uri self.cluster_subnet_group = None @@ -165,6 +175,7 @@ def __init__(self, edw_service_spec): self.user = '' self.password = '' self.supports_wait_on_delete = True + self.snapshot = None def _Create(self): """Create a new redshift cluster.""" @@ -173,6 +184,7 @@ def _Create(self): self.cluster_parameter_group = RedshiftClusterParameterGroup( self.concurrency, self.cmd_prefix) if self.spec.snapshot: + self.snapshot = self.spec.snapshot self.Restore(self.spec.snapshot, self.cluster_identifier) else: # TODO(saksena@): Implmement the new Redshift cluster creation @@ -319,4 +331,7 @@ def GetMetadata(self): """Return a dictionary of the metadata for this cluster.""" basic_data = super(Redshift, self).GetMetadata() basic_data['edw_cluster_concurrency'] = self.concurrency + basic_data['region'] = self.region + if self.snapshot is not None: + basic_data['snapshot'] = self.snapshot return basic_data From 3d5ced1308a4017f02888632270356d97faf61ee Mon Sep 17 00:00:00 2001 From: dlott Date: Mon, 20 Nov 2017 10:11:46 -0800 Subject: [PATCH 08/32] Linter fixes. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176377490 --- tests/aws_test.py | 48 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/tests/aws_test.py b/tests/aws_test.py index e6446cbc0f..5a9fdd8da6 100644 --- a/tests/aws_test.py +++ b/tests/aws_test.py @@ -93,7 +93,7 @@ def testVpcPresent(self): class AwsVirtualMachineTestCase(unittest.TestCase): - def openJsonData(self, filename): + def open_json_data(self, filename): path = os.path.join(os.path.dirname(__file__), 'data', filename) with open(path) as f: @@ -135,9 +135,9 @@ def setUp(self): network_mock.placement_group = placement_group self.vm.network = network_mock - self.response = self.openJsonData('aws-describe-instance.json') - self.sir_response =\ - self.openJsonData('aws-describe-spot-instance-requests.json') + self.response = self.open_json_data('aws-describe-instance.json') + self.sir_response = self.open_json_data( + 'aws-describe-spot-instance-requests.json') def testInstancePresent(self): util.IssueRetryableCommand.side_effect = [(self.response, None)] @@ -188,8 +188,8 @@ def testCreateSpot(self): def testCreateSpotLowPriceFails(self): response_low = json.loads(self.sir_response) response_low['SpotInstanceRequests'][0]['Status']['Code'] = 'price-too-low' - response_low['SpotInstanceRequests'][0]['Status']['Message'] = \ - 'Your price is too low.' 
+ response_low['SpotInstanceRequests'][0]['Status']['Message'] = ( + 'Your price is too low.') response_cancel = ( '{"CancelledSpotInstanceRequests":' '[{"State": "cancelled","SpotInstanceRequestId":"sir-2mcg43gk"}]}') @@ -199,19 +199,19 @@ def testCreateSpotLowPriceFails(self): self.assertRaises(errors.Resource.CreationError, self.vm._CreateSpot) - def testDeleteCancelsSpotInstanceRequest(self): - self.vm.spot_instance_request_id = 'sir-abc' + def testDeleteCancelsSpotInstanceRequest(self): + self.vm.spot_instance_request_id = 'sir-abc' - self.vm._Delete() + self.vm._Delete() - vm_util.IssueCommand.assert_called_with( - ['aws', - '--output', - 'json', - '--region=us-east-1', - 'ec2', - 'cancel-spot-instance-requests', - '--spot-instance-request-ids=sir-abc']) + vm_util.IssueCommand.assert_called_with( + ['aws', + '--output', + 'json', + '--region=us-east-1', + 'ec2', + 'cancel-spot-instance-requests', + '--spot-instance-request-ids=sir-abc']) class AwsIsRegionTestCase(unittest.TestCase): @@ -250,19 +250,20 @@ def setUp(self): path = os.path.join(os.path.dirname(__file__), 'data', 'describe_image_output.txt') with open(path) as fp: - self.describeImageOutput = fp.read() + self.describe_image_output = fp.read() def testInvalidMachineType(self): self.assertEqual(aws_virtual_machine.GetBlockDeviceMap('invalid'), None) def testValidMachineTypeWithNoRootVolumeSize(self): - expected = [{"DeviceName": "/dev/xvdb", - "VirtualName": "ephemeral0"}] + expected = [{'DeviceName': '/dev/xvdb', + 'VirtualName': 'ephemeral0'}] actual = json.loads(aws_virtual_machine.GetBlockDeviceMap('c1.medium')) self.assertEqual(actual, expected) def testValidMachineTypeWithSpecifiedRootVolumeSize(self): - util.IssueRetryableCommand.side_effect = [(self.describeImageOutput, None)] + util.IssueRetryableCommand.side_effect = [(self.describe_image_output, + None)] desired_root_volume_size_gb = 35 machine_type = 'c1.medium' image_id = 'ami-a9d276c9' @@ -289,10 +290,11 @@ def setUp(self): path = os.path.join(os.path.dirname(__file__), 'data', 'describe_image_output.txt') with open(path) as fp: - self.describeImageOutput = fp.read() + self.describe_image_output = fp.read() def testOk(self): - util.IssueRetryableCommand.side_effect = [(self.describeImageOutput, None)] + util.IssueRetryableCommand.side_effect = [(self.describe_image_output, + None)] image_id = 'ami-a9d276c9' region = 'us-west-2' expected = { From 69471b5f0365ef9e3f9a1d7d26805b64389cc707 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Mon, 20 Nov 2017 11:00:47 -0800 Subject: [PATCH 09/32] Changed NewlineDelimitedJSONPublisher (used by BigQueryPublisher) to sort the metadata dict by key before converting it to the 'labels' string. This will result in the 'labels' string being sorted by key in alphabetical order. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176385451 --- perfkitbenchmarker/publisher.py | 6 +++--- tests/publisher_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/perfkitbenchmarker/publisher.py b/perfkitbenchmarker/publisher.py index 0fd83a7eda..b2e5118521 100755 --- a/perfkitbenchmarker/publisher.py +++ b/perfkitbenchmarker/publisher.py @@ -132,16 +132,16 @@ def GetLabelsFromDict(metadata): - """Converts a metadata dictionary to a string of labels. + """Converts a metadata dictionary to a string of labels sorted by key. Args: metadata: a dictionary of string key value pairs. Returns: - A string of labels in the format that Perfkit uses. 
+ A string of labels, sorted by key, in the format that Perfkit uses. """ labels = [] - for k, v in metadata.iteritems(): + for k, v in sorted(metadata.iteritems()): labels.append('|%s:%s|' % (k, v)) return ','.join(labels) diff --git a/tests/publisher_test.py b/tests/publisher_test.py index 80c0475b40..b88a4fe9fa 100644 --- a/tests/publisher_test.py +++ b/tests/publisher_test.py @@ -89,7 +89,7 @@ def testMetadataConvertedToLabels(self): ('foo', 'bar')])}] self.instance.PublishSamples(samples) d = json.load(self.fp) - self.assertDictEqual({'test': 'testa', 'labels': '|key:value|,|foo:bar|'}, + self.assertDictEqual({'test': 'testa', 'labels': '|foo:bar|,|key:value|'}, d) def testJSONRecordPerLine(self): From 85e788440eb127209a26dbfbcdde80cd128f4e24 Mon Sep 17 00:00:00 2001 From: yuyanting Date: Mon, 20 Nov 2017 18:30:04 -0800 Subject: [PATCH 10/32] - Remove i3 image checking ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176449000 --- perfkitbenchmarker/providers/aws/aws_virtual_machine.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/perfkitbenchmarker/providers/aws/aws_virtual_machine.py b/perfkitbenchmarker/providers/aws/aws_virtual_machine.py index e05ce6aec4..cf5594a90c 100644 --- a/perfkitbenchmarker/providers/aws/aws_virtual_machine.py +++ b/perfkitbenchmarker/providers/aws/aws_virtual_machine.py @@ -320,10 +320,6 @@ def __init__(self, vm_spec): raise ValueError( 'In order to use dedicated hosts, you must specify an availability ' 'zone, not a region ("zone" was %s).' % self.zone) - if self.machine_type[:2].lower() == 'i3' and not self.image: - # TODO(user): Remove this check when pkb defaults to ubuntu-1604. - raise ValueError( - 'In order to use i3 instances, you must specify --image.') if self.use_spot_instance and self.spot_price <= 0.0: raise ValueError( From 2541bc95ff3c666b1765509279af3a0985baad87 Mon Sep 17 00:00:00 2001 From: dlott Date: Sun, 27 Aug 2017 16:45:52 -0700 Subject: [PATCH 11/32] Add aws_image_name_filter flag to ease specifying images. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=176580390 --- CHANGES.next.md | 1 + .../providers/aws/aws_virtual_machine.py | 56 +++++++++++-------- perfkitbenchmarker/providers/aws/flags.py | 2 + 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/CHANGES.next.md b/CHANGES.next.md index 7ea486ac21..f330cdb429 100644 --- a/CHANGES.next.md +++ b/CHANGES.next.md @@ -9,6 +9,7 @@ - Windows benchmarks can now be run from linux controllers ###Enhancements: +- Add aws_image_name_filter flag to ease specifying images. ###Bug fixes and maintenance updates: - Moved GPU-related specs from GceVmSpec to BaseVmSpec diff --git a/perfkitbenchmarker/providers/aws/aws_virtual_machine.py b/perfkitbenchmarker/providers/aws/aws_virtual_machine.py index cf5594a90c..409feb1bac 100644 --- a/perfkitbenchmarker/providers/aws/aws_virtual_machine.py +++ b/perfkitbenchmarker/providers/aws/aws_virtual_machine.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from collections import OrderedDict """Class to represent an AWS Virtual Machine object. 
@@ -21,16 +20,17 @@ import base64 import collections +from collections import OrderedDict import json import logging -import uuid import threading import time - +import uuid from perfkitbenchmarker import disk from perfkitbenchmarker import errors from perfkitbenchmarker import flags from perfkitbenchmarker import linux_virtual_machine +from perfkitbenchmarker import providers from perfkitbenchmarker import resource from perfkitbenchmarker import virtual_machine from perfkitbenchmarker import vm_util @@ -39,7 +39,6 @@ from perfkitbenchmarker.providers.aws import aws_disk from perfkitbenchmarker.providers.aws import aws_network from perfkitbenchmarker.providers.aws import util -from perfkitbenchmarker import providers FLAGS = flags.FLAGS @@ -87,7 +86,7 @@ def GetRootBlockDeviceSpecForImage(image_id, region): - """ Queries the CLI and returns the root block device specification as a dict. + """Queries the CLI and returns the root block device specification as a dict. Args: image_id: The EC2 image id to query @@ -107,8 +106,8 @@ def GetRootBlockDeviceSpecForImage(image_id, region): stdout, _ = util.IssueRetryableCommand(command) images = json.loads(stdout) assert images - assert len(images) == 1, \ - 'Expected to receive only one image description for %s' % image_id + assert len(images) == 1, ( + 'Expected to receive only one image description for %s' % image_id) image_spec = images[0] root_device_name = image_spec['RootDeviceName'] block_device_mappings = image_spec['BlockDeviceMappings'] @@ -123,9 +122,9 @@ def GetBlockDeviceMap(machine_type, root_volume_size_gb=None, Args: machine_type: The machine type to create a block device map for. - root_volume_size: The desired size of the root volume, in GiB, + root_volume_size_gb: The desired size of the root volume, in GiB, or None to the default provided by AWS. - image: The image id (AMI) to use in order to lookup the default + image_id: The image id (AMI) to use in order to lookup the default root device specs. This is only required if root_volume_size is specified. region: The region which contains the specified image. This is only @@ -136,15 +135,18 @@ def GetBlockDeviceMap(machine_type, root_volume_size_gb=None, with the AWS CLI, or if the machine type has no local disks, it will return None. If root_volume_size_gb and image_id are provided, the block device map will include the specification for the root volume. + + Raises: + ValueError: If required parameters are not passed. """ mappings = [] if root_volume_size_gb is not None: if image_id is None: raise ValueError( - "image_id must be provided if root_volume_size_gb is specified") + 'image_id must be provided if root_volume_size_gb is specified') if region is None: raise ValueError( - "region must be provided if image_id is specified") + 'region must be provided if image_id is specified') root_block_device = GetRootBlockDeviceSpecForImage(image_id, region) root_block_device['Ebs']['VolumeSize'] = root_volume_size_gb # The 'Encrypted' key must be removed or the CLI will complain @@ -158,7 +160,7 @@ def GetBlockDeviceMap(machine_type, root_volume_size_gb=None, od['VirtualName'] = 'ephemeral%s' % i od['DeviceName'] = '/dev/xvd%s' % chr(ord(DRIVE_START_LETTER) + i) mappings.append(od) - if len(mappings): + if mappings: return json.dumps(mappings) return None @@ -295,6 +297,9 @@ def __init__(self, vm_spec): Args: vm_spec: virtual_machine.BaseVirtualMachineSpec object of the vm. + + Raises: + ValueError: If an incompatible vm_spec is passed. 
""" super(AwsVirtualMachine, self).__init__(vm_spec) self.region = util.GetRegionFromZone(self.zone) @@ -340,11 +345,20 @@ def group_id(self): def _GetDefaultImage(cls, machine_type, region): """Returns the default image given the machine type and region. + If specified, aws_image_name_filter will override os_type defaults. If no default is configured, this will return None. + + Args: + machine_type: The machine_type of the VM, used to determine virtualization + type. + region: The region of the VM, as images are region specific. """ if cls.IMAGE_NAME_FILTER is None: return None + if FLAGS.aws_image_name_filter: + cls.IMAGE_NAME_FILTER = FLAGS.aws_image_name_filter + prefix = machine_type.split('.')[0] virt_type = 'paravirtual' if prefix in NON_HVM_PREFIXES else 'hvm' @@ -537,10 +551,10 @@ def _CreateSpot(self): '--client-token=%s' % self.client_token, '--launch-specification=%s' % json.dumps(launch_specification, separators=(',', ':'))] - stdout, stderr, _ = vm_util.IssueCommand(create_cmd) + stdout, _, _ = vm_util.IssueCommand(create_cmd) create_response = json.loads(stdout) - self.spot_instance_request_id =\ - create_response['SpotInstanceRequests'][0]['SpotInstanceRequestId'] + self.spot_instance_request_id = ( + create_response['SpotInstanceRequests'][0]['SpotInstanceRequestId']) util.AddDefaultTags(self.spot_instance_request_id, self.region) @@ -550,18 +564,18 @@ def _CreateSpot(self): 'ec2', 'describe-spot-instance-requests', '--spot-instance-request-ids=%s' % self.spot_instance_request_id] - stdout, stderr, _ = vm_util.IssueCommand(describe_sir_cmd) + stdout, _, _ = vm_util.IssueCommand(describe_sir_cmd) sir_response = json.loads(stdout)['SpotInstanceRequests'] assert len(sir_response) == 1, 'Expected exactly 1 SpotInstanceRequest' status_code = sir_response[0]['Status']['Code'] - if status_code in SPOT_INSTANCE_REQUEST_HOLDING_STATUSES or \ - status_code in SPOT_INSTANCE_REQUEST_TERMINAL_STATUSES: + if (status_code in SPOT_INSTANCE_REQUEST_HOLDING_STATUSES or + status_code in SPOT_INSTANCE_REQUEST_TERMINAL_STATUSES): message = sir_response[0]['Status']['Message'] raise errors.Resource.CreationError(message) - elif status_code == "fulfilled": + elif status_code == 'fulfilled': self.id = sir_response[0]['InstanceId'] break @@ -584,7 +598,6 @@ def _Delete(self): '--spot-instance-request-ids=%s' % self.spot_instance_request_id] vm_util.IssueCommand(cancel_cmd) - def _Exists(self): """Returns true if the VM exists.""" describe_cmd = util.AWS_PREFIX + [ @@ -680,7 +693,7 @@ def __init__(self, vm_spec): @vm_util.Retry() def _GetDecodedPasswordData(self): - # Retreive a base64 encoded, encrypted password for the VM. + # Retrieve a base64 encoded, encrypted password for the VM. get_password_cmd = util.AWS_PREFIX + [ 'ec2', 'get-password-data', @@ -698,7 +711,6 @@ def _GetDecodedPasswordData(self): # Decode the password data. 
     return base64.b64decode(password_data)

-
   def _PostCreate(self):
     """Retrieve generic VM info and then retrieve the VM's password."""
     super(WindowsAwsVirtualMachine, self)._PostCreate()
diff --git a/perfkitbenchmarker/providers/aws/flags.py b/perfkitbenchmarker/providers/aws/flags.py
index ebce5c62e1..20e50539f6 100644
--- a/perfkitbenchmarker/providers/aws/flags.py
+++ b/perfkitbenchmarker/providers/aws/flags.py
@@ -42,3 +42,5 @@
                     'The path to the kops binary.')
 flags.DEFINE_string('edw_redshift_cluster_version', 'redshift-1.0',
                     'The redshift version to use for the cluster.')
+flags.DEFINE_string('aws_image_name_filter', None,
+                    'The filter to use when searching for an image for a VM.')

From a373593297100f219823584c61a9947c60aab1cd Mon Sep 17 00:00:00 2001
From: ferneyhough
Date: Mon, 27 Nov 2017 14:33:54 -0800
Subject: [PATCH 12/32] Fix uninstall command in Tensorflow package

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=177072996
---
 perfkitbenchmarker/linux_packages/tensorflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perfkitbenchmarker/linux_packages/tensorflow.py b/perfkitbenchmarker/linux_packages/tensorflow.py
index 6dab32dc5e..7d4590a6de 100644
--- a/perfkitbenchmarker/linux_packages/tensorflow.py
+++ b/perfkitbenchmarker/linux_packages/tensorflow.py
@@ -48,5 +48,5 @@ def Install(vm):

 def Uninstall(vm):
   """Uninstalls TensorFlow on the VM."""
-  vm.RemoteCommand('sudo pip uninstall --upgrade tensorflow',
+  vm.RemoteCommand('sudo pip uninstall tensorflow',
                    should_log=True)

From cc9e112d3cb65e2573763e9bcef6860eae8a3384 Mon Sep 17 00:00:00 2001
From: ferneyhough
Date: Mon, 27 Nov 2017 14:37:53 -0800
Subject: [PATCH 13/32] Add V100 GPU support for GCE

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=177073589
---
 perfkitbenchmarker/pkb.py             | 2 +-
 perfkitbenchmarker/virtual_machine.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/perfkitbenchmarker/pkb.py b/perfkitbenchmarker/pkb.py
index ce7d7de0e0..77f3c52d78 100755
--- a/perfkitbenchmarker/pkb.py
+++ b/perfkitbenchmarker/pkb.py
@@ -135,7 +135,7 @@
                      'specified.')
 flags.DEFINE_enum(
     'gpu_type', None,
-    ['k80', 'p100'],
+    ['k80', 'p100', 'v100'],
     'Type of gpus to attach to the VM. Requires gpu_count to be '
     'specified.')
 flags.DEFINE_integer('num_vms', 1, 'For benchmarks which can make use of a '
diff --git a/perfkitbenchmarker/virtual_machine.py b/perfkitbenchmarker/virtual_machine.py
index 5d6bf46701..2a1c0687b2 100644
--- a/perfkitbenchmarker/virtual_machine.py
+++ b/perfkitbenchmarker/virtual_machine.py
@@ -49,7 +49,7 @@
 flags.DEFINE_list('vm_metadata', [], 'Metadata to add to the vm '
                   'via the provider\'s AddMetadata function. It expects'
                   'key:value pairs')
-VALID_GPU_TYPES = ['k80', 'p100']
+VALID_GPU_TYPES = ['k80', 'p100', 'v100']


 def GetVmSpecClass(cloud):

From 264ad24cb834b95ed0a88a07060a7162fd148520 Mon Sep 17 00:00:00 2001
From: ferneyhough
Date: Mon, 27 Nov 2017 16:48:53 -0800
Subject: [PATCH 14/32] Add support for running on multiple VMs in the
 Tensorflow benchmark. The VMs are run independently, i.e., they are not
 configured to use distributed training.
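For example, a hypothetical invocation on two independent VMs (num_vms is
an existing pkb.py flag; whether it applies depends on the benchmark
config):

    ./pkb.py --benchmarks=tensorflow --num_vms=2

Each VM runs every model in --tf_models and reports its own samples; a
vm_index metadata field distinguishes the per-VM results.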
------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177091389 --- .../linux_benchmarks/tensorflow_benchmark.py | 106 ++++++++++++------ 1 file changed, 69 insertions(+), 37 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py index 655f823488..c8226d16cb 100644 --- a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py @@ -19,6 +19,7 @@ from perfkitbenchmarker import configs from perfkitbenchmarker import flags from perfkitbenchmarker import sample +from perfkitbenchmarker import vm_util from perfkitbenchmarker.linux_packages import cuda_toolkit_8 from perfkitbenchmarker.linux_packages import tensorflow @@ -149,21 +150,23 @@ def _UpdateBenchmarkSpecWithFlags(benchmark_spec): benchmark_spec.benchmarks_commit_hash = FLAGS.tf_benchmarks_commit_hash -def _InstallTensorFlowBenchmarks(benchmark_spec): - """Install and set up TensorFlow Benchmarks on the target vm. +def _PrepareVm(vm): + """Install and set up TensorFlow on the target vm. - A specific commit which works best with TensorFlow 1.3 is used, - but can be overrided with the flag tf_benchmarks_commit_hash. + The TensorFlow benchmarks are also installed. + A specific commit of the benchmarks which works best with TensorFlow + 1.3 is used and can be overridden with the flag tf_benchmarks_commit_hash. Args: - benchmark_spec: The benchmark specification + vm: virtual machine on which to install TensorFlow """ - vm = benchmark_spec.vms[0] + commit_hash = FLAGS.tf_benchmarks_commit_hash + vm.Install('tensorflow') + vm.InstallPackages('git') vm.RemoteCommand( 'git clone https://github.com/tensorflow/benchmarks.git', should_log=True) vm.RemoteCommand( - 'cd benchmarks && git checkout {}'.format( - benchmark_spec.benchmarks_commit_hash) + 'cd benchmarks && git checkout {}'.format(commit_hash) ) @@ -175,18 +178,18 @@ def Prepare(benchmark_spec): """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vms = benchmark_spec.vms - master_vm = vms[0] - master_vm.Install('tensorflow') - master_vm.InstallPackages('git') - benchmark_spec.tensorflow_version = tensorflow.GetTensorFlowVersion(master_vm) - _InstallTensorFlowBenchmarks(benchmark_spec) + vm_util.RunThreaded(_PrepareVm, vms) + benchmark_spec.tensorflow_version = tensorflow.GetTensorFlowVersion(vms[0]) -def _CreateMetadataDict(benchmark_spec): +def _CreateMetadataDict(benchmark_spec, model, batch_size, num_gpus): """Create metadata dict to be used in run results. 
   Args:
     benchmark_spec: benchmark spec
+    model: model which was run
+    batch_size: batch size used
+    num_gpus: number of GPUs used

   Returns:
     metadata dict
   """
   metadata = dict()
   if benchmark_spec.device == GPU:
     metadata.update(cuda_toolkit_8.GetMetadata(vm))
-  metadata['num_gpus'] = benchmark_spec.num_gpus
+  metadata['num_gpus'] = num_gpus
+  metadata['model'] = model
+  metadata['batch_size'] = batch_size
   metadata['forward_only'] = benchmark_spec.forward_only
-  metadata['model'] = benchmark_spec.model
   metadata['data_name'] = benchmark_spec.data_name
-  metadata['batch_size'] = benchmark_spec.batch_size
   metadata['variable_update'] = benchmark_spec.variable_update
   metadata['local_parameter_device'] = benchmark_spec.local_parameter_device
   metadata['device'] = benchmark_spec.device
@@ -249,49 +252,48 @@ def _ExtractThroughput(output):
     raise TFParseOutputException('Unable to parse TensorFlow output')


-def _MakeSamplesFromOutput(benchmark_spec, output):
-  """Create a sample continaing the measured TensorFlow throughput.
+def _MakeSamplesFromOutput(benchmark_spec, output, model, batch_size, num_gpus):
+  """Create a sample containing the measured TensorFlow throughput.

   Args:
     benchmark_spec: benchmark spec
     output: TensorFlow output
+    model: model which was run
+    batch_size: batch size used
+    num_gpus: number of GPUs used

   Returns:
     a Sample containing the TensorFlow throughput in Gflops
   """
-  metadata = _CreateMetadataDict(benchmark_spec)
+  metadata = _CreateMetadataDict(benchmark_spec, model, batch_size, num_gpus)
   tensorflow_throughput = _ExtractThroughput(output)
   return [sample.Sample('Training synthetic data', tensorflow_throughput,
                         'images/sec', metadata)]


-def Run(benchmark_spec):
-  """Run TensorFlow on the cluster for each model specified.
+def _RunOnVm(vm, benchmark_spec):
+  """Runs a TensorFlow benchmark on a single VM.

   Args:
-    benchmark_spec: The benchmark specification. Contains all data that is
-        required to run the benchmark.
+    vm: VM to run on
+    benchmark_spec: benchmark_spec object

   Returns:
-    A list of sample.Sample objects.
+    A list of samples
   """
-  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
-  vms = benchmark_spec.vms
-  master_vm = vms[0]
   tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks'
   results = []
   for model in FLAGS.tf_models:
-    benchmark_spec.model = model
-    benchmark_spec.batch_size = _GetBatchSize(benchmark_spec.model)
+    batch_size = _GetBatchSize(model)
     tf_cnn_benchmark_cmd = (
         'python tf_cnn_benchmarks.py --local_parameter_device=%s '
         '--batch_size=%s --model=%s --data_name=%s --variable_update=%s '
        '--use_nccl=%s --distortions=%s --device=%s --data_format=%s '
        '--forward_only=%s') % (
             benchmark_spec.local_parameter_device,
-            benchmark_spec.batch_size,
-            benchmark_spec.model,
+            batch_size,
+            model,
             benchmark_spec.data_name,
             benchmark_spec.variable_update,
             benchmark_spec.use_nccl,
@@ -300,18 +302,48 @@
             benchmark_spec.data_format,
             benchmark_spec.forward_only)
     if benchmark_spec.device == GPU:
-      benchmark_spec.num_gpus = cuda_toolkit_8.QueryNumberOfGpus(master_vm)
+      num_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm)
       tf_cnn_benchmark_cmd = '%s %s --num_gpus=%s' % (
-          _GetEnvironmentVars(master_vm), tf_cnn_benchmark_cmd,
-          benchmark_spec.num_gpus)
+          _GetEnvironmentVars(vm), tf_cnn_benchmark_cmd,
+          num_gpus)
+    else:
+      num_gpus = 0
     run_command = 'cd %s && %s' % (tf_cnn_benchmark_dir,
                                    tf_cnn_benchmark_cmd)
-    output, _ = master_vm.RobustRemoteCommand(run_command, should_log=True)
-    results.extend(_MakeSamplesFromOutput(benchmark_spec, output))
+    output, _ = vm.RobustRemoteCommand(run_command, should_log=True)
+    results.extend(_MakeSamplesFromOutput(
+        benchmark_spec, output, model, batch_size, num_gpus))

   return results


+def Run(benchmark_spec):
+  """Run TensorFlow on the cluster for each model specified.
+
+  Args:
+    benchmark_spec: The benchmark specification. Contains all data that is
+        required to run the benchmark.
+
+  Returns:
+    A list of sample.Sample objects.
+ A list of samples """ - _UpdateBenchmarkSpecWithFlags(benchmark_spec) - vms = benchmark_spec.vms - master_vm = vms[0] tf_cnn_benchmark_dir = 'benchmarks/scripts/tf_cnn_benchmarks' results = [] for model in FLAGS.tf_models: - benchmark_spec.model = model - benchmark_spec.batch_size = _GetBatchSize(benchmark_spec.model) + batch_size = _GetBatchSize(model) tf_cnn_benchmark_cmd = ( 'python tf_cnn_benchmarks.py --local_parameter_device=%s ' '--batch_size=%s --model=%s --data_name=%s --variable_update=%s ' '--use_nccl=%s --distortions=%s --device=%s --data_format=%s ' '--forward_only=%s') % ( benchmark_spec.local_parameter_device, - benchmark_spec.batch_size, - benchmark_spec.model, + batch_size, + model, benchmark_spec.data_name, benchmark_spec.variable_update, benchmark_spec.use_nccl, @@ -300,18 +302,48 @@ def Run(benchmark_spec): benchmark_spec.data_format, benchmark_spec.forward_only) if benchmark_spec.device == GPU: - benchmark_spec.num_gpus = cuda_toolkit_8.QueryNumberOfGpus(master_vm) + num_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm) tf_cnn_benchmark_cmd = '%s %s --num_gpus=%s' % ( - _GetEnvironmentVars(master_vm), tf_cnn_benchmark_cmd, - benchmark_spec.num_gpus) + _GetEnvironmentVars(vm), tf_cnn_benchmark_cmd, + num_gpus) + else: + num_gpus = 0 run_command = 'cd %s && %s' % (tf_cnn_benchmark_dir, tf_cnn_benchmark_cmd) - output, _ = master_vm.RobustRemoteCommand(run_command, should_log=True) - results.extend(_MakeSamplesFromOutput(benchmark_spec, output)) + output, _ = vm.RobustRemoteCommand(run_command, should_log=True) + results.extend(_MakeSamplesFromOutput( + benchmark_spec, output, model, batch_size, num_gpus)) return results +def Run(benchmark_spec): + """Run TensorFlow on the cluster for each model specified. + + Args: + benchmark_spec: The benchmark specification. Contains all data that is + required to run the benchmark. + + Returns: + A list of sample.Sample objects. + """ + _UpdateBenchmarkSpecWithFlags(benchmark_spec) + vms = benchmark_spec.vms + args = [((vm, benchmark_spec), {}) for vm in vms] + run_results = vm_util.RunThreaded(_RunOnVm, args) + + # Add vm index to results metadata + for idx, vm_result in enumerate(run_results): + for sample in vm_result: + sample.metadata['vm_index'] = idx + + # Flatten the list + flattened_results = ( + [samples for vm_results in run_results for samples in vm_results]) + + return flattened_results + + def Cleanup(benchmark_spec): """Cleanup TensorFlow on the cluster.""" pass From d7ae27db8e1bbd22354112ce7e08424a7b70e3ab Mon Sep 17 00:00:00 2001 From: deitz Date: Tue, 28 Nov 2017 11:43:58 -0800 Subject: [PATCH 15/32] Increase connection timeout for SCP from 5 seconds to 30 seconds. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177193809 --- perfkitbenchmarker/linux_virtual_machine.py | 4 +++- perfkitbenchmarker/vm_util.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/perfkitbenchmarker/linux_virtual_machine.py b/perfkitbenchmarker/linux_virtual_machine.py index 96504770f8..c46eceab06 100644 --- a/perfkitbenchmarker/linux_virtual_machine.py +++ b/perfkitbenchmarker/linux_virtual_machine.py @@ -463,7 +463,9 @@ def RemoteHostCopy(self, file_path, remote_path='', copy_to=True): remote_location = '%s@%s:%s' % ( self.user_name, self.ip_address, remote_path) scp_cmd = ['scp', '-P', str(self.ssh_port), '-pr'] - scp_cmd.extend(vm_util.GetSshOptions(self.ssh_private_key)) + # An scp is not retried, so increase the connection timeout. 
+ scp_cmd.extend(vm_util.GetSshOptions(self.ssh_private_key, + connect_timeout=30)) if copy_to: scp_cmd.extend([file_path, remote_location]) else: diff --git a/perfkitbenchmarker/vm_util.py b/perfkitbenchmarker/vm_util.py index 4bbdbe0a98..a5a62e8202 100644 --- a/perfkitbenchmarker/vm_util.py +++ b/perfkitbenchmarker/vm_util.py @@ -172,7 +172,7 @@ def GetCertPath(): return PrependTempDir(CERT_FILE) -def GetSshOptions(ssh_key_filename): +def GetSshOptions(ssh_key_filename, connect_timeout=5): """Return common set of SSH and SCP options.""" options = [ '-2', @@ -181,7 +181,7 @@ def GetSshOptions(ssh_key_filename): '-o', 'IdentitiesOnly=yes', '-o', 'PreferredAuthentications=publickey', '-o', 'PasswordAuthentication=no', - '-o', 'ConnectTimeout=5', + '-o', 'ConnectTimeout=%d' % connect_timeout, '-o', 'GSSAPIAuthentication=no', '-o', 'ServerAliveInterval=30', '-o', 'ServerAliveCountMax=10', From 9eb1756a611fbc3d2646b1ad038271faa53abeab Mon Sep 17 00:00:00 2001 From: Steven Deitz Date: Tue, 28 Nov 2017 13:44:20 -0800 Subject: [PATCH 16/32] Adds some DaCapo benchmarks to PKB. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177210785 --- .../linux_benchmarks/dacapo_benchmark.py | 97 +++++++++++++++++++ perfkitbenchmarker/linux_packages/dacapo.py | 25 +++++ 2 files changed, 122 insertions(+) create mode 100644 perfkitbenchmarker/linux_benchmarks/dacapo_benchmark.py create mode 100644 perfkitbenchmarker/linux_packages/dacapo.py diff --git a/perfkitbenchmarker/linux_benchmarks/dacapo_benchmark.py b/perfkitbenchmarker/linux_benchmarks/dacapo_benchmark.py new file mode 100644 index 0000000000..c82d8b8d6c --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/dacapo_benchmark.py @@ -0,0 +1,97 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Runs DaCapo benchmarks. + +This benchmark runs the various DaCapo benchmarks. More information can be found +at: http://dacapobench.org/ +""" + +import os +import re + +from perfkitbenchmarker import configs +from perfkitbenchmarker import errors +from perfkitbenchmarker import flags +from perfkitbenchmarker import linux_packages +from perfkitbenchmarker import sample + +flags.DEFINE_string('dacapo_jar_filename', 'dacapo-9.12-bach.jar', + 'Filename of DaCapo jar file.') +flags.DEFINE_enum('dacapo_benchmark', 'luindex', ['luindex', 'lusearch'], + 'Name of specific DaCapo benchmark to execute.') +flags.DEFINE_integer('dacapo_num_iters', 1, 'Number of iterations to execute.') + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = 'dacapo' +BENCHMARK_CONFIG = """ +dacapo: + description: Runs DaCapo benchmarks + vm_groups: + default: + vm_spec: *default_single_core +""" +_PASS_PATTERN = re.compile(r'^=====.*PASSED in (\d+) msec =====$') + + +def GetConfig(user_config): + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def Prepare(benchmark_spec): + """Install the DaCapo benchmark suite on the vms. + + Args: + benchmark_spec: The benchmark specification. 
Contains all data that is + required to run the benchmark. + """ + benchmark_spec.vms[0].Install('dacapo') + + +def Run(benchmark_spec): + """Run the DaCapo benchmark on the vms. + + Args: + benchmark_spec: The benchmark specification. Contains all data that is + required to run the benchmark. + + Returns: + A singleton list of sample.Sample objects containing the DaCapo benchmark + run time (in msec). + + Raises: + errors.Benchmarks.RunError if the DaCapo benchmark didn't succeed. + """ + _, stderr = benchmark_spec.vms[0].RemoteCommand( + 'java -jar %s %s -n %i --scratch-directory=%s' % + (os.path.join(linux_packages.INSTALL_DIR, FLAGS.dacapo_jar_filename), + FLAGS.dacapo_benchmark, FLAGS.dacapo_num_iters, + os.path.join(linux_packages.INSTALL_DIR, 'dacapo_scratch'))) + for line in stderr.splitlines(): + m = _PASS_PATTERN.match(line) + if m: + return [sample.Sample('run_time', float(m.group(1)), 'ms')] + raise errors.Benchmarks.RunError( + 'DaCapo benchmark %s failed.' % FLAGS.dacapo_benchmark) + + +def Cleanup(benchmark_spec): + """Cleanup the DaCapo benchmark on the target vm (by uninstalling). + + Args: + benchmark_spec: The benchmark specification. Contains all data that is + required to run the benchmark. + """ + benchmark_spec.vms[0].RemoteCommand( + 'rm -rf %s' % os.path.join(linux_packages.INSTALL_DIR, 'dacapo_scratch')) diff --git a/perfkitbenchmarker/linux_packages/dacapo.py b/perfkitbenchmarker/linux_packages/dacapo.py new file mode 100644 index 0000000000..b56ddf9e3e --- /dev/null +++ b/perfkitbenchmarker/linux_packages/dacapo.py @@ -0,0 +1,25 @@ +# Copyright 2016 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Package for installing the DaCapo benchmarks.""" + +from perfkitbenchmarker.linux_packages import INSTALL_DIR + +_DACAPO_URL = ('https://versaweb.dl.sourceforge.net/project/dacapobench/' + '9.12-bach/dacapo-9.12-bach.jar') + + +def Install(vm): + """Installs the `dacapo` package on the VM.""" + vm.Install('openjdk') + vm.RemoteCommand('wget %s -P %s' % (_DACAPO_URL, INSTALL_DIR)) From ca9513b54d7cce6be9a90165d5d10d7a5ccc7bd4 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Wed, 29 Nov 2017 16:49:54 -0800 Subject: [PATCH 17/32] Use Tensorflow environment vars when looking up the Tensorflow version. The environment vars ensure that the required CUDA libraries can be found. 
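Roughly, on a 64-bit VM with the toolkit installed under /usr/local/cuda
(a hypothetical value of --cuda_toolkit_installation_dir), the version
lookup now runs as:

    echo -e "import tensorflow\nprint(tensorflow.__version__)" | \
      PATH=/usr/local/cuda/bin${PATH:+:${PATH}} \
      CUDA_HOME=/usr/local/cuda \
      LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} \
      python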
------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177383869 --- .../linux_benchmarks/tensorflow_benchmark.py | 23 +----------------- .../linux_packages/tensorflow.py | 24 ++++++++++++++++++- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py index c8226d16cb..c54b3a7e7a 100644 --- a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py @@ -214,27 +214,6 @@ def _CreateMetadataDict(benchmark_spec, model, batch_size, num_gpus): return metadata -def _GetEnvironmentVars(vm): - """Return a string containing TensorFlow-related environment variables. - - Args: - vm: vm to get environment varibles - - Returns: - string of environment variables - """ - output, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True) - long_bit = output.strip() - lib_name = 'lib' if long_bit == '32' else 'lib64' - return ' '.join([ - 'PATH=%s${PATH:+:${PATH}}' % - posixpath.join(FLAGS.cuda_toolkit_installation_dir, 'bin'), - 'CUDA_HOME=%s' % FLAGS.cuda_toolkit_installation_dir, - 'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' % - posixpath.join(FLAGS.cuda_toolkit_installation_dir, lib_name), - ]) - - def _ExtractThroughput(output): """Extract throughput from TensorFlow output. @@ -304,7 +283,7 @@ def _RunOnVm(vm, benchmark_spec): if benchmark_spec.device == GPU: num_gpus = cuda_toolkit_8.QueryNumberOfGpus(vm) tf_cnn_benchmark_cmd = '%s %s --num_gpus=%s' % ( - _GetEnvironmentVars(vm), tf_cnn_benchmark_cmd, + tensorflow._GetEnvironmentVars(vm), tf_cnn_benchmark_cmd, num_gpus) else: num_gpus = 0 diff --git a/perfkitbenchmarker/linux_packages/tensorflow.py b/perfkitbenchmarker/linux_packages/tensorflow.py index 7d4590a6de..02b51bda22 100644 --- a/perfkitbenchmarker/linux_packages/tensorflow.py +++ b/perfkitbenchmarker/linux_packages/tensorflow.py @@ -14,10 +14,31 @@ """Module containing TensorFlow 1.3 installation and cleanup functions.""" +import posixpath from perfkitbenchmarker import flags FLAGS = flags.FLAGS +def _GetEnvironmentVars(vm): + """Return a string containing TensorFlow-related environment variables. + + Args: + vm: vm to get environment varibles + + Returns: + string of environment variables + """ + output, _ = vm.RemoteCommand('getconf LONG_BIT', should_log=True) + long_bit = output.strip() + lib_name = 'lib' if long_bit == '32' else 'lib64' + return ' '.join([ + 'PATH=%s${PATH:+:${PATH}}' % + posixpath.join(FLAGS.cuda_toolkit_installation_dir, 'bin'), + 'CUDA_HOME=%s' % FLAGS.cuda_toolkit_installation_dir, + 'LD_LIBRARY_PATH=%s${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' % + posixpath.join(FLAGS.cuda_toolkit_installation_dir, lib_name), + ]) + def GetTensorFlowVersion(vm): """Returns the version of tensorflow installed on the vm. 
@@ -28,7 +49,8 @@ def GetTensorFlowVersion(vm): installed python tensorflow version as a string """ stdout, _ = vm.RemoteCommand( - 'echo -e "import tensorflow\nprint(tensorflow.__version__)" | python' + ('echo -e "import tensorflow\nprint(tensorflow.__version__)" | {0} python' + .format(_GetEnvironmentVars(vm))) ) return stdout.strip() From 8bfbb9a8869cac40ad354cf4162f7c4aea47e3a2 Mon Sep 17 00:00:00 2001 From: yuyanting Date: Wed, 29 Nov 2017 17:29:52 -0800 Subject: [PATCH 18/32] - Use gcc/fortran/g++-4.7 as default in SPECCPU test (consistent with config file provided by https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/issues/156) - Add a flag allowing change build tool version - Make lint happy. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177388988 --- .../linux_benchmarks/speccpu2006_benchmark.py | 26 ++++++++++++++--- .../linux_packages/build_tools.py | 29 +++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/speccpu2006_benchmark.py b/perfkitbenchmarker/linux_benchmarks/speccpu2006_benchmark.py index 6043eaf1c8..b4b8dca274 100644 --- a/perfkitbenchmarker/linux_benchmarks/speccpu2006_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/speccpu2006_benchmark.py @@ -24,18 +24,19 @@ import itertools import logging +from operator import mul import os import posixpath import re import tarfile -from operator import mul from perfkitbenchmarker import configs from perfkitbenchmarker import data from perfkitbenchmarker import errors from perfkitbenchmarker import flags from perfkitbenchmarker import sample from perfkitbenchmarker import stages +from perfkitbenchmarker.linux_packages import build_tools FLAGS = flags.FLAGS @@ -62,6 +63,14 @@ 'cfg file must be placed in the local PKB data directory and will be ' 'copied to the remote machine prior to executing runspec. See README.md ' 'for instructions if running with a repackaged cpu2006v1.2.tgz file.') +flags.DEFINE_string( + 'runspec_build_tool_version', None, + 'Version of gcc/g++/gfortran. This should match runspec_config. Note, if ' + 'neither runspec_config and runspec_build_tool_version is set, the test ' + 'install gcc/g++/gfortran-4.7, since that matches default config version. ' + 'If runspec_config is set, but not runspec_build_tool_version, default ' + 'version of build tools will be installed. Also this flag only works with ' + 'debian.') flags.DEFINE_integer( 'runspec_iterations', 3, 'Used by the PKB speccpu2006 benchmark. The number of benchmark iterations ' @@ -119,7 +128,7 @@ def GetConfig(user_config): return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) -def CheckPrerequisites(benchmark_config): +def CheckPrerequisites(unused_benchmark_config): """Verifies that the required input files are present.""" try: # Peeking into the tar file is slow. If running in stages, it's @@ -227,6 +236,7 @@ class _SpecCpu2006SpecificState(object): where the SPEC files are stored. tar_file_path: Optional string. Path of the tar file on the remote machine. """ + def __init__(self): self.cfg_file_path = None self.iso_file_path = None @@ -246,8 +256,15 @@ def Prepare(benchmark_spec): speccpu_vm_state = _SpecCpu2006SpecificState() setattr(vm, _BENCHMARK_SPECIFIC_VM_STATE_ATTR, speccpu_vm_state) vm.Install('wget') - vm.Install('build_tools') vm.Install('fortran') + vm.Install('build_tools') + + # If using default config files and runspec_build_tool_version is not set, + # install 4.7 gcc/g++/gfortan. 
If either flag is set, we assume
+  # the user has chosen matching config and build tool versions on purpose.
+  if not FLAGS['runspec_config'].present or FLAGS.runspec_build_tool_version:
+    build_tool_version = FLAGS.runspec_build_tool_version or '4.7'
+    build_tools.Reinstall(vm, version=build_tool_version)
   if FLAGS.runspec_enable_32bit:
     vm.Install('multilib')
   vm.Install('numactl')
@@ -329,6 +346,7 @@ def _ExtractScore(stdout, vm, keep_partial_results, estimate_spec):
     keep_partial_results: A boolean indicating whether partial results should
       be extracted in the event that not all benchmarks were successfully
       run. See the "runspec_keep_partial_results" flag for more info.
+    estimate_spec: A boolean indicating whether we should estimate the SPEC score.
 
   Sample input for SPECint:
     ...
@@ -449,7 +467,7 @@ def _ExtractScore(stdout, vm, keep_partial_results, estimate_spec):
 
 
 def _GeometricMean(arr):
-  "Calculates the geometric mean of the array."
+  """Calculates the geometric mean of the array."""
   return reduce(mul, arr) ** (1.0 / len(arr))
 
 
diff --git a/perfkitbenchmarker/linux_packages/build_tools.py b/perfkitbenchmarker/linux_packages/build_tools.py
index e3015ab929..0ecbfa832f 100644
--- a/perfkitbenchmarker/linux_packages/build_tools.py
+++ b/perfkitbenchmarker/linux_packages/build_tools.py
@@ -14,6 +14,7 @@
 
 """Module containing build tools installation and cleanup functions."""
 
+from perfkitbenchmarker import os_types
 
 
 def YumInstall(vm):
@@ -24,3 +25,31 @@ def AptInstall(vm):
   """Installs build tools on the VM."""
   vm.InstallPackages('build-essential git libtool autoconf automake')
+
+
+def GetVersion(vm, pkg):
+  """Get version of package."""
+  out, _ = vm.RemoteCommand('{pkg} -v'.format(pkg=pkg), ignore_failure=True)
+  return out
+
+
+def Reinstall(vm, version='4.7'):
+  """Install specific version of gcc.
+
+  Args:
+    vm: VirtualMachine object.
+    version: string. GCC version.
+ """ + if vm.OS_TYPE != os_types.DEBIAN: + return + for pkg in ('gcc', 'gfortran', 'g++'): + version_string = GetVersion(vm, pkg) + if version in version_string: + continue + else: + new_pkg = pkg + '-' + version + vm.RemoteCommand('sudo apt-get remove {pkg} -y'.format(pkg=pkg), + ignore_failure=True) + vm.InstallPackages(new_pkg) + vm.RemoteCommand('sudo ln -s /usr/bin/{new_pkg} /usr/bin/{pkg}'.format( + new_pkg=new_pkg, pkg=pkg)) From 18f68129bae45fd8569440f03c8ccc552b9f4541 Mon Sep 17 00:00:00 2001 From: yuyanting Date: Thu, 30 Nov 2017 11:10:48 -0800 Subject: [PATCH 19/32] - Fix bug in build_tool.GetVersion ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177479960 --- perfkitbenchmarker/linux_packages/build_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perfkitbenchmarker/linux_packages/build_tools.py b/perfkitbenchmarker/linux_packages/build_tools.py index 0ecbfa832f..1cae30458d 100644 --- a/perfkitbenchmarker/linux_packages/build_tools.py +++ b/perfkitbenchmarker/linux_packages/build_tools.py @@ -27,10 +27,10 @@ def AptInstall(vm): vm.InstallPackages('build-essential git libtool autoconf automake') -def GetVersion(vm, pkg): +def _GetVersion(vm, pkg): """Get version of package.""" - out, _ = vm.RemoteCommand('{pkg} -v'.format(pkg=pkg), ignore_failure=True) - return out + _, err = vm.RemoteCommand('{pkg} -v'.format(pkg=pkg), ignore_failure=True) + return err def Reinstall(vm, version='4.7'): @@ -43,7 +43,7 @@ def Reinstall(vm, version='4.7'): if vm.OS_TYPE != os_types.DEBIAN: return for pkg in ('gcc', 'gfortran', 'g++'): - version_string = GetVersion(vm, pkg) + version_string = _GetVersion(vm, pkg) if version in version_string: continue else: From ec6b3016185212ee134951e249813d1cf7c9f429 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Thu, 30 Nov 2017 11:20:00 -0800 Subject: [PATCH 20/32] Add daemonset which sets nvidia-smi permissions so that a restricted user (aka a pod with a restricted security context) can set the GPU clock speed and autoboost policy. 
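
As a rough Python rendering of what the daemonset's container command does once
per node (a sketch assuming nvidia-smi is reachable on PATH; the manifest
itself expresses this as a single shell command and then tails /dev/null to
keep the container alive):

    import subprocess

    # Enable persistence mode, then open application-clock and auto-boost
    # control to non-root users, so unprivileged pods may change them.
    _SETUP_CMDS = [
        ['nvidia-smi', '-pm', '1'],
        ['nvidia-smi', '--applications-clocks-permission=UNRESTRICTED'],
        ['nvidia-smi', '--auto-boost-permission=UNRESTRICTED'],
    ]

    def RelaxGpuPermissions():
      for cmd in _SETUP_CMDS:
        subprocess.check_call(cmd)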
------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177481407 --- ...dia_unrestricted_permissions_daemonset.yml | 23 +++++++++++++++++++ .../providers/gcp/google_container_engine.py | 4 ++++ .../gcp/google_container_engine_test.py | 21 +++++++++++++---- 3 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml diff --git a/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml b/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml new file mode 100644 index 0000000000..1a2930c2dd --- /dev/null +++ b/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml @@ -0,0 +1,23 @@ +apiVersion: apps/v1beta2 +kind: DaemonSet +metadata: + name: nvidia-add-unrestricted-permissions-dameon-set +spec: + selector: + matchLabels: + name: nvidia-add-unrestricted-permissions + template: + metadata: + labels: + name: nvidia-add-unrestricted-permissions + spec: + containers: + - name: nvidia-add-unrestricted-permissions + image: nvidia/cuda:8.0-devel-ubuntu16.04 + securityContext: + privileged: true + resources: + limits: + nvidia.com/gpu: 1 + command: [ "/bin/bash", "-c", "nvidia-smi -pm 1 && nvidia-smi --applications-clocks-permission=UNRESTRICTED && nvidia-smi --auto-boost-permission=UNRESTRICTED && tail -f /dev/null" ] + diff --git a/perfkitbenchmarker/providers/gcp/google_container_engine.py b/perfkitbenchmarker/providers/gcp/google_container_engine.py index 69c9e681d8..b5d0c93075 100644 --- a/perfkitbenchmarker/providers/gcp/google_container_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_container_engine.py @@ -17,6 +17,7 @@ import os from perfkitbenchmarker import container_service +from perfkitbenchmarker import data from perfkitbenchmarker import flags from perfkitbenchmarker import kubernetes_helper from perfkitbenchmarker import providers @@ -26,6 +27,7 @@ FLAGS = flags.FLAGS NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/k8s-1.8/device-plugin-daemonset.yaml' +NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET = 'nvidia_unrestricted_permissions_daemonset.yml' class GkeCluster(container_service.KubernetesCluster): @@ -86,6 +88,8 @@ def _PostCreate(self): if self.gpu_count: kubernetes_helper.CreateFromFile(NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT) + kubernetes_helper.CreateFromFile( + data.ResourcePath(NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET)) def _Delete(self): """Deletes the cluster.""" diff --git a/tests/providers/gcp/google_container_engine_test.py b/tests/providers/gcp/google_container_engine_test.py index 72452527eb..fcd7e6d504 100644 --- a/tests/providers/gcp/google_container_engine_test.py +++ b/tests/providers/gcp/google_container_engine_test.py @@ -19,6 +19,7 @@ import contextlib2 import mock +from perfkitbenchmarker import data from perfkitbenchmarker import vm_util from perfkitbenchmarker.configs import benchmark_config_spec from perfkitbenchmarker.providers.gcp import google_container_engine @@ -28,7 +29,7 @@ _COMPONENT = 'test_component' _RUN_URI = 'fake-urn-uri' _NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/k8s-1.8/device-plugin-daemonset.yaml' - +_NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET = 'nvidia_unrestricted_permissions_daemonset.yml' @contextlib2.contextmanager def patch_critical_objects(stdout='', stderr='', return_code=0): @@ -36,6 +37,7 @@ def 
patch_critical_objects(stdout='', stderr='', return_code=0): flags = mock_flags.MockFlags() flags.gcloud_path = 'gcloud' flags.run_uri = _RUN_URI + flags.data_search_paths = '' stack.enter_context(mock_flags.PatchFlags(flags)) stack.enter_context(mock.patch('__builtin__.open')) @@ -169,9 +171,18 @@ def testPostCreate(self, create_from_file_patch): command_string) self.assertIn('KUBECONFIG', issue_command.call_args[1]['env']) - create_from_file_patch.assert_called_with( - _NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT) + expected_args_to_create_from_file = ( + _NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT, + data.ResourcePath( + _NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET) + ) + expected_calls = [mock.call(arg) + for arg in expected_args_to_create_from_file] + + # Assert that create_from_file was called twice, + # and that the args were as expected (should be the NVIDIA + # driver setup daemon set, followed by the + # NVIDIA unrestricted permissions daemon set. + create_from_file_patch.assert_has_calls(expected_calls) -if __name__ == '__main__': - unittest.main() From 751655b38ae26aad7a6e9ec6f10e005afa1b1385 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Thu, 30 Nov 2017 13:41:59 -0800 Subject: [PATCH 21/32] Don't set GPU clock speed or autoboost policy if the requested value is equal to the current value. This allows users to run GPU benchmarks in the case that they do not have permission to modify the GPU's clock speed or autoboost, with the condition that they specify the default values for both settings. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177501235 --- .../linux_packages/cuda_toolkit_8.py | 58 ++++++++++++++----- tests/linux_packages/cuda_toolkit_8_test.py | 55 ++++++++++++++++++ 2 files changed, 98 insertions(+), 15 deletions(-) diff --git a/perfkitbenchmarker/linux_packages/cuda_toolkit_8.py b/perfkitbenchmarker/linux_packages/cuda_toolkit_8.py index b0e9c26b89..c454eb30ce 100644 --- a/perfkitbenchmarker/linux_packages/cuda_toolkit_8.py +++ b/perfkitbenchmarker/linux_packages/cuda_toolkit_8.py @@ -192,8 +192,9 @@ def SetAndConfirmGpuClocks(vm): desired_memory_clock = gpu_clock_speeds[0] desired_graphics_clock = gpu_clock_speeds[1] - SetGpuClockSpeedAndAutoboost(vm, autoboost_enabled, desired_memory_clock, - desired_graphics_clock) + EnablePersistenceMode(vm) + SetGpuClockSpeed(vm, desired_memory_clock, desired_graphics_clock) + SetAutoboostDefaultPolicy(vm, autoboost_enabled) num_gpus = QueryNumberOfGpus(vm) for i in range(num_gpus): if QueryGpuClockSpeed(vm, i) != (desired_memory_clock, @@ -204,27 +205,54 @@ def SetAndConfirmGpuClocks(vm): desired_graphics_clock)) -def SetGpuClockSpeedAndAutoboost(vm, - autoboost_enabled, - memory_clock_speed, - graphics_clock_speed): - """Sets autoboost and memory and graphics clocks to the specified frequency. +def EnablePersistenceMode(vm): + """Enables persistence mode on the NVIDIA driver. + + Args: + vm: virtual machine to operate on + """ + vm.RemoteCommand('sudo nvidia-smi -pm 1') + - Persistence mode is enabled as well. Note that these settings are - lost after reboot. +def SetAutoboostDefaultPolicy(vm, autoboost_enabled): + """Sets the autoboost policy to the specified value. + For each GPU on the VM, this function will set the autoboost policy + to the value specified by autoboost_enabled. Args: vm: virtual machine to operate on autoboost_enabled: bool or None. 
Value (if any) to set autoboost policy to + """ + if autoboost_enabled is None: + return + + num_gpus = QueryNumberOfGpus(vm) + for device_id in range(0, num_gpus): + current_state = QueryAutoboostPolicy(vm, device_id) + if current_state['autoboost_default'] != autoboost_enabled: + vm.RemoteCommand('sudo nvidia-smi --auto-boost-default={0} --id={1}' + .format(1 if autoboost_enabled else 0, device_id)) + + +def SetGpuClockSpeed(vm, memory_clock_speed, graphics_clock_speed): + """Sets autoboost and memory and graphics clocks to the specified frequency. + + Args: + vm: virtual machine to operate on memory_clock_speed: desired speed of the memory clock, in MHz graphics_clock_speed: desired speed of the graphics clock, in MHz """ - vm.RemoteCommand('sudo nvidia-smi -pm 1') - if autoboost_enabled is not None: - vm.RemoteCommand('sudo nvidia-smi --auto-boost-default=%s' % ( - 1 if autoboost_enabled else 0)) - vm.RemoteCommand('sudo nvidia-smi -ac {},{}'.format(memory_clock_speed, - graphics_clock_speed)) + num_gpus = QueryNumberOfGpus(vm) + for device_id in range(0, num_gpus): + current_clock_speeds = QueryGpuClockSpeed(vm, device_id) + if ( + current_clock_speeds[0] != memory_clock_speed or + current_clock_speeds[1] != graphics_clock_speed): + vm.RemoteCommand('sudo nvidia-smi -ac {},{} --id={}'.format( + memory_clock_speed, + graphics_clock_speed, + device_id + )) def QueryAutoboostPolicy(vm, device_id): diff --git a/tests/linux_packages/cuda_toolkit_8_test.py b/tests/linux_packages/cuda_toolkit_8_test.py index 42b7158457..7c225ec696 100644 --- a/tests/linux_packages/cuda_toolkit_8_test.py +++ b/tests/linux_packages/cuda_toolkit_8_test.py @@ -21,6 +21,10 @@ from perfkitbenchmarker.linux_packages import cuda_toolkit_8 +AUTOBOOST_ENABLED_DICT = {'autoboost': True, 'autoboost_default': True} +AUTOBOOST_DISABLED_DICT = {'autoboost': False, 'autoboost_default': False} + + class CudaToolkit8TestCase(unittest.TestCase, test_util.SamplesTestMixin): def setUp(self): @@ -108,6 +112,57 @@ def testHetergeneousGpuTypes(self): 'PKB only supports one type of gpu per VM', cuda_toolkit_8.GetGpuType, vm) + @mock.patch(cuda_toolkit_8.__name__ + '.QueryNumberOfGpus', return_value=2) + @mock.patch(cuda_toolkit_8.__name__ + '.QueryAutoboostPolicy', + return_value=AUTOBOOST_ENABLED_DICT) + def testSetAutoboostPolicyWhenValuesAreTheSame(self, + query_autoboost_mock, + num_gpus_mock): + vm = mock.MagicMock() + vm.RemoteCommand = mock.MagicMock() + + cuda_toolkit_8.SetAutoboostDefaultPolicy(vm, True) + query_autoboost_mock.assetCalled() + vm.RemoteCommand.assert_not_called() + + @mock.patch(cuda_toolkit_8.__name__ + '.QueryNumberOfGpus', return_value=2) + @mock.patch(cuda_toolkit_8.__name__ + '.QueryAutoboostPolicy', + return_value=AUTOBOOST_DISABLED_DICT) + def testSetAutoboostPolicyWhenValuesAreDifferent(self, + query_autoboost_mock, + num_gpus_mock): + vm = mock.MagicMock() + vm.RemoteCommand = mock.MagicMock() + + cuda_toolkit_8.SetAutoboostDefaultPolicy(vm, True) + query_autoboost_mock.assetCalled() + self.assertEqual(2, vm.RemoteCommand.call_count) + + @mock.patch(cuda_toolkit_8.__name__ + '.QueryNumberOfGpus', return_value=2) + @mock.patch(cuda_toolkit_8.__name__ + '.QueryGpuClockSpeed', + return_value=(2505,875)) + def testSetClockSpeedWhenValuesAreTheSame(self, + query_clock_speed_mock, + num_gpus_mock): + vm = mock.MagicMock() + vm.RemoteCommand = mock.MagicMock() + + cuda_toolkit_8.SetGpuClockSpeed(vm, 2505, 875) + query_clock_speed_mock.assetCalled() + vm.RemoteCommand.assert_not_called() + + 
@mock.patch(cuda_toolkit_8.__name__ + '.QueryNumberOfGpus', return_value=2) + @mock.patch(cuda_toolkit_8.__name__ + '.QueryGpuClockSpeed', + return_value=(2505,875)) + def testSetClockSpeedWhenValuesAreDifferent(self, + query_clock_speed_mock, + num_gpus_mock): + vm = mock.MagicMock() + vm.RemoteCommand = mock.MagicMock() + + cuda_toolkit_8.SetGpuClockSpeed(vm, 2505, 562) + query_clock_speed_mock.assetCalled() + self.assertEqual(2, vm.RemoteCommand.call_count) if __name__ == '__main__': unittest.main() From ac912a066f616f70201bf391f624ae5d20d7c6d8 Mon Sep 17 00:00:00 2001 From: nathante Date: Thu, 30 Nov 2017 16:37:03 -0800 Subject: [PATCH 22/32] Add AWS Aurora Database as an option for when running pgbench. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177526285 --- .../configs/benchmark_config_spec.py | 7 + .../aws/aws_managed_relational_db.py | 331 +++++++++++++----- perfkitbenchmarker/providers/aws/util.py | 18 + .../aws/aws_managed_relational_db_test.py | 59 +++- 4 files changed, 321 insertions(+), 94 deletions(-) diff --git a/perfkitbenchmarker/configs/benchmark_config_spec.py b/perfkitbenchmarker/configs/benchmark_config_spec.py index 9020fb4ec2..d1fc62c78d 100644 --- a/perfkitbenchmarker/configs/benchmark_config_spec.py +++ b/perfkitbenchmarker/configs/benchmark_config_spec.py @@ -551,6 +551,13 @@ def _GetOptionDecoderConstructions(cls): managed_relational_db.AURORA_POSTGRES, ] }), + 'zones': (option_decoders.ListDecoder, { + 'item_decoder': option_decoders.StringDecoder(), + 'default': None + }), + 'machine_type': (option_decoders.StringDecoder, { + 'default': None + }), 'engine_version': (option_decoders.StringDecoder, { 'default': None }), diff --git a/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py b/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py index ca7c8dd839..1f4a51c422 100644 --- a/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py +++ b/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py @@ -19,13 +19,12 @@ import logging import time from perfkitbenchmarker import flags -from perfkitbenchmarker import providers from perfkitbenchmarker import managed_relational_db +from perfkitbenchmarker import providers from perfkitbenchmarker import vm_util -from perfkitbenchmarker.providers.aws import util from perfkitbenchmarker.providers.aws import aws_disk from perfkitbenchmarker.providers.aws import aws_network - +from perfkitbenchmarker.providers.aws import util FLAGS = flags.FLAGS @@ -85,8 +84,17 @@ def __init__(self, managed_relational_db_spec): super(AwsManagedRelationalDb, self).__init__(managed_relational_db_spec) self.spec = managed_relational_db_spec self.instance_id = 'pkb-db-instance-' + FLAGS.run_uri - self.zone = self.spec.vm_spec.zone - self.region = util.GetRegionFromZone(self.zone) + self.cluster_id = None + self.all_instance_ids = [] + + if hasattr(self.spec, 'zones') and self.spec.zones is not None: + self.zones = self.spec.zones + else: + self.zones = [self.spec.vm_spec.zone] + + self.region = util.GetRegionFromZones(self.zones) + self.subnets_owned_by_db = [] + self.subnets_used_by_db = [] def GetResourceMetadata(self): """Returns the metadata associated with the resource. 
@@ -132,8 +140,8 @@ def GetDefaultEngineVersion(engine): def _GetNewZones(self): """Returns a list of zones, excluding the one that the client VM is in.""" - zone = self.spec.vm_spec.zone - region = util.GetRegionFromZone(zone) + zones = self.zones + region = self.region get_zones_cmd = util.AWS_PREFIX + [ 'ec2', 'describe-availability-zones', @@ -143,9 +151,44 @@ def _GetNewZones(self): response = json.loads(stdout) all_zones = [item['ZoneName'] for item in response['AvailabilityZones'] if item['State'] == 'available'] - all_zones.remove(zone) + for zone in zones: + all_zones.remove(zone) return all_zones + def _CreateSubnetInZone(self, new_subnet_zone): + """Creates a new subnet in the same region as the client VM. + + Args: + new_subnet_zone: The zone for the subnet to be created. + Must be in the same region as the client + + Returns: + the new subnet resource + """ + cidr = self.client_vm.network.regional_network.vpc.NextSubnetCidrBlock() + logging.info('Attempting to create a subnet in zone %s' % new_subnet_zone) + new_subnet = ( + aws_network.AwsSubnet( + new_subnet_zone, + self.client_vm.network.regional_network.vpc.id, + cidr)) + new_subnet.Create() + logging.info('Successfully created a new subnet, subnet id is: %s', + new_subnet.id) + + # save for cleanup + self.subnets_used_by_db.append(new_subnet) + self.subnets_owned_by_db.append(new_subnet) + return new_subnet + + def _CreateSubnetInAllZonesAssumeClientZoneExists(self): + client_zone = self.client_vm.network.subnet.zone + for zone in self.zones: + if zone != client_zone: + self._CreateSubnetInZone(zone) + else: + self.subnets_used_by_db.append(self.client_vm.network.subnet) + def _CreateSubnetInAdditionalZone(self): """Creates a new subnet in the same region as the client VM. @@ -155,50 +198,38 @@ def _CreateSubnetInAdditionalZone(self): the new subnet resource Raises: - Exception if unable to create a subnet in any zones in the region. + Exception: if unable to create a subnet in any zones in the region. """ new_subnet_zones = self._GetNewZones() while len(new_subnet_zones) >= 1: try: new_subnet_zone = new_subnet_zones.pop() - logging.info('Attempting to create a second subnet in zone %s', - new_subnet_zone) - new_subnet = ( - aws_network.AwsSubnet( - new_subnet_zone, - self.client_vm.network.regional_network.vpc.id, - '10.0.1.0/24')) - new_subnet.Create() - logging.info('Successfully created a new subnet, subnet id is: %s', - new_subnet.id) - - # save for cleanup - self.extra_subnet_for_db = new_subnet + new_subnet = self._CreateSubnetInZone(new_subnet_zone) return new_subnet except: logging.info('Unable to create subnet in zone %s', new_subnet_zone) raise Exception('Unable to create subnet in any availability zones') - def _CreateDbSubnetGroup(self, new_subnet): + def _CreateDbSubnetGroup(self, subnets): """Creates a new db subnet group. - The db subnet group will consit of two zones: the client vm zone, - and another zone in the same region. - Args: - new_subnet: a subnet in the same region as the client VM's subnet + subnets: a list of strings. + The db subnet group will consit of all subnets in this list. 
""" - zone = self.spec.vm_spec.zone - region = util.GetRegionFromZone(zone) db_subnet_group_name = 'pkb-db-subnet-group-{0}'.format(FLAGS.run_uri) + create_db_subnet_group_cmd = util.AWS_PREFIX + [ 'rds', 'create-db-subnet-group', '--db-subnet-group-name', db_subnet_group_name, '--db-subnet-group-description', 'pkb_subnet_group_for_db', - '--subnet-ids', self.client_vm.network.subnet.id, new_subnet.id, - '--region', region] - stdout, stderr, _ = vm_util.IssueCommand(create_db_subnet_group_cmd) + '--region', self.region, + '--subnet-ids'] + for subnet in subnets: + create_db_subnet_group_cmd.append(subnet.id) + + vm_util.IssueCommand(create_db_subnet_group_cmd) # save for cleanup self.db_subnet_group_name = db_subnet_group_name @@ -207,10 +238,17 @@ def _CreateDbSubnetGroup(self, new_subnet): def _SetupNetworking(self): """Sets up the networking required for the RDS database.""" - zone = self.spec.vm_spec.zone - region = util.GetRegionFromZone(zone) - new_subnet = self._CreateSubnetInAdditionalZone() - self._CreateDbSubnetGroup(new_subnet) + if (self.spec.engine == managed_relational_db.MYSQL or + self.spec.engine == managed_relational_db.POSTGRES): + self.subnets_used_by_db.append(self.client_vm.network.subnet) + self._CreateSubnetInAdditionalZone() + elif self.spec.engine == managed_relational_db.AURORA_POSTGRES: + self._CreateSubnetInAllZonesAssumeClientZoneExists() + else: + raise Exception('Unknown how to create network for {0}'.format( + self.spec.engine)) + + self._CreateDbSubnetGroup(self.subnets_used_by_db) open_port_cmd = util.AWS_PREFIX + [ 'ec2', @@ -219,56 +257,110 @@ def _SetupNetworking(self): '--source-group', self.security_group_id, '--protocol', 'tcp', '--port={0}'.format(DEFAULT_POSTGRES_PORT), - '--region', region] + '--region', self.region] stdout, stderr, _ = vm_util.IssueCommand(open_port_cmd) logging.info('Granted DB port ingress, stdout is:\n%s\nstderr is:\n%s', stdout, stderr) def _TeardownNetworking(self): """Tears down all network resources that were created for the database.""" - zone = self.spec.vm_spec.zone - region = util.GetRegionFromZone(zone) if hasattr(self, 'db_subnet_group_name'): delete_db_subnet_group_cmd = util.AWS_PREFIX + [ 'rds', 'delete-db-subnet-group', '--db-subnet-group-name', self.db_subnet_group_name, - '--region', region] - stdout, stderr, _ = vm_util.IssueCommand(delete_db_subnet_group_cmd) + '--region', self.region] + vm_util.IssueCommand(delete_db_subnet_group_cmd) - if hasattr(self, 'extra_subnet_for_db'): - self.extra_subnet_for_db.Delete() + for subnet_for_db in self.subnets_owned_by_db: + subnet_for_db.Delete() def _Create(self): - """Creates the AWS RDS instance.""" - cmd = util.AWS_PREFIX + [ - 'rds', - 'create-db-instance', - '--db-instance-identifier=%s' % self.instance_id, - '--engine=%s' % self.spec.engine, - '--master-username=%s' % self.spec.database_username, - '--master-user-password=%s' % self.spec.database_password, - '--allocated-storage=%s' % self.spec.disk_spec.disk_size, - '--storage-type=%s' % self.spec.disk_spec.disk_type, - '--db-instance-class=%s' % self.spec.vm_spec.machine_type, - '--no-auto-minor-version-upgrade', - '--region=%s' % self.region, - '--engine-version=%s' % self.spec.engine_version, - '--db-subnet-group-name=%s' % self.db_subnet_group_name, - '--vpc-security-group-ids=%s' % self.security_group_id, - ] + """Creates the AWS RDS instance. 
- if self.spec.disk_spec.disk_type == aws_disk.IO1: - cmd.append('--iops=%s' % self.spec.disk_spec.iops) + Raises: + Exception: if unknown how to create self.spec.engine. - if self.spec.high_availability: - cmd.append('--multi-az') - else: - cmd.append('--availability-zone=%s' % self.spec.vm_spec.zone) + """ + if (self.spec.engine == managed_relational_db.MYSQL or + self.spec.engine == managed_relational_db.POSTGRES): - # TODO(ferneyhough): add backup_enabled and backup_window + instance_identifier = self.instance_id + self.all_instance_ids.append(instance_identifier) + cmd = util.AWS_PREFIX + [ + 'rds', + 'create-db-instance', + '--db-instance-identifier=%s' % instance_identifier, + '--engine=%s' % self.spec.engine, + '--master-username=%s' % self.spec.database_username, + '--master-user-password=%s' % self.spec.database_password, + '--allocated-storage=%s' % self.spec.disk_spec.disk_size, + '--storage-type=%s' % self.spec.disk_spec.disk_type, + '--db-instance-class=%s' % self.spec.vm_spec.machine_type, + '--no-auto-minor-version-upgrade', + '--region=%s' % self.region, + '--engine-version=%s' % self.spec.engine_version, + '--db-subnet-group-name=%s' % self.db_subnet_group_name, + '--vpc-security-group-ids=%s' % self.security_group_id, + ] + + if self.spec.disk_spec.disk_type == aws_disk.IO1: + cmd.append('--iops=%s' % self.spec.disk_spec.iops) + + if self.spec.high_availability: + cmd.append('--multi-az') + else: + cmd.append('--availability-zone=%s' % self.spec.vm_spec.zone) + + # TODO(ferneyhough): add backup_enabled and backup_window + + vm_util.IssueCommand(cmd) + + elif self.spec.engine == managed_relational_db.AURORA_POSTGRES: + + cluster_identifier = 'pkb-db-cluster-' + FLAGS.run_uri + #create the cluster + cmd = util.AWS_PREFIX + [ + 'rds', 'create-db-cluster', + '--db-cluster-identifier=%s' % cluster_identifier, + '--engine=aurora-postgresql', + '--master-username=%s' % self.spec.database_username, + '--master-user-password=%s' % self.spec.database_password, + '--region=%s' % self.region, + '--db-subnet-group-name=%s' % self.db_subnet_group_name, + '--vpc-security-group-ids=%s' % self.security_group_id, + '--availability-zones=%s' % self.spec.zones[0] + ] + self.cluster_id = cluster_identifier + vm_util.IssueCommand(cmd) + + for zone in self.zones: + + # The first instance is assumed to be writer - + # and so use the instance_id for that id. + if zone == self.zones[0]: + instance_identifier = self.instance_id + else: + instance_identifier = self.instance_id + '-' + zone + + self.all_instance_ids.append(instance_identifier) + + cmd = util.AWS_PREFIX + [ + 'rds', + 'create-db-instance', + '--db-instance-identifier=%s' % instance_identifier, + '--db-cluster-identifier=%s' % cluster_identifier, + '--engine=aurora-postgresql', + '--no-auto-minor-version-upgrade', + '--db-instance-class=%s' % self.spec.machine_type, + '--region=%s' % self.region, + '--availability-zone=%s' % zone + ] + vm_util.IssueCommand(cmd) - vm_util.IssueCommand(cmd) + else: + raise Exception('Unknown how to create AWS data base engine {0}'.format( + self.spec.engine)) def _Delete(self): """Deletes the underlying resource. @@ -277,14 +369,25 @@ def _Delete(self): be called multiple times, even if the resource has already been deleted. 
""" - cmd = util.AWS_PREFIX + [ - 'rds', - 'delete-db-instance', - '--db-instance-identifier=%s' % self.instance_id, - '--skip-final-snapshot', - '--region', self.region, - ] - vm_util.IssueCommand(cmd) + for current_instance_id in self.all_instance_ids: + cmd = util.AWS_PREFIX + [ + 'rds', + 'delete-db-instance', + '--db-instance-identifier=%s' % current_instance_id, + '--skip-final-snapshot', + '--region', self.region, + ] + vm_util.IssueCommand(cmd) + + if self.cluster_id is not None: + cmd = util.AWS_PREFIX + [ + 'rds', + 'delete-db-cluster', + '--db-cluster-identifier=%s' % self.cluster_id, + '--skip-final-snapshot', + '--region', self.region, + ] + vm_util.IssueCommand(cmd) def _Exists(self): """Returns true if the underlying resource exists. @@ -293,14 +396,18 @@ def _Exists(self): default is to assume success when _Create and _Delete do not raise exceptions. """ - cmd = util.AWS_PREFIX + [ - 'rds', - 'describe-db-instances', - '--db-instance-identifier=%s' % self.instance_id, - '--region=%s' % self.region - ] - _, _, retcode = vm_util.IssueCommand(cmd) - return retcode == 0 + for current_instance_id in self.all_instance_ids: + cmd = util.AWS_PREFIX + [ + 'rds', + 'describe-db-instances', + '--db-instance-identifier=%s' % current_instance_id, + '--region=%s' % self.region + ] + _, _, retcode = vm_util.IssueCommand(cmd) + if retcode != 0: + return False + + return True def _ParseEndpoint(self, describe_instance_json): """Parses the json output from the CLI and returns the endpoint. @@ -342,20 +449,50 @@ def _SavePrimaryAndSecondaryZones(self, describe_instance_json): def _IsReady(self, timeout=IS_READY_TIMEOUT): """Return true if the underlying resource is ready. + This method will query all of the instance every 5 seconds until + its instance state is 'available', or until a timeout occurs. + + Args: + timeout: timeout in seconds + + Returns: + True if the resource was ready in time, False if the wait timed out + or an Exception occurred. + """ + + if len(self.all_instance_ids) == 0: + return False + + for instance_id in self.all_instance_ids: + if not self._IsInstanceReady(instance_id, timeout): + return False + + return True + + def _PostCreate(self): + """Perform general post create operations on the cluster. + + """ + self._GetPortsForWriterInstance(self.all_instance_ids[0]) + + def _IsInstanceReady(self, instance_id, timeout=IS_READY_TIMEOUT): + """Return true if the instance is ready. + This method will query the instance every 5 seconds until its instance state is 'available', or until a timeout occurs. Args: + instance_id: string of the instance to check is ready timeout: timeout in seconds Returns: True if the resource was ready in time, False if the wait timed out - or an Exception occured. + or an Exception occurred. """ cmd = util.AWS_PREFIX + [ 'rds', 'describe-db-instances', - '--db-instance-identifier=%s' % self.instance_id, + '--db-instance-identifier=%s' % instance_id, '--region=%s' % self.region ] start_time = datetime.datetime.now() @@ -378,20 +515,32 @@ def _IsReady(self, timeout=IS_READY_TIMEOUT): return False time.sleep(5) + return True + + def _GetPortsForWriterInstance(self, instance_id): + """Assigns the ports and endpoints from the instance_id to self. 
+
+    These will be used to communicate with the database.
+    """
+    cmd = util.AWS_PREFIX + [
+        'rds',
+        'describe-db-instances',
+        '--db-instance-identifier=%s' % instance_id,
+        '--region=%s' % self.region
+    ]
+    stdout, _, _ = vm_util.IssueCommand(cmd)
+    json_output = json.loads(stdout)
     self.endpoint = self._ParseEndpoint(json_output)
     self.port = self._ParsePort(json_output)
-    return True
 
   def _AssertClientAndDbInSameRegion(self):
     """Asserts that the client vm is in the same region requested by the server.
 
     Raises:
-      AwsManagedRelationalDbCrossRegionException if the client vm is in a
+      AwsManagedRelationalDbCrossRegionException: if the client vm is in a
         different region than is requested by the server.
     """
-    client_region = self.client_vm.region
-    db_region = util.GetRegionFromZone(self.zone)
-    if client_region != db_region:
+    if self.client_vm.region != self.region:
       raise AwsManagedRelationalDbCrossRegionException((
           'client_vm and managed_relational_db server '
           'must be in the same region'))
diff --git a/perfkitbenchmarker/providers/aws/util.py b/perfkitbenchmarker/providers/aws/util.py
index fd7771c70c..1cabf71d96 100644
--- a/perfkitbenchmarker/providers/aws/util.py
+++ b/perfkitbenchmarker/providers/aws/util.py
@@ -42,6 +42,24 @@ def GetRegionFromZone(zone_or_region):
   return zone_or_region[:-1]
 
 
+def GetRegionFromZones(zones):
+  """Returns the region a set of zones are in.
+
+  Raises:
+    Exception: if the zones are in different regions.
+  """
+  region = None
+  for zone in zones:
+    current_region = GetRegionFromZone(zone)
+    if region is None:
+      region = current_region
+    else:
+      if region != current_region:
+        raise Exception('Not all zones are in the same region: %s vs %s. '
+                        'Zones: %s' % (region, current_region, ','.join(zones)))
+  return region
+
+
 def AddTags(resource_id, region, **kwargs):
   """Adds tags to an AWS resource created by PerfKitBenchmarker. 
diff --git a/tests/providers/aws/aws_managed_relational_db_test.py b/tests/providers/aws/aws_managed_relational_db_test.py index 46a4d69f67..653e6ed4db 100644 --- a/tests/providers/aws/aws_managed_relational_db_test.py +++ b/tests/providers/aws/aws_managed_relational_db_test.py @@ -24,6 +24,7 @@ from perfkitbenchmarker import vm_util from perfkitbenchmarker.configs import benchmark_config_spec from perfkitbenchmarker.managed_relational_db import MYSQL +from perfkitbenchmarker.managed_relational_db import AURORA_POSTGRES from perfkitbenchmarker.providers.aws import aws_managed_relational_db from perfkitbenchmarker.providers.aws import aws_disk from perfkitbenchmarker.providers.aws.aws_managed_relational_db import ( @@ -101,17 +102,19 @@ def createMockSpec(self, additional_spec_items={}): mock_db_spec.configure_mock(**spec_dict) return mock_db_spec - def createManagedDbFromSpec(self, additional_spec_items={}): - mock_spec = self.createMockSpec(additional_spec_items) + def createDbFromMockSpec(self, mock_spec): aws_db = AwsManagedRelationalDb(mock_spec) # Set necessary instance attributes that are not part of the spec aws_db.security_group_name = 'fake_security_group' aws_db.db_subnet_group_name = 'fake_db_subnet' aws_db.security_group_id = 'fake_security_group_id' - return aws_db + def createManagedDbFromSpec(self, additional_spec_items={}): + mock_spec = self.createMockSpec(additional_spec_items) + return self.createDbFromMockSpec(mock_spec) + def create(self, additional_spec_items={}): with self._PatchCriticalObjects() as issue_command: db = self.createManagedDbFromSpec(additional_spec_items) @@ -129,6 +132,53 @@ def testCreate(self): self.assertIn('--engine=mysql', command_string) self.assertIn('--master-user-password=fakepassword', command_string) + def createAuroraMockSpec(self, additional_spec_items={}): + + spec_dict = { + 'engine': AURORA_POSTGRES, + 'run_uri': '123', + 'database_name': 'fakedbname', + 'database_password': 'fakepassword', + 'database_username': 'fakeusername', + 'machine_type' : 'db.r4.4xlarge', + 'zones' : ['us-east-1a', 'us-east-1d'] + } + spec_dict.update(additional_spec_items) + + mock_db_spec = Mock( + spec=benchmark_config_spec._ManagedRelationalDbSpec) + mock_db_spec.configure_mock(**spec_dict) + return mock_db_spec + + def createAuroraDbFromSpec(self, additional_spec_items={}): + mock_spec = self.createAuroraMockSpec(additional_spec_items) + return self.createDbFromMockSpec(mock_spec) + + def createAurora(self, additional_spec_items={}): + with self._PatchCriticalObjects() as issue_command: + db = self.createAuroraDbFromSpec(additional_spec_items) + db._Create() + call_results = [] + for call in issue_command.call_args_list: + call_results.append(' '.join(call[0][0])) + return call_results + + def testCreateAurora(self): + command_strings = self.createAurora() + + self.assertIn( + '%s rds create-db-cluster' % _AWS_PREFIX, command_strings[0]) + self.assertIn('--db-cluster-identifier=pkb-db-cluster-123', + command_strings[0]) + self.assertIn('--engine=aurora-postgresql', command_strings[0]) + self.assertIn('--master-user-password=fakepassword', command_strings[0]) + + self.assertIn( + '%s rds create-db-instance' % _AWS_PREFIX, command_strings[1]) + self.assertIn('--db-cluster-identifier=pkb-db-cluster-123', + command_strings[1]) + self.assertIn('--engine=aurora-postgresql', command_strings[1]) + def testNoHighAvailability(self): spec_dict = { 'multi_az': False, @@ -177,6 +227,7 @@ def testIsNotReady(self): test_data = 
readTestDataFile('aws-describe-db-instances-creating.json')
    with self._PatchCriticalObjects(stdout=test_data):
      db = self.createManagedDbFromSpec()
+      db.all_instance_ids.append('pkb-db-instance-123')
 
       self.assertEqual(False, db._IsReady(timeout=0))
 
@@ -184,6 +235,7 @@ def testIsReady(self):
     test_data = readTestDataFile('aws-describe-db-instances-available.json')
     with self._PatchCriticalObjects(stdout=test_data):
       db = self.createManagedDbFromSpec()
+      db.all_instance_ids.append('pkb-db-instance-123')
 
       self.assertEqual(True, db._IsReady())
 
@@ -206,6 +258,7 @@ def testParsePort(self):
   def testDelete(self):
     with self._PatchCriticalObjects() as issue_command:
       db = self.createManagedDbFromSpec()
+      db.all_instance_ids.append('pkb-db-instance-123')
       db._Delete()
       command_string = ' '.join(issue_command.call_args[0][0])
 
From 849287cdc608aca72e94a0824e9d0f2ef5ff3b7b Mon Sep 17 00:00:00 2001
From: deitz
Date: Thu, 30 Nov 2017 18:01:09 -0800
Subject: [PATCH 23/32] Add an IMAGE_OWNER constant to AWS virtual machine.

-------------
Created by MOE:
  https://github.com/google/moe
MOE_MIGRATED_REVID=177535383
---
 perfkitbenchmarker/providers/aws/aws_virtual_machine.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/perfkitbenchmarker/providers/aws/aws_virtual_machine.py b/perfkitbenchmarker/providers/aws/aws_virtual_machine.py
index 409feb1bac..1658a53ca9 100644
--- a/perfkitbenchmarker/providers/aws/aws_virtual_machine.py
+++ b/perfkitbenchmarker/providers/aws/aws_virtual_machine.py
@@ -284,6 +284,7 @@ class AwsVirtualMachine(virtual_machine.BaseVirtualMachine):
 
   CLOUD = providers.AWS
   IMAGE_NAME_FILTER = None
+  IMAGE_OWNER = None
   DEFAULT_ROOT_DISK_TYPE = 'gp2'
 
   _lock = threading.Lock()
@@ -372,6 +373,8 @@ def _GetDefaultImage(cls, machine_type, region):
         'Name=block-device-mapping.volume-type,Values=%s' %
         cls.DEFAULT_ROOT_DISK_TYPE,
         'Name=virtualization-type,Values=%s' % virt_type]
+    if cls.IMAGE_OWNER:
+      describe_cmd.extend(['--owners', cls.IMAGE_OWNER])
     stdout, _ = util.IssueRetryableCommand(describe_cmd)
 
     if not stdout:
@@ -663,11 +666,13 @@ def AddMetadata(self, **kwargs):
 
 class DebianBasedAwsVirtualMachine(AwsVirtualMachine,
                                    linux_virtual_machine.DebianMixin):
   IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-trusty-14.04-amd64-server-20*'
+  IMAGE_OWNER = '099720109477'  # Canonical's account ID (Ubuntu images).
 
 
 class JujuBasedAwsVirtualMachine(AwsVirtualMachine,
                                  linux_virtual_machine.JujuMixin):
   IMAGE_NAME_FILTER = 'ubuntu/images/*/ubuntu-trusty-14.04-amd64-server-20*'
+  IMAGE_OWNER = '099720109477'  # Canonical's account ID (Ubuntu images). 
class RhelBasedAwsVirtualMachine(AwsVirtualMachine, From cea000b95dd963c2ac8035c869e782c259c328c1 Mon Sep 17 00:00:00 2001 From: ehankland Date: Fri, 1 Dec 2017 10:39:18 -0800 Subject: [PATCH 24/32] Only emit boot samples once per test ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177609813 --- perfkitbenchmarker/pkb.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/perfkitbenchmarker/pkb.py b/perfkitbenchmarker/pkb.py index 77f3c52d78..c37775e92d 100755 --- a/perfkitbenchmarker/pkb.py +++ b/perfkitbenchmarker/pkb.py @@ -493,9 +493,6 @@ def DoRunPhase(spec, collector, timer): try: with timer.Measure('Benchmark Run'): samples = spec.BenchmarkRun(spec) - if (FLAGS.boot_samples or - spec.name == cluster_boot_benchmark.BENCHMARK_NAME): - samples.extend(cluster_boot_benchmark.GetTimeToBoot(spec.vms)) except Exception: consecutive_failures += 1 if consecutive_failures > FLAGS.run_stage_retries: @@ -518,6 +515,10 @@ def DoRunPhase(spec, collector, timer): last_publish_time = time.time() run_number += 1 if time.time() > deadline: + if (FLAGS.boot_samples or + spec.name == cluster_boot_benchmark.BENCHMARK_NAME): + collector.AddSamples( + cluster_boot_benchmark.GetTimeToBoot(spec.vms), spec.name, spec) break From 99ffbf232884012d08062c82e232414ce0404956 Mon Sep 17 00:00:00 2001 From: ehankland Date: Fri, 1 Dec 2017 13:49:52 -0800 Subject: [PATCH 25/32] Add descending integerlist range capabilities ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177635647 --- perfkitbenchmarker/flag_util.py | 3 ++- perfkitbenchmarker/linux_benchmarks/multichase_benchmark.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/perfkitbenchmarker/flag_util.py b/perfkitbenchmarker/flag_util.py index 9cf0c61f0a..a0e03e68ba 100644 --- a/perfkitbenchmarker/flag_util.py +++ b/perfkitbenchmarker/flag_util.py @@ -109,8 +109,8 @@ def __str__(self): def _CreateXrangeFromTuple(self, input_tuple): start = input_tuple[0] - stop_inclusive = input_tuple[1] + 1 step = 1 if len(input_tuple) == 2 else input_tuple[2] + stop_inclusive = input_tuple[1] + (1 if step > 0 else -1) return xrange(start, stop_inclusive, step) @@ -187,6 +187,7 @@ def HandleNonIncreasing(): low = int(match.group(1)) high = int(match.group(3)) step = int(match.group(5)) if match.group(5) is not None else 1 + step = step if low <= high else -step if high <= low or (len(result) > 0 and low <= result[-1]): HandleNonIncreasing() diff --git a/perfkitbenchmarker/linux_benchmarks/multichase_benchmark.py b/perfkitbenchmarker/linux_benchmarks/multichase_benchmark.py index 46c347fe7c..4699560b60 100644 --- a/perfkitbenchmarker/linux_benchmarks/multichase_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/multichase_benchmark.py @@ -285,7 +285,7 @@ def Run(benchmark_spec): for thread_count in FLAGS.multichase_thread_count: if thread_count > vm.num_cpus: - break + continue memory_size_iterator = _IterMemorySizes( lambda: vm.total_memory_kb * 1024, FLAGS.multichase_memory_size_min, FLAGS.multichase_memory_size_max) From 67d3430d5682e72bd9e86aa5b1afd2fe46bbdc4f Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Fri, 1 Dec 2017 16:10:42 -0800 Subject: [PATCH 26/32] Change the NVIDIA permissions daemonset so that it does not consume GPU resources, but still works as intended. This allows all GPUs on the node to be useable by pods. Furthermore, pods can set the clock speed of the GPUs, as this daemonset grants them permissions to do so. 
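
For example, after this daemonset has run on a node, a pod without a privileged
security context can set application clocks directly. A hedged sketch (the K80
clock values are the ones used in the PKB unit tests, chosen here only for
illustration):

    import subprocess

    def SetApplicationClocks(memory_mhz=2505, graphics_mhz=875, device_id=0):
      # Succeeds without root only because the daemonset already ran
      # `nvidia-smi --applications-clocks-permission=UNRESTRICTED` on the node.
      subprocess.check_call(
          ['nvidia-smi', '-ac', '%d,%d' % (memory_mhz, graphics_mhz),
           '--id=%d' % device_id])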
------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177654682 --- ...dia_unrestricted_permissions_daemonset.yml | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml b/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml index 1a2930c2dd..dd9b7e7cc1 100644 --- a/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml +++ b/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml @@ -1,3 +1,15 @@ +# This file defines a daemonset which runs automatically on all +# kubernetes nodes. It is used like so: kubectl create -f . +# The daemonset does the following: +# - enables persistence mode on the nvidia driver +# - allows all users to set the GPU clock speed +# In effect, this allows pods created without a privilaged security context to +# set the GPU clock speeds +# This daemonset config does not define GPU resources, because otherwise it +# would consume them, leaving them unavaliable to pods. Instead, it runs in +# privilaged mode (so it can see all GPUs), and manually mounts the CUDA +# lib and bin directories. + apiVersion: apps/v1beta2 kind: DaemonSet metadata: @@ -16,8 +28,16 @@ spec: image: nvidia/cuda:8.0-devel-ubuntu16.04 securityContext: privileged: true - resources: - limits: - nvidia.com/gpu: 1 - command: [ "/bin/bash", "-c", "nvidia-smi -pm 1 && nvidia-smi --applications-clocks-permission=UNRESTRICTED && nvidia-smi --auto-boost-permission=UNRESTRICTED && tail -f /dev/null" ] - + command: [ "/bin/bash", "-c", "export PATH=$PATH:/usr/local/bin/nvidia/ && nvidia-smi -pm 1 && nvidia-smi --applications-clocks-permission=UNRESTRICTED && nvidia-smi --auto-boost-permission=UNRESTRICTED && tail -f /dev/null" ] + volumeMounts: + - name: nvidia-debug-tools + mountPath: /usr/local/bin/nvidia + - name: nvidia-libraries + mountPath: /usr/local/nvidia/lib64 + volumes: + - name: nvidia-debug-tools + hostPath: + path: /home/kubernetes/bin/nvidia/bin + - name: nvidia-libraries + hostPath: + path: /home/kubernetes/bin/nvidia/lib From 6c299213af40b4ae902dcaaf809a867e24e01d2a Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Fri, 1 Dec 2017 17:26:12 -0800 Subject: [PATCH 27/32] Change the NVIDIA permissions daemonset so that it waits until nvidia-smi is available on PATH. This prevents the daemonset from exiting in error if it runs before nvidia-smi becomes available. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=177662581 --- .../data/nvidia_unrestricted_permissions_daemonset.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml b/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml index dd9b7e7cc1..29b7178259 100644 --- a/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml +++ b/perfkitbenchmarker/data/nvidia_unrestricted_permissions_daemonset.yml @@ -1,13 +1,14 @@ # This file defines a daemonset which runs automatically on all # kubernetes nodes. It is used like so: kubectl create -f . 
# The daemonset does the following: +# - waits until nvidia-smi is mounted and available on PATH # - enables persistence mode on the nvidia driver # - allows all users to set the GPU clock speed -# In effect, this allows pods created without a privilaged security context to +# In effect, this allows pods created without a privileged security context to # set the GPU clock speeds # This daemonset config does not define GPU resources, because otherwise it -# would consume them, leaving them unavaliable to pods. Instead, it runs in -# privilaged mode (so it can see all GPUs), and manually mounts the CUDA +# would consume them, leaving them unavailable to pods. Instead, it runs in +# privileged mode (so it can see all GPUs), and manually mounts the CUDA # lib and bin directories. apiVersion: apps/v1beta2 @@ -28,7 +29,7 @@ spec: image: nvidia/cuda:8.0-devel-ubuntu16.04 securityContext: privileged: true - command: [ "/bin/bash", "-c", "export PATH=$PATH:/usr/local/bin/nvidia/ && nvidia-smi -pm 1 && nvidia-smi --applications-clocks-permission=UNRESTRICTED && nvidia-smi --auto-boost-permission=UNRESTRICTED && tail -f /dev/null" ] + command: [ "/bin/bash", "-c", "export PATH=$PATH:/usr/local/bin/nvidia/ && while [ ! $(type -p nvidia-smi) ]; do echo waiting for nvidia-smi to mount...; sleep 2; done && nvidia-smi -pm 1 && nvidia-smi --applications-clocks-permission=UNRESTRICTED && nvidia-smi --auto-boost-permission=UNRESTRICTED && tail -f /dev/null" ] volumeMounts: - name: nvidia-debug-tools mountPath: /usr/local/bin/nvidia From 6bfba38945cd53ec3f0071991a994f41d6353c19 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Tue, 5 Dec 2017 14:16:53 -0800 Subject: [PATCH 28/32] Add gpu_type and num_gpus to container_service resource metadata. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=178009683 --- perfkitbenchmarker/container_service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/perfkitbenchmarker/container_service.py b/perfkitbenchmarker/container_service.py index 84693550fb..883ed12a3c 100644 --- a/perfkitbenchmarker/container_service.py +++ b/perfkitbenchmarker/container_service.py @@ -90,6 +90,11 @@ def GetResourceMetadata(self): 'zone': self.zone, 'size': self.num_nodes, } + if self.gpu_count: + metadata.update({ + 'gpu_type': self.gpu_type, + 'num_gpus': self.gpu_count, + }) return metadata From b5bd95ef3627db6d54a2011d916c6ad22f0566b3 Mon Sep 17 00:00:00 2001 From: ferneyhough Date: Tue, 5 Dec 2017 15:50:24 -0800 Subject: [PATCH 29/32] Update GKE cluster version to 1.8.4-gke.0 when creating a cluster with accelerators. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=178023686 --- perfkitbenchmarker/providers/gcp/google_container_engine.py | 4 ++-- tests/providers/gcp/google_container_engine_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/perfkitbenchmarker/providers/gcp/google_container_engine.py b/perfkitbenchmarker/providers/gcp/google_container_engine.py index b5d0c93075..2a88c31617 100644 --- a/perfkitbenchmarker/providers/gcp/google_container_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_container_engine.py @@ -46,7 +46,7 @@ def __init__(self, spec): self.gce_accelerator_type_override = FLAGS.gce_accelerator_type_override def GetResourceMetadata(self): - """Returns a dict containing metadata about the VM. + """Returns a dict containing metadata about the cluster. Returns: dict mapping string property key to value. 
@@ -64,7 +64,7 @@ def _Create(self): # for google_container_engine however). cmd = util.GcloudCommand( self, 'alpha', 'container', 'clusters', 'create', self.name, - '--enable-kubernetes-alpha', '--cluster-version', '1.8.1-gke.1') + '--enable-kubernetes-alpha', '--cluster-version', '1.8.4-gke.0') cmd.flags['accelerator'] = (gce_virtual_machine. GenerateAcceleratorSpecString(self.gpu_type, diff --git a/tests/providers/gcp/google_container_engine_test.py b/tests/providers/gcp/google_container_engine_test.py index fcd7e6d504..6804943ebb 100644 --- a/tests/providers/gcp/google_container_engine_test.py +++ b/tests/providers/gcp/google_container_engine_test.py @@ -151,7 +151,7 @@ def testCreate(self): self.assertEqual(issue_command.call_count, 1) self.assertIn('gcloud alpha container clusters create', command_string) self.assertIn('--enable-kubernetes-alpha', command_string) - self.assertIn('--cluster-version 1.8.1-gke.1', command_string) + self.assertIn('--cluster-version 1.8.4-gke.0', command_string) self.assertIn('--num-nodes 2', command_string) self.assertIn('--machine-type fake-machine-type', command_string) self.assertIn('--accelerator type=nvidia-tesla-k80,count=2', From 54cb16d51a6437203e71aac92657bc032a938f09 Mon Sep 17 00:00:00 2001 From: deitz Date: Wed, 6 Dec 2017 16:00:03 -0800 Subject: [PATCH 30/32] Import changes from GitHub with MOE. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=178167242 --- CHANGES.next.md | 6 +++++- .../providers/profitbricks/__init__.py | 2 +- perfkitbenchmarker/providers/profitbricks/flags.py | 13 ++++++++++--- .../profitbricks/profitbricks_virtual_machine.py | 10 ++++++++-- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/CHANGES.next.md b/CHANGES.next.md index f330cdb429..d773d63dc8 100644 --- a/CHANGES.next.md +++ b/CHANGES.next.md @@ -9,9 +9,13 @@ - Windows benchmarks can now be run from linux controllers ###Enhancements: +- Support for ProfitBricks API v4: + - Add `profitbricks_image_alias` flag and support for image aliases + - Add new location, `us/ewr` - Add aws_image_name_filter flag to ease specifying images. ###Bug fixes and maintenance updates: - Moved GPU-related specs from GceVmSpec to BaseVmSpec +- Fix ProfitBricks issue with extra `/` in the API url +- Fix ProfitBricks volume availability zone issue - Bulk AllowPort restored. - diff --git a/perfkitbenchmarker/providers/profitbricks/__init__.py b/perfkitbenchmarker/providers/profitbricks/__init__.py index 391845adcf..cd63f632aa 100644 --- a/perfkitbenchmarker/providers/profitbricks/__init__.py +++ b/perfkitbenchmarker/providers/profitbricks/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. """ProfitBricks cloud provider implementation.""" -PROFITBRICKS_API = 'https://api.profitbricks.com/cloudapi/v3/' +PROFITBRICKS_API = 'https://api.profitbricks.com/cloudapi/v4' diff --git a/perfkitbenchmarker/providers/profitbricks/flags.py b/perfkitbenchmarker/providers/profitbricks/flags.py index 64da831bb9..6e70695b25 100644 --- a/perfkitbenchmarker/providers/profitbricks/flags.py +++ b/perfkitbenchmarker/providers/profitbricks/flags.py @@ -19,6 +19,7 @@ # Locations US_LAS = 'us/las' +US_EWR = 'us/ewr' DE_FKB = 'de/fkb' DE_FRA = 'de/fra' @@ -39,11 +40,17 @@ 'Can also be set via $PROFITBRICKS_CONFIG environment ' 'variable.\n(File format: email:password)')) +flags.DEFINE_string('profitbricks_image_alias', None, + 'An alias to a ProfitBricks public image. If given, ' + 'it will be used instead of a default Ubuntu 14 image. 
' + 'E.g., "ubuntu:latest" indicates the latest version of ' + 'Ubuntu is to be used to provision a volume.') + flags.DEFINE_enum('profitbricks_location', US_LAS, - [US_LAS, DE_FKB, DE_FRA], + [US_LAS, US_EWR, DE_FKB, DE_FRA], ('Location of data center to be provisioned (us/las, ' - 'de/fkb, de/fra)')) + 'us/ewr, de/fkb, de/fra)')) flags.DEFINE_enum('profitbricks_boot_volume_type', HDD, @@ -56,7 +63,7 @@ flags.DEFINE_enum('availability_zone', AUTO, - [AUTO, ZONE_1, ZONE_2, ZONE_2], + [AUTO, ZONE_1, ZONE_2, ZONE_3], ('Direct a storage volume to be created in one of three ' 'zones per data center (AUTO, ' 'ZONE_1, ZONE_2, ZONE_3)')) diff --git a/perfkitbenchmarker/providers/profitbricks/profitbricks_virtual_machine.py b/perfkitbenchmarker/providers/profitbricks/profitbricks_virtual_machine.py index b74da525d9..3e7411ab28 100644 --- a/perfkitbenchmarker/providers/profitbricks/profitbricks_virtual_machine.py +++ b/perfkitbenchmarker/providers/profitbricks/profitbricks_virtual_machine.py @@ -139,6 +139,8 @@ def _ApplyFlags(cls, config_values, flag_values): flag_values.profitbricks_boot_volume_size if flag_values['availability_zone'].present: config_values['availability_zone'] = flag_values.availability_zone + if flag_values['profitbricks_image_alias'].present: + config_values['image_alias'] = flag_values.profitbricks_image_alias @classmethod def _GetOptionDecoderConstructions(cls): @@ -153,6 +155,7 @@ def _GetOptionDecoderConstructions(cls): result.update({ 'machine_type': (MachineTypeDecoder, {}), 'location': (option_decoders.StringDecoder, {'default': 'us/las'}), + 'image_alias': (option_decoders.StringDecoder, {'default': None}), 'boot_volume_type': (option_decoders.StringDecoder, {'default': 'HDD'}), 'boot_volume_size': (option_decoders.IntDecoder, {'default': 10, 'min': 10}), @@ -194,6 +197,7 @@ def __init__(self, vm_spec): self.cores = vm_spec.cores self.machine_type = vm_spec.machine_type self.image = self.image or self.DEFAULT_IMAGE + self.image_alias = vm_spec.image_alias self.boot_volume_type = vm_spec.boot_volume_type self.boot_volume_size = vm_spec.boot_volume_size self.location = vm_spec.location @@ -212,8 +216,9 @@ def _Create(self): with open(self.ssh_public_key) as f: public_key = f.read().rstrip('\n') - # Find an Ubuntu image that matches our location - self.image = util.ReturnImage(self.header, self.location) + if self.image_alias is None: + # Find an Ubuntu image that matches our location + self.image = util.ReturnImage(self.header, self.location) # Create server POST body new_server = { @@ -231,6 +236,7 @@ def _Create(self): 'size': self.boot_volume_size, 'name': 'boot volume', 'image': self.image, + 'imageAlias': self.image_alias, 'type': self.boot_volume_type, 'sshKeys': [public_key], 'availabilityZone': self.availability_zone From 18296489f1fc76a84469c0958ba37513bc36a9ab Mon Sep 17 00:00:00 2001 From: tohaowu Date: Wed, 6 Dec 2017 16:48:36 -0800 Subject: [PATCH 31/32] The model we used is resnet not restnet. 
------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=178173609 --- perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py index c54b3a7e7a..5a71698df3 100644 --- a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py @@ -105,7 +105,7 @@ def LocalParameterDeviceValidator(value): DEFAULT_BATCH_SIZES_BY_MODEL = { 'vgg16': 32, 'alexnet': 512, - 'restnet152': 32, + 'resnet152': 32, } From 780487da354d9a9999f336b3627e9780a67e5099 Mon Sep 17 00:00:00 2001 From: Steven Deitz Date: Thu, 7 Dec 2017 10:12:47 -0800 Subject: [PATCH 32/32] Fix tox flake8 errors and a unit test. The unit test was expecting FLAGS.run_uri to be a string, but it was None. --- perfkitbenchmarker/container_service.py | 4 ++-- .../linux_benchmarks/tensorflow_benchmark.py | 5 ++--- perfkitbenchmarker/linux_packages/redis_server.py | 9 +++++---- perfkitbenchmarker/linux_packages/tensorflow.py | 1 + .../providers/aws/aws_managed_relational_db.py | 2 +- tests/linux_packages/cuda_toolkit_8_test.py | 8 ++++---- tests/providers/aws/aws_managed_relational_db_test.py | 4 ++-- tests/providers/gcp/gcp_managed_relational_db_test.py | 6 +++++- tests/providers/gcp/google_container_engine_test.py | 3 +-- 9 files changed, 23 insertions(+), 19 deletions(-) diff --git a/perfkitbenchmarker/container_service.py b/perfkitbenchmarker/container_service.py index 883ed12a3c..49e183f2e6 100644 --- a/perfkitbenchmarker/container_service.py +++ b/perfkitbenchmarker/container_service.py @@ -92,8 +92,8 @@ def GetResourceMetadata(self): } if self.gpu_count: metadata.update({ - 'gpu_type': self.gpu_type, - 'num_gpus': self.gpu_count, + 'gpu_type': self.gpu_type, + 'num_gpus': self.gpu_count, }) return metadata diff --git a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py index 5a71698df3..21a4e964dd 100644 --- a/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/tensorflow_benchmark.py @@ -14,7 +14,6 @@ """Run Tensorflow benchmarks (https://github.com/tensorflow/benchmarks).""" -import posixpath import re from perfkitbenchmarker import configs from perfkitbenchmarker import flags @@ -313,8 +312,8 @@ def Run(benchmark_spec): # Add vm index to results metadata for idx, vm_result in enumerate(run_results): - for sample in vm_result: - sample.metadata['vm_index'] = idx + for result_sample in vm_result: + result_sample.metadata['vm_index'] = idx # Flatten the list flattened_results = ( diff --git a/perfkitbenchmarker/linux_packages/redis_server.py b/perfkitbenchmarker/linux_packages/redis_server.py index eeac8d4b50..1223dfd81b 100644 --- a/perfkitbenchmarker/linux_packages/redis_server.py +++ b/perfkitbenchmarker/linux_packages/redis_server.py @@ -58,13 +58,14 @@ def AptInstall(vm): def Configure(vm): """Configure redis server.""" - sed_cmd = (r"sed -i -e '/^save /d' -e 's/# *save \"\"/save \"\"/' " - "{0}/redis.conf").format(REDIS_DIR) + sed_cmd = ( + r"sed -i -e '/^save /d' -e 's/# *save \"\"/save \"\"/' " + "{0}/redis.conf").format(REDIS_DIR) vm.RemoteCommand(sed_cmd) if FLAGS.redis_enable_aof: vm.RemoteCommand( - (r'sed -i -e "s/appendonly no/appendonly yes/g" {0}/redis.conf' - ).format(REDIS_DIR)) + r'sed -i -e "s/appendonly no/appendonly yes/g" 
{0}/redis.conf'.format( + REDIS_DIR)) vm.RemoteCommand(( r'sed -i -e "s/appendfsync everysec/# appendfsync everysec/g" ' r'{0}/redis.conf' diff --git a/perfkitbenchmarker/linux_packages/tensorflow.py b/perfkitbenchmarker/linux_packages/tensorflow.py index 02b51bda22..fcafe16c8c 100644 --- a/perfkitbenchmarker/linux_packages/tensorflow.py +++ b/perfkitbenchmarker/linux_packages/tensorflow.py @@ -39,6 +39,7 @@ def _GetEnvironmentVars(vm): posixpath.join(FLAGS.cuda_toolkit_installation_dir, lib_name), ]) + def GetTensorFlowVersion(vm): """Returns the version of tensorflow installed on the vm. diff --git a/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py b/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py index 1f4a51c422..23fb3d5d69 100644 --- a/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py +++ b/perfkitbenchmarker/providers/aws/aws_managed_relational_db.py @@ -319,7 +319,7 @@ def _Create(self): elif self.spec.engine == managed_relational_db.AURORA_POSTGRES: cluster_identifier = 'pkb-db-cluster-' + FLAGS.run_uri - #create the cluster + # Create the cluster. cmd = util.AWS_PREFIX + [ 'rds', 'create-db-cluster', '--db-cluster-identifier=%s' % cluster_identifier, diff --git a/tests/linux_packages/cuda_toolkit_8_test.py b/tests/linux_packages/cuda_toolkit_8_test.py index 7c225ec696..5f1b8e0638 100644 --- a/tests/linux_packages/cuda_toolkit_8_test.py +++ b/tests/linux_packages/cuda_toolkit_8_test.py @@ -129,8 +129,8 @@ def testSetAutoboostPolicyWhenValuesAreTheSame(self, @mock.patch(cuda_toolkit_8.__name__ + '.QueryAutoboostPolicy', return_value=AUTOBOOST_DISABLED_DICT) def testSetAutoboostPolicyWhenValuesAreDifferent(self, - query_autoboost_mock, - num_gpus_mock): + query_autoboost_mock, + num_gpus_mock): vm = mock.MagicMock() vm.RemoteCommand = mock.MagicMock() @@ -140,7 +140,7 @@ def testSetAutoboostPolicyWhenValuesAreDifferent(self, @mock.patch(cuda_toolkit_8.__name__ + '.QueryNumberOfGpus', return_value=2) @mock.patch(cuda_toolkit_8.__name__ + '.QueryGpuClockSpeed', - return_value=(2505,875)) + return_value=(2505, 875)) def testSetClockSpeedWhenValuesAreTheSame(self, query_clock_speed_mock, num_gpus_mock): @@ -153,7 +153,7 @@ def testSetClockSpeedWhenValuesAreTheSame(self, @mock.patch(cuda_toolkit_8.__name__ + '.QueryNumberOfGpus', return_value=2) @mock.patch(cuda_toolkit_8.__name__ + '.QueryGpuClockSpeed', - return_value=(2505,875)) + return_value=(2505, 875)) def testSetClockSpeedWhenValuesAreDifferent(self, query_clock_speed_mock, num_gpus_mock): diff --git a/tests/providers/aws/aws_managed_relational_db_test.py b/tests/providers/aws/aws_managed_relational_db_test.py index 653e6ed4db..8d85442ee0 100644 --- a/tests/providers/aws/aws_managed_relational_db_test.py +++ b/tests/providers/aws/aws_managed_relational_db_test.py @@ -140,8 +140,8 @@ def createAuroraMockSpec(self, additional_spec_items={}): 'database_name': 'fakedbname', 'database_password': 'fakepassword', 'database_username': 'fakeusername', - 'machine_type' : 'db.r4.4xlarge', - 'zones' : ['us-east-1a', 'us-east-1d'] + 'machine_type': 'db.r4.4xlarge', + 'zones': ['us-east-1a', 'us-east-1d'] } spec_dict.update(additional_spec_items) diff --git a/tests/providers/gcp/gcp_managed_relational_db_test.py b/tests/providers/gcp/gcp_managed_relational_db_test.py index 894720b959..efbe82a199 100644 --- a/tests/providers/gcp/gcp_managed_relational_db_test.py +++ b/tests/providers/gcp/gcp_managed_relational_db_test.py @@ -29,6 +29,7 @@ from perfkitbenchmarker.providers.gcp import gce_virtual_machine from 
perfkitbenchmarker.providers.gcp import util from perfkitbenchmarker import disk +from tests import mock_flags _BENCHMARK_NAME = 'name' _BENCHMARK_UID = 'benchmark_uid' @@ -40,7 +41,10 @@ def CreateManagedDbFromSpec(spec_dict): mock_db_spec = mock.Mock( spec=benchmark_config_spec._ManagedRelationalDbSpec) mock_db_spec.configure_mock(**spec_dict) - db_class = gcp_managed_relational_db.GCPManagedRelationalDb(mock_db_spec) + mocked_flags = mock_flags.MockFlags() + mocked_flags.run_uri = 'mock-run-uri' + with mock_flags.PatchFlags(mocked_flags): + db_class = gcp_managed_relational_db.GCPManagedRelationalDb(mock_db_spec) return db_class diff --git a/tests/providers/gcp/google_container_engine_test.py b/tests/providers/gcp/google_container_engine_test.py index 6804943ebb..d3c88d617b 100644 --- a/tests/providers/gcp/google_container_engine_test.py +++ b/tests/providers/gcp/google_container_engine_test.py @@ -31,6 +31,7 @@ _NVIDIA_DRIVER_SETUP_DAEMON_SET_SCRIPT = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/k8s-1.8/device-plugin-daemonset.yaml' _NVIDIA_UNRESTRICTED_PERMISSIONS_DAEMON_SET = 'nvidia_unrestricted_permissions_daemonset.yml' + @contextlib2.contextmanager def patch_critical_objects(stdout='', stderr='', return_code=0): with contextlib2.ExitStack() as stack: @@ -184,5 +185,3 @@ def testPostCreate(self, create_from_file_patch): # driver setup daemon set, followed by the # NVIDIA unrestricted permissions daemon set. create_from_file_patch.assert_has_calls(expected_calls) - -
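A note on the FLAGS.run_uri fix in the final patch: PKB resources read
flags at construction time, so unit tests that instantiate a resource
directly must patch the flags first. A minimal sketch of the pattern the
GCP managed-relational-db test now uses (the mock spec fixture is assumed
to be built elsewhere in the test, as in the diff above):

    from perfkitbenchmarker.providers.gcp import gcp_managed_relational_db
    from tests import mock_flags

    def CreateDbForTest(mock_db_spec):
      # GCPManagedRelationalDb reads FLAGS.run_uri in __init__; under unit
      # tests the flag is unparsed (None), so patch it for construction.
      mocked_flags = mock_flags.MockFlags()
      mocked_flags.run_uri = 'mock-run-uri'
      with mock_flags.PatchFlags(mocked_flags):
        return gcp_managed_relational_db.GCPManagedRelationalDb(mock_db_spec)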