From 585f0933810e13486bc99f9e32884e586aa50089 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Tue, 26 Jan 2016 17:44:40 -0500 Subject: [PATCH 1/9] Start ICloudAPI change. --- flocker/node/agents/blockdevice.py | 9 ++++++--- flocker/node/agents/cinder.py | 7 +++++-- flocker/node/agents/ebs.py | 5 ++++- flocker/node/agents/test/test_blockdevice.py | 12 ++++++++++-- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/flocker/node/agents/blockdevice.py b/flocker/node/agents/blockdevice.py index 287486cd76..8eb33f268c 100644 --- a/flocker/node/agents/blockdevice.py +++ b/flocker/node/agents/blockdevice.py @@ -1121,8 +1121,10 @@ def list_live_nodes(): This is used to figure out which nodes are dead, so that other nodes can do the detach. - :returns: A collection of ``unicode`` compute instance IDs, compatible - with those returned by ``IBlockDeviceAPI.compute_instance_id``. + :returns: A mapping of ``unicode`` compute instance IDs + (compatible with those returned by + ``IBlockDeviceAPI.compute_instance_id``) to IPs of those + nodes, also as unicode.. """ def start_node(node_id): @@ -1647,7 +1649,8 @@ def is_existing_block_device(dataset_id, path): pass if ICloudAPI.providedBy(self._underlying_blockdevice_api): - live_instances = self._underlying_blockdevice_api.list_live_nodes() + live_instances = list( + self._underlying_blockdevice_api.list_live_nodes()) else: # Can't know accurately who is alive and who is dead: live_instances = None diff --git a/flocker/node/agents/cinder.py b/flocker/node/agents/cinder.py index 6b14b0b6c2..dac04d7f48 100644 --- a/flocker/node/agents/cinder.py +++ b/flocker/node/agents/cinder.py @@ -689,8 +689,11 @@ def get_device_path(self, blockdevice_id): # ICloudAPI: def list_live_nodes(self): - return list(server.id for server in self.nova_server_manager.list() - if server.status == u'ACTIVE') + return {server.id: + list(map( + unicode, _extract_nova_server_addresses(server.addresses))) + for server in self.nova_server_manager.list() + if server.status == u'ACTIVE'} def start_node(self, node_id): server = self.nova_server_manager.get(node_id) diff --git a/flocker/node/agents/ebs.py b/flocker/node/agents/ebs.py index 90259fa9eb..8ff888541c 100644 --- a/flocker/node/agents/ebs.py +++ b/flocker/node/agents/ebs.py @@ -1363,7 +1363,10 @@ def get_device_path(self, blockdevice_id): def list_live_nodes(self): instances = self.connection.instances.filter( Filters=[{'Name': 'instance-state-name', 'Values': ['running']}]) - return list(unicode(instance.id) for instance in instances) + return {unicode(instance.id): + [unicode(instance.public_ip_address), + unicode(instance.private_ip_address)] + for instance in instances} @boto3_log def start_node(self, node_id): diff --git a/flocker/node/agents/test/test_blockdevice.py b/flocker/node/agents/test/test_blockdevice.py index 5f5e995ea5..d83450554b 100644 --- a/flocker/node/agents/test/test_blockdevice.py +++ b/flocker/node/agents/test/test_blockdevice.py @@ -754,7 +754,9 @@ def __init__(self, block_api, live_nodes=()): self.live_nodes = live_nodes def list_live_nodes(self): - return [self.compute_instance_id()] + list(self.live_nodes) + return {node: [u"10.1.1.{}".format(i)] + for i, node in enumerate( + [self.compute_instance_id()] + list(self.live_nodes))} def start_node(self, node_id): return @@ -5501,11 +5503,17 @@ def test_current_machine_is_live(self): self.assertIn(self.api.compute_instance_id(), live)) return d + def test_current_machine_has_appropriate_ip(self): + """ + The machine's known IP is set for the current node. + """ + # XXX ... + def test_list_live_nodes(self): """ ``list_live_nodes`` returns an iterable of unicode values. """ - live_nodes = self.api.list_live_nodes() + live_nodes = list(self.api.list_live_nodes()) self.assertThat(live_nodes, AllMatch(IsInstance(unicode))) return Tests From dc5ba06374ebb2f505aa86767759b862fdae1eeb Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 27 Jan 2016 09:27:30 -0500 Subject: [PATCH 2/9] FakeCloudAPI updated, and test for listing appropriate IPs. --- flocker/node/agents/test/test_blockdevice.py | 21 +++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/flocker/node/agents/test/test_blockdevice.py b/flocker/node/agents/test/test_blockdevice.py index d83450554b..197f2857db 100644 --- a/flocker/node/agents/test/test_blockdevice.py +++ b/flocker/node/agents/test/test_blockdevice.py @@ -97,7 +97,7 @@ _backing_file_name, ) from ....common.algebraic import tagged_union_strategy - +from ....common import get_all_ips from ... import run_state_change, in_parallel, ILocalState, IStateChange, NoOp from ...testtools import ( @@ -754,9 +754,12 @@ def __init__(self, block_api, live_nodes=()): self.live_nodes = live_nodes def list_live_nodes(self): - return {node: [u"10.1.1.{}".format(i)] - for i, node in enumerate( - [self.compute_instance_id()] + list(self.live_nodes))} + result = {self.compute_instance_id(): + set(unicode(i) for i in get_all_ips() + if i != b"127.0.0.1")} + result.update({node: [u"10.1.1.{}".format(i)] + for i, node in enumerate(self.live_nodes)}) + return result def start_node(self, node_id): return @@ -5507,7 +5510,15 @@ def test_current_machine_has_appropriate_ip(self): """ The machine's known IP is set for the current node. """ - # XXX ... + local_addresses = set(unicode(i) for i in get_all_ips() + if i != b"127.0.0.1") + d = self.async_cloud_api.list_live_nodes() + d.addCallback( + lambda live: + self.assertTrue( + set(live[self.api.compute_instance_id()]).intersection( + local_addresses))) + return d def test_list_live_nodes(self): """ From e2437dd9bd000b1368758063e872f29a7d080717 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 27 Jan 2016 09:44:57 -0500 Subject: [PATCH 3/9] Don't use env variable anymore to find public IPs for nodes. --- admin/acceptance.py | 5 -- flocker/acceptance/endtoend/test_installer.py | 1 - flocker/acceptance/testtools.py | 60 ++++++++++++++----- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/admin/acceptance.py b/admin/acceptance.py index dbb737d7e8..a19831189b 100644 --- a/admin/acceptance.py +++ b/admin/acceptance.py @@ -94,11 +94,6 @@ def get_trial_environment(cluster): 'FLOCKER_ACCEPTANCE_VOLUME_BACKEND': cluster.dataset_backend.name, 'FLOCKER_ACCEPTANCE_API_CERTIFICATES_PATH': cluster.certificates_path.path, - 'FLOCKER_ACCEPTANCE_HOSTNAME_TO_PUBLIC_ADDRESS': json.dumps({ - node.private_address: node.address - for node in cluster.agent_nodes - if node.private_address is not None - }), 'FLOCKER_ACCEPTANCE_DEFAULT_VOLUME_SIZE': bytes( cluster.default_volume_size ), diff --git a/flocker/acceptance/endtoend/test_installer.py b/flocker/acceptance/endtoend/test_installer.py index 60f2bf38ea..92b49e4995 100644 --- a/flocker/acceptance/endtoend/test_installer.py +++ b/flocker/acceptance/endtoend/test_installer.py @@ -326,7 +326,6 @@ def _cleanup_flocker(self): control_node=self.control_node_ip.encode('ascii'), certificates_path=local_certs_path, num_agent_nodes=2, - hostname_to_public_address={}, username='user1', ) d.addCallback( diff --git a/flocker/acceptance/testtools.py b/flocker/acceptance/testtools.py index 85a4d56342..228dd5fe81 100644 --- a/flocker/acceptance/testtools.py +++ b/flocker/acceptance/testtools.py @@ -18,6 +18,8 @@ from docker.tls import TLSConfig +from ipaddress import ip_address + from twisted.web.http import OK, CREATED from twisted.python.filepath import FilePath from twisted.python.constants import Names, NamedConstant @@ -45,6 +47,7 @@ from ..apiclient import FlockerClient, DatasetState from ..node.script import get_backend, get_api from ..node import dockerpy_client +from ..node.agents.blockdevice import ICloudAPI from .node_scripts import SCRIPTS as NODE_SCRIPTS @@ -826,7 +829,7 @@ def get_file(self, node, path): def connected_cluster( reactor, control_node, certificates_path, num_agent_nodes, - hostname_to_public_address, username='user', + username='user', ): cluster_cert = certificates_path.child(b"cluster.crt") user_cert = certificates_path.child( @@ -869,24 +872,56 @@ def failed_query(failure): failed_query) return d agents_connected = loop_until(reactor, nodes_available) + agents_connected.addCallback(_add_nodes) + return agents_connected + + +def _add_nodes(cluster): + """ + Configure the ``Node`` objects for a newly created ``Cluster`` whose + nodes are known to be alive. + + :param Cluster cluster: Cluster that still needs nodes set. + :return: ``cluster`` updated with appropriate ``nodes`` set. + """ + # By default we just trust address returned by Flocker + def default_get_public_ip(address): + return address - # Extract node hostnames from API that lists nodes. Currently we - # happen know these in advance, but in FLOC-1631 node identification - # will switch to UUIDs instead. - agents_connected.addCallback(lambda _: cluster.current_nodes()) + try: + backend = get_backend_api(None, cluster.cluster_uuid) + except SkipTest: + # Can't load backend, will have to trust Flocker's reported IPs. + get_public_ip = default_get_public_ip + else: + if ICloudAPI.providedBy(backend): + node_ips = list(set(ip_address(i) for i in ips) + for ips in backend.list_live_nodes().values()) + + def get_public_ip(address): + for ips in node_ips: + if ip_address(address) in ips: + return [unicode(ip) for ip in ips + if not any(ip.is_private, ip.is_link_local, + ip.is_loopback)][0] + raise ValueError( + "Couldn't find address in cloud API reported IPs") + else: + get_public_ip = default_get_public_ip def node_from_dict(node): reported_hostname = node["host"] - public_address = hostname_to_public_address.get( - reported_hostname, reported_hostname) + public_address = get_public_ip(reported_hostname) return Node( uuid=node[u"uuid"], public_address=public_address.encode("ascii"), reported_hostname=reported_hostname.encode("ascii"), ) - agents_connected.addCallback(lambda nodes: cluster.set( - "nodes", map(node_from_dict, nodes))) - return agents_connected + + d = cluster.current_nodes() + d.addCallback( + lambda nodes: cluster.set("nodes", map(node_from_dict, nodes))) + return d def _get_test_cluster(reactor): @@ -914,16 +949,11 @@ def _get_test_cluster(reactor): certificates_path = FilePath( environ["FLOCKER_ACCEPTANCE_API_CERTIFICATES_PATH"]) - hostname_to_public_address_env_var = environ.get( - "FLOCKER_ACCEPTANCE_HOSTNAME_TO_PUBLIC_ADDRESS", "{}") - hostname_to_public_address = json.loads(hostname_to_public_address_env_var) - return connected_cluster( reactor, control_node, certificates_path, num_agent_nodes, - hostname_to_public_address ) From 920e2a8a44ef8ccc36c8a36f00d7a57fef4e7fbf Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 27 Jan 2016 09:53:33 -0500 Subject: [PATCH 4/9] Lint fix. --- flocker/node/agents/ebs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flocker/node/agents/ebs.py b/flocker/node/agents/ebs.py index 8ff888541c..3540ef918e 100644 --- a/flocker/node/agents/ebs.py +++ b/flocker/node/agents/ebs.py @@ -719,7 +719,7 @@ def _wait_for_volume_state_change(operation, start_time = time.time() poll_until( lambda: _reached_end_state( - operation, volume, update, time.time() - start_time, timeout + operation, volume, update, time.time() - start_time, timeout ), itertools.repeat(1) ) From 4e964749cceb3270f41458dcd41eedf4f98d9b0f Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 28 Jan 2016 11:28:56 -0500 Subject: [PATCH 5/9] A couple minor usage bug fixes. --- flocker/acceptance/testtools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flocker/acceptance/testtools.py b/flocker/acceptance/testtools.py index 228dd5fe81..f972cbb8d6 100644 --- a/flocker/acceptance/testtools.py +++ b/flocker/acceptance/testtools.py @@ -872,7 +872,7 @@ def failed_query(failure): failed_query) return d agents_connected = loop_until(reactor, nodes_available) - agents_connected.addCallback(_add_nodes) + agents_connected.addCallback(lambda _: _add_nodes(cluster)) return agents_connected @@ -902,8 +902,8 @@ def get_public_ip(address): for ips in node_ips: if ip_address(address) in ips: return [unicode(ip) for ip in ips - if not any(ip.is_private, ip.is_link_local, - ip.is_loopback)][0] + if not any([ip.is_private, ip.is_link_local, + ip.is_loopback])][0] raise ValueError( "Couldn't find address in cloud API reported IPs") else: From 46ac178d9e88a139572146bb0d9346f60ef11d92 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 28 Jan 2016 12:23:35 -0500 Subject: [PATCH 6/9] Fix EBS node start to actually work. --- flocker/node/agents/ebs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flocker/node/agents/ebs.py b/flocker/node/agents/ebs.py index 3540ef918e..ccb0ff915a 100644 --- a/flocker/node/agents/ebs.py +++ b/flocker/node/agents/ebs.py @@ -1370,7 +1370,7 @@ def list_live_nodes(self): @boto3_log def start_node(self, node_id): - self.connection.start_instances(instance_ids=[node_id]) + self.connection.instances.filter(InstanceIds=[node_id]).start() def aws_from_configuration( From 3d50b5722c366648c017726a059dee444840db44 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 28 Jan 2016 12:31:55 -0500 Subject: [PATCH 7/9] Passing test, albeit with non-ideal cleanup code. --- flocker/acceptance/endtoend/test_dataset.py | 25 ++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/flocker/acceptance/endtoend/test_dataset.py b/flocker/acceptance/endtoend/test_dataset.py index 4e6fe36402..916f98ab39 100644 --- a/flocker/acceptance/endtoend/test_dataset.py +++ b/flocker/acceptance/endtoend/test_dataset.py @@ -5,8 +5,9 @@ """ from uuid import UUID -from unittest import SkipTest, skipIf +from unittest import SkipTest from datetime import timedelta +from time import sleep from testtools import run_test_with @@ -18,7 +19,7 @@ from ..testtools import ( require_cluster, require_moving_backend, create_dataset, DatasetBackend, - get_backend_api, verify_socket + get_backend_api, ) @@ -109,18 +110,19 @@ def not_exists(): created.addCallback(delete_dataset) return created - @skipIf(True, - "Shutting down a node invalidates a public IP, which breaks all " - "kinds of things. So skip for now.") @require_moving_backend @run_test_with(async_runner(timeout=timedelta(minutes=6))) - @require_cluster(2) - def test_dataset_move_from_dead_node(self, cluster): + # Rackspace is buggy so this doesn't work there; once we're off + # Rackspace we should re-enable it for everything (FLOC-4001): + @require_cluster(2, required_backend=DatasetBackend.aws) + def test_dataset_move_from_dead_node(self, cluster, backend): """ A dataset can be moved from a dead node to a live node. All attributes, including the maximum size, are preserved. """ + # We could use backend argument, but we're going to drop it as + # part of FLOC-4001, so make our own. api = get_backend_api(self, cluster.cluster_uuid) if not ICloudAPI.providedBy(api): raise SkipTest( @@ -139,9 +141,12 @@ def test_dataset_move_from_dead_node(self, cluster): def startup_node(node_id): api.start_node(node_id) - # Wait for node to boot up:; we presume Flocker getting going after - # SSH is available will be pretty quick: - return loop_until(reactor, verify_socket(node.public_address, 22)) + # Wait for node to boot up: + d = loop_until( + reactor, lambda: node_id in api.list_live_nodes()) + # Give it another ten seconds to boot: + d.addCallback(lambda _: sleep(10)) + return d # Once created, shut down origin node and then request to move the # dataset to node2: From 380633e1076f9857c7d2d0375b88830cb5d776a6 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 28 Jan 2016 12:59:36 -0500 Subject: [PATCH 8/9] Nicer cleanup after the test. --- flocker/acceptance/endtoend/test_dataset.py | 15 +++++++++++---- flocker/acceptance/testtools.py | 11 +++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/flocker/acceptance/endtoend/test_dataset.py b/flocker/acceptance/endtoend/test_dataset.py index 916f98ab39..39b773e23b 100644 --- a/flocker/acceptance/endtoend/test_dataset.py +++ b/flocker/acceptance/endtoend/test_dataset.py @@ -7,7 +7,8 @@ from uuid import UUID from unittest import SkipTest from datetime import timedelta -from time import sleep + +from ipaddress import ip_address from testtools import run_test_with @@ -19,7 +20,7 @@ from ..testtools import ( require_cluster, require_moving_backend, create_dataset, DatasetBackend, - get_backend_api, + get_backend_api, verify_socket, is_public_ip, ) @@ -144,8 +145,14 @@ def startup_node(node_id): # Wait for node to boot up: d = loop_until( reactor, lambda: node_id in api.list_live_nodes()) - # Give it another ten seconds to boot: - d.addCallback(lambda _: sleep(10)) + + # Wait for it to be accessible over SSH, on theory that means + # it's ready to be used: + def wait_for_ssh(_): + ips = [ip_address(i) for i in api.list_live_nodes()[node_id]] + public_ip = [unicode(i) for i in ips if is_public_ip(i)][0] + return verify_socket(public_ip, 22) + d.addCallback(wait_for_ssh) return d # Once created, shut down origin node and then request to move the diff --git a/flocker/acceptance/testtools.py b/flocker/acceptance/testtools.py index f972cbb8d6..5d40b87f3c 100644 --- a/flocker/acceptance/testtools.py +++ b/flocker/acceptance/testtools.py @@ -876,6 +876,14 @@ def failed_query(failure): return agents_connected +def is_public_ip(ip): + """ + :param IPAddress ip: An IP address. + :return: Boolean which is true if it is a public address. + """ + return not any([ip.is_private, ip.is_link_local, ip.is_loopback]) + + def _add_nodes(cluster): """ Configure the ``Node`` objects for a newly created ``Cluster`` whose @@ -902,8 +910,7 @@ def get_public_ip(address): for ips in node_ips: if ip_address(address) in ips: return [unicode(ip) for ip in ips - if not any([ip.is_private, ip.is_link_local, - ip.is_loopback])][0] + if is_public_ip(ip)][0] raise ValueError( "Couldn't find address in cloud API reported IPs") else: From fdd7f9d1e71b0083e6c96ae5499c0e6f0e1d3668 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 28 Jan 2016 13:02:13 -0500 Subject: [PATCH 9/9] News entry. --- docs/releasenotes/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/releasenotes/index.rst b/docs/releasenotes/index.rst index 980bcebd15..5488753478 100644 --- a/docs/releasenotes/index.rst +++ b/docs/releasenotes/index.rst @@ -16,6 +16,7 @@ Next Release * The new :ref:`CloudFormation installer ` has been made available, to provide a far simpler installation experience for users on AWS. * The :ref:`Flocker plugin for Docker ` should support the direct volume listing and inspection functionality added to Docker 1.10. * Fixed a regression that caused block device agents to poll backend APIs like EBS too frequently in some circumstances. +* Datasets can now be moved off of shutdown EC2 instances. Previously they could only be moved off of terminated instances. This Release ============