diff --git a/appscale/tools/agents/azure_agent.py b/appscale/tools/agents/azure_agent.py
index 0b5f7fe5..6ad821e8 100644
--- a/appscale/tools/agents/azure_agent.py
+++ b/appscale/tools/agents/azure_agent.py
@@ -123,7 +123,6 @@ class AzureAgent(BaseAgent):
     PARAM_APP_SECRET,
     PARAM_APP_ID,
     PARAM_IMAGE_ID,
-    PARAM_INSTANCE_TYPE,
     PARAM_KEYNAME,
     PARAM_SUBSCRIBER_ID,
     PARAM_TENANT_ID,
@@ -491,6 +490,7 @@ def add_instances_to_existing_ss(self, count, parameters):
     credentials = self.open_connection(parameters)
     subscription_id = str(parameters[self.PARAM_SUBSCRIBER_ID])
     resource_group = parameters[self.PARAM_RESOURCE_GROUP]
+    instance_type = parameters[self.PARAM_INSTANCE_TYPE]
 
     compute_client = ComputeManagementClient(credentials, subscription_id)
     num_instances_added = 0
@@ -505,6 +505,9 @@ def add_instances_to_existing_ss(self, count, parameters):
       if ss_instance_count >= self.MAX_VMSS_CAPACITY:
         continue
 
+      if vmss.sku.name != instance_type:
+        continue
+
       scaleset = compute_client.virtual_machine_scale_sets.get(
         resource_group, vmss.name)
       ss_upgrade_policy = scaleset.upgrade_policy
diff --git a/appscale/tools/agents/ec2_agent.py b/appscale/tools/agents/ec2_agent.py
index e4d0399a..c9d6412f 100644
--- a/appscale/tools/agents/ec2_agent.py
+++ b/appscale/tools/agents/ec2_agent.py
@@ -56,7 +56,6 @@ class EC2Agent(BaseAgent):
     PARAM_CREDENTIALS,
     PARAM_GROUP,
     PARAM_IMAGE_ID,
-    PARAM_INSTANCE_TYPE,
     PARAM_KEYNAME,
     PARAM_SPOT
   )
diff --git a/appscale/tools/agents/gce_agent.py b/appscale/tools/agents/gce_agent.py
index 3493ab22..098a7fb7 100644
--- a/appscale/tools/agents/gce_agent.py
+++ b/appscale/tools/agents/gce_agent.py
@@ -123,7 +123,6 @@ class GCEAgent(BaseAgent):
   REQUIRED_CREDENTIALS = (
     PARAM_GROUP,
     PARAM_IMAGE_ID,
-    PARAM_INSTANCE_TYPE,
     PARAM_KEYNAME,
     PARAM_PROJECT,
     PARAM_ZONE
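With instance types now resolved per node set (see node_layout.py below) and injected into the agent parameters per batch (see remote_helper.py), the agents no longer list `PARAM_INSTANCE_TYPE` as a required credential. On Azure, an existing scale set is only reused when its SKU matches the requested type. A minimal sketch of that selection rule — `ScaleSet`, its attributes, and the capacity cap are simplified stand-ins for the Azure SDK objects used in azure_agent.py, not the real API:

```python
# Illustrative sketch of the scale-set reuse rule: only sets that still have
# spare capacity AND were created with the requested instance type qualify.
# ScaleSet is a stand-in for the SDK's scale-set object.
from collections import namedtuple

ScaleSet = namedtuple('ScaleSet', ['name', 'sku_name', 'capacity'])
MAX_VMSS_CAPACITY = 40  # Assumed per-set cap, mirroring the agent constant.

def reusable_scale_sets(scale_sets, instance_type):
  """Returns the scale sets that new instances of instance_type may join."""
  return [vmss for vmss in scale_sets
          if vmss.capacity < MAX_VMSS_CAPACITY
          and vmss.sku_name == instance_type]

# Example: only 'ss-b' qualifies for Standard_A3 instances.
sets = [ScaleSet('ss-a', 'Standard_A3', 40),  # already full
        ScaleSet('ss-b', 'Standard_A3', 12),  # has room, matching SKU
        ScaleSet('ss-c', 'Standard_D2', 3)]   # wrong SKU
assert reusable_scale_sets(sets, 'Standard_A3') == [sets[1]]
```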
diff --git a/appscale/tools/node_layout.py b/appscale/tools/node_layout.py
index 4c8a86e9..968e6af5 100644
--- a/appscale/tools/node_layout.py
+++ b/appscale/tools/node_layout.py
@@ -10,6 +10,8 @@
 from agents.factory import InfrastructureAgentFactory
 from appscale_logger import AppScaleLogger
 from custom_exceptions import BadConfigurationException
+from local_state import LocalState
+from parse_args import ParseArgs
 
 
 class NodeLayout():
@@ -134,6 +136,9 @@ def __init__(self, options):
     self.replication = options.get('replication')
     self.database_type = options.get('table', 'cassandra')
     self.add_to_existing = options.get('add_to_existing')
+    self.default_instance_type = options.get('instance_type')
+    self.test = options.get('test')
+    self.force = options.get('force')
 
     if 'login_host' in options and options['login_host'] is not None:
       self.login_host = options['login_host']
@@ -261,6 +266,22 @@ def validate_node_layout(self):
       for node, disk in zip(nodes, disks):
         node.disk = disk
 
+      instance_type = node_set.get('instance_type', self.default_instance_type)
+
+      if self.infrastructure:
+        if not instance_type:
+          self.invalid("Must set a default instance type or specify an "
+                       "instance type per role.")
+
+        # Check if this is an allowed instance type.
+        if instance_type in ParseArgs.DISALLOWED_INSTANCE_TYPES and \
+            not (self.force or self.test):
+          reason = "the suggested 4GB of RAM"
+          if 'database' in roles:
+            reason += " to run Cassandra"
+          LocalState.confirm_or_abort("The {0} instance type does not have "
+            "{1}. Please consider using a larger instance "
+            "type.".format(instance_type, reason))
       # Assign master.
       if 'master' in roles:
         self.master = nodes[0]
@@ -271,6 +292,7 @@ def validate_node_layout(self):
         node.add_role(role)
         if role == 'login':
           node.public_ip = self.login_host or node.public_ip
+      node.instance_type = instance_type
 
       if not node.is_valid():
         self.invalid(",".join(node.errors()))
@@ -526,8 +548,9 @@ def from_locations_json_list(self, locations_nodes_list):
         open_nodes.append(old_node)
         continue
       for _, node in enumerate(nodes_copy):
-        # Match nodes based on jobs/roles.
-        if set(old_node_roles) == set(node.roles):
+        # Match nodes based on jobs/roles and the instance type specified.
+        if set(old_node_roles) == set(node.roles) \
+            and old_node.get('instance_type') == node.instance_type:
           nodes_copy.remove(node)
           node.from_json(old_node)
           if node.is_valid():
@@ -537,19 +560,19 @@ def from_locations_json_list(self, locations_nodes_list):
             return None
           break
     for open_node in open_nodes:
-      try:
-        node = nodes_copy.pop()
-      except IndexError:
-        return None
-      # Match nodes based on jobs/roles.
-      roles = node.roles
-      node.from_json(open_node)
-      node.roles = roles
-      if node.is_valid():
-        nodes.append(node)
-      else:
-        # Locations JSON is incorrect if we get here.
-        return None
+      for node in nodes_copy:
+        # Match nodes based on the instance type specified.
+        if node.instance_type == open_node.get('instance_type'):
+          nodes_copy.remove(node)
+          roles = node.roles
+          node.from_json(open_node)
+          node.roles = roles
+          if node.is_valid():
+            nodes.append(node)
+          else:
+            # Locations JSON is incorrect if we get here.
+            return None
+          break
 
     # If these lengths are equal all nodes were matched.
     if len(nodes) == len(self.nodes):
@@ -572,7 +595,7 @@ class Node():
 
   DUMMY_INSTANCE_ID = "i-APPSCALE"
 
-  def __init__(self, public_ip, cloud, roles=[], disk=None):
+  def __init__(self, public_ip, cloud, roles=[], disk=None, instance_type=None):
     """Creates a new Node, representing the given id in the specified cloud.
 
@@ -589,6 +612,7 @@ def __init__(self, public_ip, cloud, roles=[], disk=None):
     self.cloud = cloud
     self.roles = roles
     self.disk = disk
+    self.instance_type = instance_type
     self.expand_roles()
 
@@ -712,7 +736,8 @@ def to_json(self):
       'private_ip': self.private_ip,
       'instance_id': self.instance_id,
       'jobs': self.roles,
-      'disk': self.disk
+      'disk': self.disk,
+      'instance_type': self.instance_type
     }
 
@@ -735,3 +760,4 @@ def from_json(self, node_dict):
     self.instance_id = node_dict.get('instance_id')
     self.roles = node_dict.get('jobs')
     self.disk = node_dict.get('disk')
+    self.instance_type = node_dict.get('instance_type')
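node_layout.py now resolves an instance type for every node set: an explicit per-set `instance_type` wins, otherwise the deployment-wide default applies, and a cloud layout with neither is rejected. A hypothetical layout illustrating the fallback (the type names are examples, not recommendations):

```python
# Hypothetical layout: the database node pins a larger type, everything else
# inherits the default supplied via --instance_type ('m1.large' here).
ips_layout = [
  {'roles': ['master', 'appengine'], 'nodes': 2},  # uses the default type
  {'roles': ['database'], 'nodes': 1, 'instance_type': 'm3.xlarge'},
]

options = {
  'infrastructure': 'ec2',
  'instance_type': 'm1.large',  # deployment-wide default
  'ips': ips_layout,
  'table': 'cassandra',
  'min_machines': 3,
  'max_machines': 3,
}
# NodeLayout(options) would resolve each node set with
# node_set.get('instance_type', self.default_instance_type), so the first
# set's nodes become 'm1.large' and the database node 'm3.xlarge'. Dropping
# both 'instance_type' keys would make the layout invalid.
```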
diff --git a/appscale/tools/parse_args.py b/appscale/tools/parse_args.py
index 53451c4c..82caff72 100644
--- a/appscale/tools/parse_args.py
+++ b/appscale/tools/parse_args.py
@@ -651,16 +651,11 @@ def validate_infrastructure_flags(self):
       raise BadConfigurationException("--disks must be a dict, but was a " \
         "{0}".format(type(self.args.disks)))
 
-    if not self.args.instance_type:
-      raise BadConfigurationException("Cannot start a cloud instance without " \
-        "the instance type.")
-
     if self.args.instance_type in self.DISALLOWED_INSTANCE_TYPES and \
        not (self.args.force or self.args.test):
-      LocalState.confirm_or_abort("The {0} instance type does not have " \
-        "enough RAM to run Cassandra in a production setting. Please " \
-        "consider using a larger instance type.".format(
-        self.args.instance_type))
+      LocalState.confirm_or_abort("The {0} instance type does not have "
+        "the suggested 4GB of RAM. Please consider using a larger instance "
+        "type.".format(self.args.instance_type))
 
     if self.args.infrastructure == 'azure':
       if not self.args.azure_subscription_id:
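parse_args.py no longer hard-fails when `--instance_type` is missing, since per-node types may cover every node set; the small-instance check is now advisory in both validation paths (here for the global flag, and in node_layout.py for per-node overrides). A condensed sketch of that shared gate — the `DISALLOWED_INSTANCE_TYPES` contents and the `confirm` callback are assumptions, not the real ParseArgs/LocalState members:

```python
# Condensed sketch of the advisory gate used by both validation paths.
DISALLOWED_INSTANCE_TYPES = ['m1.small', 't1.micro']  # assumed contents

def check_instance_type(instance_type, force=False, test=False,
                        confirm=lambda msg: None):
  """Asks for confirmation when a too-small type was chosen deliberately."""
  if instance_type in DISALLOWED_INSTANCE_TYPES and not (force or test):
    confirm("The {0} instance type does not have the suggested 4GB of RAM. "
            "Please consider using a larger instance type."
            .format(instance_type))

check_instance_type('m1.small')              # confirm callback is invoked
check_instance_type('m1.small', force=True)  # --force skips the prompt
check_instance_type('m3.xlarge')             # large enough, no prompt
```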
diff --git a/appscale/tools/remote_helper.py b/appscale/tools/remote_helper.py
index 3e62b978..06654068 100644
--- a/appscale/tools/remote_helper.py
+++ b/appscale/tools/remote_helper.py
@@ -13,6 +13,7 @@
 import uuid
 import yaml
 
+from boto.exception import BotoServerError
 
 # AppScale-specific imports
 from agents.factory import InfrastructureAgentFactory
@@ -24,6 +25,7 @@
 from custom_exceptions import BadConfigurationException
 from custom_exceptions import ShellException
 from custom_exceptions import TimeoutException
+from agents.base_agent import AgentRuntimeException
 from agents.gce_agent import CredentialTypes
 from agents.gce_agent import GCEAgent
 from local_state import APPSCALE_VERSION
@@ -151,27 +153,83 @@ def start_all_nodes(cls, options, node_layout):
 
     agent.configure_instance_security(params)
 
-    load_balancer_nodes = node_layout.get_nodes('load_balancer', True)
-    instance_ids, public_ips, private_ips = cls.spawn_load_balancers_in_cloud(
-      options, agent, params,
-      len(load_balancer_nodes))
+    load_balancer_roles = {}
+    instance_type_roles = {}
 
-    for node_index, node in enumerate(load_balancer_nodes):
-      index = node_layout.nodes.index(node)
-      node_layout.nodes[index].public_ip = public_ips[node_index]
-      node_layout.nodes[index].private_ip = private_ips[node_index]
-      node_layout.nodes[index].instance_id = instance_ids[node_index]
+    for node in node_layout.get_nodes('load_balancer', True):
+      load_balancer_roles.setdefault(node.instance_type, []).append(node)
+
+    for node in node_layout.get_nodes('load_balancer', False):
+      instance_type_roles.setdefault(node.instance_type, []).append(node)
+
+    spawned_instance_ids = []
+
+    for instance_type, load_balancer_nodes in load_balancer_roles.items():
+      # Copy parameters so we can modify the instance type.
+      instance_type_params = params.copy()
+      instance_type_params['instance_type'] = instance_type
+
+      try:
+        instance_ids, public_ips, private_ips = cls.spawn_nodes_in_cloud(
+          agent, instance_type_params, count=len(load_balancer_nodes),
+          load_balancer=True)
+      except (AgentRuntimeException, BotoServerError):
+        AppScaleLogger.warn("AppScale was unable to start the requested "
+                            "number of instances.")
+        if len(spawned_instance_ids) > 0:
+          AppScaleLogger.warn("Attempting to terminate those that were started.")
+          cls.terminate_spawned_instances(spawned_instance_ids, agent, params)
+
+        # Cleanup the keyname since it failed.
+        LocalState.cleanup_keyname(options.keyname)
+
+        # Re-raise the original exception.
+        raise
+
+      # Keep track of instances we have started.
+      spawned_instance_ids.extend(instance_ids)
+
+      for node_index, node in enumerate(load_balancer_nodes):
+        index = node_layout.nodes.index(node)
+        node_layout.nodes[index].public_ip = public_ips[node_index]
+        node_layout.nodes[index].private_ip = private_ips[node_index]
+        node_layout.nodes[index].instance_id = instance_ids[node_index]
+
+    if options.static_ip:
+      node = node_layout.head_node()
+      agent.associate_static_ip(params, node.instance_id, options.static_ip)
+      node.public_ip = options.static_ip
+      AppScaleLogger.log("Static IP associated with head node.")
 
     AppScaleLogger.log("\nPlease wait for AppScale to prepare your machines "
                        "for use. This can take few minutes.")
 
-    other_nodes = node_layout.get_nodes('load_balancer', False)
-    if len(other_nodes) > 0:
-      _instance_ids, _public_ips, _private_ips = cls.spawn_other_nodes_in_cloud(
-        agent, params,
-        len(other_nodes))
+    for instance_type, nodes in instance_type_roles.items():
+      # Copy parameters so we can modify the instance type.
+      instance_type_params = params.copy()
+      instance_type_params['instance_type'] = instance_type
 
-      for node_index, node in enumerate(other_nodes):
+      try:
+        _instance_ids, _public_ips, _private_ips = cls.spawn_nodes_in_cloud(
+          agent, instance_type_params, count=len(nodes))
+      except (AgentRuntimeException, BotoServerError):
+        AppScaleLogger.warn("AppScale was unable to start the requested "
+                            "number of instances.")
+        if len(spawned_instance_ids) > 0:
+          AppScaleLogger.warn("Attempting to terminate those that were started.")
+          cls.terminate_spawned_instances(spawned_instance_ids, agent, params)
+
+        # Cleanup the keyname since it failed.
+        LocalState.cleanup_keyname(options.keyname)
+
+        # Re-raise the original exception.
+        raise
+
+      # Keep track of instances we have started.
+      spawned_instance_ids.extend(_instance_ids)
+
+      for node_index, node in enumerate(nodes):
         index = node_layout.nodes.index(node)
         node_layout.nodes[index].public_ip = _public_ips[node_index]
         node_layout.nodes[index].private_ip = _private_ips[node_index]
@@ -276,52 +337,27 @@ def start_head_node(cls, options, my_id, node_layout):
 
 
   @classmethod
-  def spawn_load_balancers_in_cloud(cls, options, agent, params, count=1):
+  def spawn_nodes_in_cloud(cls, agent, params, count=1, load_balancer=False):
     """Starts count number of virtual machines in a cloud infrastructure with
     public ips.
 
     This method also prepares the virtual machine for use by the AppScale Tools.
 
     Args:
-      options: A Namespace that specifies the cloud infrastructure to use, as
-        well as how to interact with that cloud.
       agent: The agent to start VMs with, must be passed as an argument
        because agents cannot be made twice.
       params: The parameters to be sent to the agent.
       count: A int, the number of instances to start.
+      load_balancer: A boolean indicating whether the spawned instances should
+        have public ips or not.
     Returns:
       The instance ID, public IP address, and private IP address of the machine
      that was started.
     """
     instance_ids, public_ips, private_ips = agent.run_instances(
       count=count, parameters=params, security_configured=True,
-      public_ip_needed=True)
+      public_ip_needed=load_balancer)
 
-    if options.static_ip:
-      agent.associate_static_ip(params, instance_ids[0], options.static_ip)
-      public_ips[0] = options.static_ip
-      AppScaleLogger.log("Static IP associated with head node.")
-
-    return instance_ids, public_ips, private_ips
-
-
-  @classmethod
-  def spawn_other_nodes_in_cloud(cls, agent, params, count=1):
-    """Starts count number of virtual machines in a cloud infrastructure.
-
-    This method also prepares the virtual machine for use by the AppScale Tools.
-
-    Args:
-      agent: The agent to start VMs with, must be passed as an argument
-        because agents cannot be made twice.
-      params: The parameters to be sent to the agent.
-      count: A int, the number of instances to start.
-    Returns:
-      The instance ID, public IP address, and private IP address of the machine
-      that was started.
-    """
-    instance_ids, public_ips, private_ips = agent.run_instances(
-      count=count, parameters=params, security_configured=True,
-      public_ip_needed=False)
     return instance_ids, public_ips, private_ips
 
   @classmethod
@@ -850,6 +886,27 @@ def wait_for_machines_to_finish_loading(cls, host, keyname):
 
       time.sleep(cls.WAIT_TIME)
 
+  @classmethod
+  def terminate_spawned_instances(cls, spawned_instance_ids, agent, params):
+    """ Shuts down the specified instances. For use when AppScale has failed
+    to start all the instances for the deployment, since we do not check or
+    clean up any local files.
+
+    Args:
+      spawned_instance_ids: A list of instance ids we have started that
+        should be terminated.
+      agent: The agent to call terminate instance with.
+      params: Agent parameters.
+    """
+    terminate_params = params.copy()
+    terminate_params[agent.PARAM_INSTANCE_IDS] = spawned_instance_ids
+    try:
+      agent.terminate_instances(terminate_params)
+    except (AgentRuntimeException, BotoServerError):
+      AppScaleLogger.warn("AppScale failed to terminate instance(s) with "
+                          "id(s): {}".format(spawned_instance_ids))
+
 
   @classmethod
   def terminate_cloud_instance(cls, instance_id, options):
     """ Powers off a single instance in the currently AppScale deployment and
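start_all_nodes above spawns machines in per-instance-type batches and, if any batch fails, terminates everything started so far before re-raising. A condensed sketch of that grouping-plus-rollback flow — `spawn` and `terminate` are stand-ins for `spawn_nodes_in_cloud` and `terminate_spawned_instances`, and only the control flow is meant to mirror the real method:

```python
# Condensed sketch of the batch-spawn-with-rollback flow in start_all_nodes.
from collections import defaultdict

def spawn_in_batches(nodes, params, spawn, terminate):
  # Group nodes by instance type so each batch shares one machine size.
  by_type = defaultdict(list)
  for node in nodes:
    by_type[node.instance_type].append(node)

  spawned_ids = []
  for instance_type, batch in by_type.items():
    batch_params = dict(params, instance_type=instance_type)
    try:
      ids, public_ips, private_ips = spawn(batch_params, count=len(batch))
    except Exception:
      if spawned_ids:
        terminate(spawned_ids, params)  # roll back earlier batches
      raise                             # re-raise the original failure
    spawned_ids.extend(ids)
    for node, iid, pub, priv in zip(batch, ids, public_ips, private_ips):
      node.instance_id, node.public_ip, node.private_ip = iid, pub, priv
  return spawned_ids
```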
- """ - instance_ids, public_ips, private_ips = agent.run_instances( - count=count, parameters=params, security_configured=True, - public_ip_needed=False) return instance_ids, public_ips, private_ips @classmethod @@ -850,6 +886,27 @@ def wait_for_machines_to_finish_loading(cls, host, keyname): time.sleep(cls.WAIT_TIME) + @classmethod + def terminate_spawned_instances(cls, spawned_instance_ids, agent, params): + """ Shuts down instances specified. For use when AppScale has failed to + start all the instances for the deployment since we do not check or clean + any local files. + + Args: + spawned_instance_ids: A list of instance ids we have started that + should be terminated. + agent: The agent to call terminate instance with. + params: Agent parameters. + """ + terminate_params = params.copy() + terminate_params[agent.PARAM_INSTANCE_IDS] = spawned_instance_ids + try: + agent.terminate_instances(terminate_params) + except (AgentRuntimeException, BotoServerError): + AppScaleLogger.warn("AppScale failed to terminate instance(s) with " + "id(s): {}".format(spawned_instance_ids)) + + @classmethod def terminate_cloud_instance(cls, instance_id, options): """ Powers off a single instance in the currently AppScale deployment and diff --git a/test/test_ip_layouts.py b/test/test_ip_layouts.py index 0424f3f2..f1a1f77a 100644 --- a/test/test_ip_layouts.py +++ b/test/test_ip_layouts.py @@ -12,6 +12,8 @@ DISK_ONE = 'disk_number_1' DISK_TWO = 'disk_number_2' +INSTANCE_TYPE_1 = "instance_type_1" +INSTANCE_TYPE_2 = "instance_type_2" ONE_NODE_CLOUD = [ { @@ -23,34 +25,35 @@ ONE_NODE_CLUSTER = [ { 'roles': ['master', 'database', 'appengine'], - 'nodes': IP_1 + 'nodes': IP_1, + 'instance_type': INSTANCE_TYPE_1 } ] -OPEN_NODE_CLOUD = [{'roles': ['master', 'database', 'appengine'], 'nodes': 1}, - {'roles': 'open', 'nodes': 1}] +OPEN_NODE_CLOUD = [{'roles': ['master', 'database', 'appengine'], 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'open', 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}] -LOGIN_NODE_CLOUD = [{'roles': ['master', 'database', 'appengine'], 'nodes': 1}, - {'roles': 'login', 'nodes': 1}] +LOGIN_NODE_CLOUD = [{'roles': ['master', 'database', 'appengine'], 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'login', 'nodes': 1, 'instance_type': INSTANCE_TYPE_2}] -FOUR_NODE_CLOUD = [{'roles': 'master', 'nodes': 1}, - {'roles': 'appengine', 'nodes': 1}, - {'roles': 'database', 'nodes': 1}, - {'roles': 'zookeeper', 'nodes': 1}] +FOUR_NODE_CLOUD = [{'roles': 'master', 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'appengine', 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'database', 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'zookeeper', 'nodes': 1, 'instance_type': INSTANCE_TYPE_1}] -FOUR_NODE_CLUSTER = [{'roles': 'master', 'nodes': IP_1}, - {'roles': 'appengine', 'nodes': IP_2}, - {'roles': 'database', 'nodes': IP_3}, - {'roles': 'zookeeper', 'nodes': IP_4}] +FOUR_NODE_CLUSTER = [{'roles': 'master', 'nodes': IP_1, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'appengine', 'nodes': IP_2, 'instance_type': INSTANCE_TYPE_2}, + {'roles': 'database', 'nodes': IP_3, 'instance_type': INSTANCE_TYPE_1}, + {'roles': 'zookeeper', 'nodes': IP_4, 'instance_type': INSTANCE_TYPE_2}] THREE_NODE_CLOUD = [{'roles': 'master', 'nodes': 1}, {'roles': 'zookeeper', 'nodes': 1}, {'roles': ['database', 'appengine'], 'nodes': 1}] -TWO_NODES_TWO_DISKS_CLOUD = [{'roles': ['master', 'database'], 'nodes': 1, - 'disks': DISK_ONE}, +TWO_NODES_TWO_DISKS_CLOUD = 
diff --git a/test/test_local_state.py b/test/test_local_state.py
index 005052e5..e3a0cd37 100644
--- a/test/test_local_state.py
+++ b/test/test_local_state.py
@@ -105,7 +105,8 @@ def test_generate_deployment_params(self):
       'table' : 'cassandra',
       'infrastructure' : "ec2",
       'min_machines' : 1,
-      'max_machines' : 1
+      'max_machines' : 1,
+      'instance_type': 'm1.large'
     })
 
     flexmock(NodeLayout).should_receive("head_node").and_return(Node(
@@ -247,7 +248,8 @@ def test_update_local_metadata(self):
       'min_machines' : 1,
       'max_machines' : 1,
       'infrastructure' : 'ec2',
-      'table' : 'cassandra'
+      'table' : 'cassandra',
+      'instance_type': 'm1.large'
     })
 
     LocalState.update_local_metadata(options, 'public1', 'public1')
diff --git a/test/test_node_layout.py b/test/test_node_layout.py
index 1c119650..16fe209a 100644
--- a/test/test_node_layout.py
+++ b/test/test_node_layout.py
@@ -92,7 +92,8 @@ def test_login_override_login_node(self):
 
 
   def test_is_database_replication_valid_with_db_slave(self):
-    input_yaml = [{'roles': ['master', 'database', 'appengine'], 'nodes': 1}]
+    input_yaml = [{'roles': ['master', 'database', 'appengine'], 'nodes': 1,
+                   'instance_type': 'm1.large'}]
     options = self.default_options.copy()
     options['ips'] = input_yaml
     fake_node = flexmock()
@@ -178,9 +179,9 @@ def test_new_with_right_number_of_unique_disks_both_nodes(self):
     # suppose that the user has specified two nodes, and two EBS / PD disks
     # with different names. This is the desired user behavior.
     input_yaml = [{'roles': ['master', 'database'], 'nodes': 1,
-                   'disks': self.DISK_ONE},
+                   'instance_type': 'm1.large', 'disks': self.DISK_ONE},
                  {'roles': ['appengine'], 'nodes': 1,
-                   'disks': self.DISK_TWO}]
+                   'instance_type': 'm1.large', 'disks': self.DISK_TWO}]
     options = self.default_options.copy()
     options['ips'] = input_yaml
     layout = NodeLayout(options)
@@ -193,9 +194,9 @@ def test_new_with_right_number_of_unique_disks_one_node(self):
     # suppose that the user has specified two nodes, and two EBS / PD disks
     # with different names. This is the desired user behavior.
     input_yaml = [
-      {'roles': ['master', 'database'], 'nodes': 1},
+      {'roles': ['master', 'database'], 'nodes': 1, 'instance_type': 'm1.large'},
       {'roles': ['appengine'], 'nodes': 2,
-       'disks': [self.DISK_ONE, self.DISK_TWO]}]
+       'instance_type': 'm1.large', 'disks': [self.DISK_ONE, self.DISK_TWO]}]
     options = self.default_options.copy()
     options['ips'] = input_yaml
     layout = NodeLayout(options)
@@ -229,19 +230,23 @@ def test_new_with_right_number_of_unique_disks_one_node(self):
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE1",
       "jobs": ['load_balancer', 'taskqueue', 'shadow', 'login',
-               'taskqueue_master'] },
+               'taskqueue_master'],
+      "instance_type": "instance_type_1"},
     { "public_ip": "0.0.0.0",
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE2",
-      "jobs": ['memcache', 'appengine'] },
+      "jobs": ['memcache', 'appengine'],
+      "instance_type": "instance_type_1"},
     { "public_ip": "0.0.0.0",
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE3",
-      "jobs": ['zookeeper'] },
+      "jobs": ['zookeeper'],
+      "instance_type": "instance_type_1"},
     { "public_ip": "0.0.0.0",
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE4",
-      "jobs": ['database', 'db_master'] }
+      "jobs": ['database', 'db_master'],
+      "instance_type": "instance_type_1"}
   ]
diff --git a/test/test_remote_helper.py b/test/test_remote_helper.py
index b2c60732..4baa8480 100644
--- a/test/test_remote_helper.py
+++ b/test/test_remote_helper.py
@@ -529,19 +529,23 @@ def test_wait_for_machines_to_finish_loading(self):
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE1",
       "jobs": ['load_balancer', 'taskqueue', 'shadow', 'login',
-               'taskqueue_master'] },
+               'taskqueue_master'],
+      "instance_type": "instance_type_1"},
     { "public_ip": "0.0.0.0",
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE2",
-      "jobs": ['memcache', 'appengine'] },
+      "jobs": ['memcache', 'appengine'],
+      "instance_type": "instance_type_1"},
     { "public_ip": "0.0.0.0",
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE3",
-      "jobs": ['zookeeper'] },
+      "jobs": ['zookeeper'],
+      "instance_type": "instance_type_1"},
     { "public_ip": "0.0.0.0",
       "private_ip": "0.0.0.0",
       "instance_id": "i-APPSCALE4",
-      "jobs": ['database', 'db_master'] }
+      "jobs": ['database', 'db_master'],
+      "instance_type": "instance_type_1"}
   ]
 
   def test_start_all_nodes_reattach(self):
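The locations-JSON fixtures above feed the reattach path, where records from a previous run are matched back onto layout nodes by role set and, now, instance type. A simplified sketch of the matching rule from `NodeLayout.from_locations_json_list` (the `Node` stub here is minimal, not the real class):

```python
# Simplified matching rule: an old node record reattaches to a layout node
# only when both the role set and the instance type line up.
class Node(object):
  def __init__(self, roles, instance_type):
    self.roles, self.instance_type = roles, instance_type

def matches(old_node, node):
  return (set(old_node['jobs']) == set(node.roles)
          and old_node.get('instance_type') == node.instance_type)

old = {'jobs': ['zookeeper'], 'instance_type': 'instance_type_1'}
assert matches(old, Node(['zookeeper'], 'instance_type_1'))
assert not matches(old, Node(['zookeeper'], 'instance_type_2'))
```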