diff --git a/compute.yml b/compute.yml
index 74414fd6..ad38308f 100644
--- a/compute.yml
+++ b/compute.yml
@@ -14,7 +14,7 @@
     - citc_user
     - filesystem
     - ssh
-    #- security_updates
+    ##- security_updates
     - ntp
     - sssd
     - lmod
diff --git a/group_vars/compute.yml b/group_vars/compute.yml
index 550ce43f..3a2f18d0 100644
--- a/group_vars/compute.yml
+++ b/group_vars/compute.yml
@@ -14,6 +14,9 @@ mpi_packages:
   google:
     - mpich
     - openmpi
+  azure:
+    - mpich
+    - openmpi
   aws: []
 
 monitoring_role: client
diff --git a/group_vars/management.yml b/group_vars/management.yml
index dc96244a..5b21391a 100644
--- a/group_vars/management.yml
+++ b/group_vars/management.yml
@@ -12,6 +12,8 @@ slurm_role: mgmt
 slurm_elastic:
   oracle:
     config_directory: /home/slurm/.oci/
+  azure:
+    config_directory: /home/slurm/.oci/
 
 install_packages:
   - xorg-x11-xauth
@@ -27,6 +29,9 @@ mpi_packages:
   google:
     - openmpi-devel
     - mpich-devel
+  azure:
+    - openmpi-devel
+    - mpich-devel
   aws: []
 
 monitoring_role: master
diff --git a/roles/filesystem/tasks/main.yml b/roles/filesystem/tasks/main.yml
index 7bbdcf5d..84f43e87 100644
--- a/roles/filesystem/tasks/main.yml
+++ b/roles/filesystem/tasks/main.yml
@@ -26,7 +26,18 @@
     opts: defaults,nofail,nosuid
     state: mounted
   when:
-    ansible_local.citc.csp != "aws"
+    - ansible_local.citc.csp != "aws"
+    - ansible_local.citc.csp != "azure"
+
+- name: Mount shared file system now that fileserver is ready
+  mount:
+    path: /mnt/{{ filesystem_mount_point }}
+    src: "{{ filesystem_target_address }}:{{ filesystem_mount_point }}"
+    fstype: nfs
+    opts: rw,hard,rsize=1048576,wsize=1048576,vers=3,tcp,_netdev,noauto
+    state: mounted
+  when:
+    - ansible_local.citc.csp == "azure"
 
 - name: Mount shared file system
   mount:
diff --git a/roles/finalise/tasks/main.yml b/roles/finalise/tasks/main.yml
index 8a022e57..f2a840d2 100644
--- a/roles/finalise/tasks/main.yml
+++ b/roles/finalise/tasks/main.yml
@@ -9,6 +9,12 @@
     delay: 10
   tags: packer
 
+- name: update directory mode for the finalised files
+  file:
+    path: /mnt/shared
+    state: directory
+    mode: 0755
+
 - name: create directory for the finalised files
   file:
     path: /mnt/shared/finalised
diff --git a/roles/monitoring/tasks/main.yml b/roles/monitoring/tasks/main.yml
index de1aacdc..7e851fc8 100644
--- a/roles/monitoring/tasks/main.yml
+++ b/roles/monitoring/tasks/main.yml
@@ -20,11 +20,16 @@
     baseurl: https://repos.influxdata.com/centos/$releasever/{{ "arm64" if ansible_architecture == "aarch64" else ansible_architecture }}/stable/
     gpgkey: https://repos.influxdata.com/influxdb.key
 
+#- name: install telegraf package
+#  package:
+#    name: telegraf
+#    state: present
+#  notify: restart telegraf
+
 - name: install telegraf package
-  package:
+  yum:
     name: telegraf
-    state: present
-  notify: restart telegraf
+    disable_gpg_check: yes
 
 - name: enable the telegraf service
   service:
diff --git a/roles/packer/files/all.pkr.hcl b/roles/packer/files/all.pkr.hcl
index f92c810d..21f85e03 100644
--- a/roles/packer/files/all.pkr.hcl
+++ b/roles/packer/files/all.pkr.hcl
@@ -10,6 +10,13 @@ variable "aws_region" {}
 variable "aws_instance_type" {}
 variable "aws_arch" {}
 
+variable "azure_region" {}
+variable "azure_instance_type" {}
+variable "azure_resource_group" {}
+variable "azure_virtual_network" {}
+variable "azure_virtual_network_subnet" {}
+variable "azure_dns_zone" {}
+
 variable "oracle_availability_domain" {}
 variable "oracle_base_image_ocid" {}
 variable "oracle_compartment_ocid" {}
@@ -91,6 +98,22 @@ source "amazon-ebs" "aws" {
   }
 }
 
+source "azure-arm" "azure" {
+  managed_image_name = "${var.destination_image_name}-${var.cluster}-v{{timestamp}}"
+  managed_image_resource_group_name = var.azure_resource_group
+  build_resource_group_name = var.azure_resource_group
+  virtual_network_name = var.azure_virtual_network
+  virtual_network_subnet_name = var.azure_virtual_network_subnet
+  virtual_network_resource_group_name = var.azure_resource_group
+  vm_size = var.azure_instance_type
+  ssh_username = var.ssh_username
+  os_type = "Linux"
+  image_publisher = "OpenLogic"
+  image_offer = "CentOS"
+  image_sku = "8_4-gen2"
+}
+
+
 source "oracle-oci" "oracle" {
   image_name = "${var.destination_image_name}-${var.cluster}-v{{timestamp}}"
   availability_domain = var.oracle_availability_domain
@@ -111,6 +134,7 @@ build {
     "source.googlecompute.google",
     "source.amazon-ebs.aws",
     "source.oracle-oci.oracle",
+    "source.azure-arm.azure",
   ]
 
   provisioner "file" {
@@ -161,4 +185,12 @@ build {
   provisioner "shell" {
     script = "/home/citc/compute_image_extra.sh"
   }
+
+  provisioner "shell" {
+    script = "/home/citc/install_cvmfs_eessi.sh"
+  }
+
+  provisioner "shell" {
+    script = "/home/citc/compute_image_finalize.sh"
+  }
 }
diff --git a/roles/packer/files/compute_image_extra.sh b/roles/packer/files/compute_image_extra.sh
index 27bd331b..e4f7dd0c 100644
--- a/roles/packer/files/compute_image_extra.sh
+++ b/roles/packer/files/compute_image_extra.sh
@@ -8,4 +8,4 @@
 # sudo yum -y install cmake gcc-gfortran
 
 # to install CernVM-FS and configure access to EESSI, uncomment the line below:
-# /home/citc/install_cvmfs_eessi.sh
+#/home/citc/install_cvmfs_eessi.sh
diff --git a/roles/packer/files/compute_image_finalize.sh b/roles/packer/files/compute_image_finalize.sh
new file mode 100644
index 00000000..cd3d5b29
--- /dev/null
+++ b/roles/packer/files/compute_image_finalize.sh
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+/usr/sbin/waagent -force -deprovision && export HISTSIZE=0 && sync
diff --git a/roles/packer/tasks/main.yml b/roles/packer/tasks/main.yml
index f05ac85a..4494157d 100644
--- a/roles/packer/tasks/main.yml
+++ b/roles/packer/tasks/main.yml
@@ -65,6 +65,14 @@
     group: citc
     mode: u=rw,g=rw,o=
 
+- name: copy in packer finalize run script template
+  copy:
+    src: compute_image_finalize.sh
+    dest: /home/citc/compute_image_finalize.sh
+    owner: citc
+    group: citc
+    mode: u=rw,g=rw,o=
+
 - name: copy in EESSI install script
   copy:
     src: install_cvmfs_eessi.sh
diff --git a/roles/packer/templates/prepare_ansible.sh.j2 b/roles/packer/templates/prepare_ansible.sh.j2
index 0637bb52..8ac998d1 100644
--- a/roles/packer/templates/prepare_ansible.sh.j2
+++ b/roles/packer/templates/prepare_ansible.sh.j2
@@ -9,7 +9,7 @@ $(hostname) cluster_id={{ startnode_config.cluster_id }} packer_run=yes
 EOF'
 
-{% if ansible_local.citc.csp in ["aws", "google"] %}
+{% if ansible_local.citc.csp in ["aws", "google", "azure"] %}
 sudo yum install -y epel-release
 sudo dnf config-manager --set-enabled powertools
 {% elif ansible_local.citc.csp == "oracle" %}
@@ -17,6 +17,16 @@ sudo dnf install -y oracle-epel-release-el8
 sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
 sudo dnf config-manager --set-enabled ol8_codeready_builder
 {% endif %}
+{% if ansible_local.citc.csp in ["azure"] %}
+echo "[main]" | sudo tee -a /etc/NetworkManager/conf.d/dhclient.conf
+echo "dhcp=dhclient" | sudo tee -a /etc/NetworkManager/conf.d/dhclient.conf
+cat /etc/NetworkManager/conf.d/dhclient.conf
+echo "append domain-name \" {{ startnode_config.dns_zone }}\";" | sudo tee -a /etc/dhcp/dhclient.conf
+echo "append domain-search \" {{ startnode_config.dns_zone }}\";" | sudo tee -a /etc/dhcp/dhclient.conf
+sudo cat /etc/dhcp/dhclient.conf
+sudo systemctl restart NetworkManager
+cat /etc/resolv.conf
+{% endif %}
 sudo dnf install -y ansible git
 sudo cat /tmp/hosts
 sudo mkdir -p /etc/ansible/facts.d/
diff --git a/roles/packer/templates/variables.pkrvars.hcl.j2 b/roles/packer/templates/variables.pkrvars.hcl.j2
index 02d22f0e..b7059663 100644
--- a/roles/packer/templates/variables.pkrvars.hcl.j2
+++ b/roles/packer/templates/variables.pkrvars.hcl.j2
@@ -1,12 +1,19 @@
 ca_cert = "{{ ca_cert }}"
 cluster = "{{ startnode_config.cluster_id }}"
 destination_image_name = "citc-slurm-compute"
-ssh_username = "{%- if ansible_local.citc.csp in ["aws", "google"] -%}centos{%- else -%}opc{%- endif -%}"
+ssh_username = "{%- if ansible_local.citc.csp in ["aws", "google", "azure"] -%}centos{%- else -%}opc{%- endif -%}"
 
 aws_arch = "x86_64"
 aws_region = "{%- if startnode_config.region is defined -%}{{ startnode_config.region }}{%- endif -%}"
 aws_instance_type = "t2.nano"
 
+azure_region = "{%- if startnode_config.region is defined -%}{{ startnode_config.region }}{%- endif -%}"
+azure_resource_group = "{%- if startnode_config.resource_group is defined -%}{{ startnode_config.resource_group }}{%- endif -%}"
+azure_virtual_network = "{%- if startnode_config.virtual_network is defined -%}{{ startnode_config.virtual_network }}{%- endif -%}"
+azure_virtual_network_subnet = "{%- if startnode_config.virtual_network_subnet is defined -%}{{ startnode_config.virtual_network_subnet }}{%- endif -%}"
+azure_dns_zone = "{%- if startnode_config.dns_zone is defined -%}{{ startnode_config.dns_zone }}{%- endif -%}"
+azure_instance_type = "Standard_D4s_v3"
+
 google_destination_image_family = "citc-slurm-compute"
 google_network = "{%- if startnode_config.network_name is defined -%}{{ startnode_config.network_name }}{%- endif -%}"
 google_source_image_family = "centos-8"
diff --git a/roles/slurm/files/citc_azure.py b/roles/slurm/files/citc_azure.py
new file mode 100644
index 00000000..5bba89fc
--- /dev/null
+++ b/roles/slurm/files/citc_azure.py
@@ -0,0 +1,218 @@
+import asyncio
+import base64
+import re
+import subprocess
+import time
+from typing import Dict, Optional, Tuple, List
+
+import yaml  # type: ignore
+import os
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.resource import ResourceManagementClient
+from azure.mgmt.network import NetworkManagementClient
+from azure.mgmt.compute import ComputeManagementClient
+
+__all__ = ["get_nodespace", "start_node"]
+
+
+def load_yaml(filename: str) -> dict:
+    with open(filename, "r") as f:
+        return yaml.safe_load(f)
+
+
+def get_nodespace() -> Dict[str, str]:
+    """
+    Get the information about the space into which we were creating nodes
+    This will be static for all nodes in this cluster
+    """
+    return load_yaml("/etc/citc/startnode.yaml")
+
+
+def get_node_features(hostname):
+    features = subprocess.run(
+        ["sinfo", "--Format=features:200", "--noheader", f"--nodes={hostname}"],
+        stdout=subprocess.PIPE
+    ).stdout.decode().strip().split(',')
+    features = {f.split("=")[0]: f.split("=")[1] for f in features}
+    return features
+
+#def get_node_state(oci_config, log, compartment_id: str, hostname: str, cluster_id: str) -> str:
+def get_node_state(compute_client, log, hostname: str, resource_group: str) -> str:
+    """
+    Get the current node state of the VM for the given hostname
+    If there is no such VM, return "TERMINATED"
+    """
+    #matches = compute_client.virtual_machines.list(resource_group)
+    #print(matches)
+    #matches = oci.core.ComputeClient(oci_config).list_instances(compartment_id=compartment_id, display_name=hostname).data
+    #matches = [i for i in matches if i.freeform_tags.get("cluster") == cluster_id]
+    #still_exist = [i for i in matches if i.lifecycle_state != "TERMINATED"]
+    #if not still_exist:
+    #    return "TERMINATED"
+    #if len(still_exist) > 1:
+    #    log.error(f"{hostname}: Multiple matches found for {hostname}")
+    #return still_exist[0].lifecycle_state
+    return "node_state"
+
+
+#def get_ip(hostname: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+def get_ip() -> str:
+    #host_dns_match = re.match(r"(\d+\.){3}\d+", subprocess.run(["host", hostname], stdout=subprocess.PIPE).stdout.decode().split()[-1])
+    #dns_ip = host_dns_match.group(0) if host_dns_match else None
+
+    #slurm_dns_match = re.search(r"NodeAddr=((\d+\.){3}\d+)", subprocess.run(["scontrol", "show", "node", hostname], stdout=subprocess.PIPE).stdout.decode())
+    #slurm_ip = slurm_dns_match.group(1) if slurm_dns_match else None
+
+    #ip = dns_ip or slurm_ip
+
+    #return ip, dns_ip, slurm_ip
+    return "ip"
+
+
+async def start_node(log, host: str, nodespace: Dict[str, str], ssh_keys: str) -> None:
+    log.info(f"{host}: Starting")
+    credential = DefaultAzureCredential()
+    nodespace = get_nodespace()
+    subscription_id = nodespace["subscription"]
+    resource_group = nodespace["resource_group"]
+    region = nodespace["region"]
+    subnet = nodespace["subnet"]
+    dns_zone = "."+nodespace["dns_zone"]
+    ip = "ip"
+    resource_client = ResourceManagementClient(credential, subscription_id)
+    network_client = NetworkManagementClient(credential, subscription_id)
+    compute_client = ComputeManagementClient(credential, subscription_id)
+
+    with open("/home/slurm/bootstrap.sh", "rb") as f:
+        custom_data = base64.b64encode(f.read()).decode()
+
+    while get_node_state(compute_client, log, host, resource_group) == "TERMINATING":
+        log.info(f"{host}: host is currently terminating. Waiting...")
+        await asyncio.sleep(5)
+
+    images = compute_client.images.list_by_resource_group(resource_group)
+    for image in images:
+        vm_image = str(image.id)
+
+    print(f"Provisioning nic for {host}....")
+    poller = network_client.network_interfaces.begin_create_or_update(resource_group,host+"-nic",
+        {
+            "location": region,
+            "ip_configurations": [ {
+                "name": host+"-nic",
+                "subnet": { "id": subnet },
+            }]
+        }
+    )
+
+    nic_result = poller.result()
+
+    features = get_node_features(host)
+    shape = features["shape"]
+
+    if shape == "Standard_HC44rs" :
+        print(f"Provisioning availability set for {host}....")
+        poller = compute_client.availability_sets.create_or_update(resource_group,shape,
+            {
+                "location": region,
+                "platform_update_domain_count": 1,
+                "platform_fault_domain_count": 1,
+                "sku": { "name": "Aligned" },
+            }
+        )
+        avset_id = poller.id
+        print(f"Provisioning virtual machine {host}; this operation might take a few minutes.")
+        poller = compute_client.virtual_machines.begin_create_or_update(resource_group, host, {
+            "location": region,
+            "storage_profile": {
+                "image_reference": {
+                    "id": vm_image,
+                }
+            },
+            "hardware_profile": {
+                "vm_size": shape
+            },
+            "os_profile": {
+                "computer_name": host,
+                "admin_username": "centos",
+                "linux_configuration": {
+                    "ssh": {
+                        "public_keys" : [ {
+                            "path": "/home/centos/.ssh/authorized_keys",
+                            "key_data": ssh_keys
+                        } ]
+                    }
+                },
+                "custom_data": custom_data,
+            },
+            "network_profile": {
+                "network_interfaces": [{
+                    "id": nic_result.id,
+                }]
+            },
+            "availability_set": {
+                "id": avset_id,
+            }
+        })
+        vm_result = poller.result()
+    else :
+        print(f"Provisioning virtual machine {host}; this operation might take a few minutes.")
+        poller = compute_client.virtual_machines.begin_create_or_update(resource_group, host, {
+            "location": region,
+            "storage_profile": {
+                "image_reference": {
+                    "id": vm_image,
+                }
+            },
+            "hardware_profile": {
+                "vm_size": shape
+            },
+            "os_profile": {
+                "computer_name": host,
+                "admin_username": "centos",
+                "linux_configuration": {
+                    "ssh": {
+                        "public_keys" : [ {
+                            "path": "/home/centos/.ssh/authorized_keys",
+                            "key_data": ssh_keys
+                        } ]
+                    }
+                },
+                "custom_data": custom_data,
+            },
+            "network_profile": {
+                "network_interfaces": [{
+                    "id": nic_result.id,
+                }]
+            },
+        })
+        vm_result = poller.result()
+
+    print(f"Provisioned virtual machine {vm_result.name}")
+
+    log.info(f"{host}: Started")
+    return vm_result
+
+
+def terminate_instance(log, hosts):
+
+    credential = DefaultAzureCredential()
+    nodespace = get_nodespace()
+    subscription_id = nodespace["subscription"]
+    resource_group = nodespace["resource_group"]
+    resource_client = ResourceManagementClient(credential, subscription_id)
+    compute_client = ComputeManagementClient(credential, subscription_id)
+
+    for host in hosts:
+        log.info(f"Stopping {host}")
+
+        try:
+            vm = compute_client.virtual_machines.get(resource_group, host, expand='instanceView')
+            for stat in vm.instance_view.statuses:
+                if stat.code == "PowerState/running":
+                    poller = compute_client.virtual_machines.begin_delete(resource_group, host)
+                    vm_result = poller.result()
+                    print(f"Deleted virtual machine {vm_result.name}")
+        except:
+            print("An exception occurred")
+        log.info(f" Stopped {host}")
diff --git a/roles/slurm/files/update_config.py b/roles/slurm/files/update_config.py
index 2ac46142..8ecea15b 100644
--- a/roles/slurm/files/update_config.py
+++ b/roles/slurm/files/update_config.py
@@ -40,9 +40,9 @@ def get_nodespace() -> Dict[str, Dict[str, str]]:
 
 def encode_nodename(shape_name: str, node_number: int, cluster_id: str, ad: Optional[int] = None) -> str:
     if ad is not None:
-        return "{}-ad{}-{:0>4}".format(shape_name.lower().replace(".", "-"), ad, node_number)
+        return "{}-ad{}-{:0>4}".format(shape_name.lower().replace(".", "-").replace("_", "-"), ad, node_number)
     else:
-        return "{}-{}-{:0>4}".format(cluster_id, shape_name.lower().replace(".", "-"), node_number)
+        return "{}-{}-{:0>4}".format(cluster_id, shape_name.lower().replace(".", "-").replace("_", "-"), node_number)
 
 
 def create_slurmconf_line(number: int, shape_info: Dict, shape: str, cluster_id, ad: Optional[int] = None):
diff --git a/roles/slurm/tasks/elastic.yml b/roles/slurm/tasks/elastic.yml
index b6a038d6..9e36eb51 100644
--- a/roles/slurm/tasks/elastic.yml
+++ b/roles/slurm/tasks/elastic.yml
@@ -123,9 +123,15 @@
           --output text --query 'Tags[?Key==`Name`].Value')
       hostnamectl set-hostname ${name}
       {% endif %}
+      {% if ansible_local.citc.csp == "azure" %}
+      sudo sed -i -e 's/# OS.EnableRDMA=./OS.EnableRDMA=y/g' /etc/waagent.conf
+      sudo systemctl restart waagent
+      sudo mount /mnt/shared
+      sudo systemctl start slurmd
+      {% endif %}
 
       # Ensure that slurmd is running at this point
-      systemctl start slurmd
+      #systemctl start slurmd
 
       date
     dest: /home/slurm/bootstrap.sh
diff --git a/roles/slurm/tasks/elastic_azure.yml b/roles/slurm/tasks/elastic_azure.yml
new file mode 100644
index 00000000..a49e7c78
--- /dev/null
+++ b/roles/slurm/tasks/elastic_azure.yml
@@ -0,0 +1,25 @@
+---
+- name: install Azure tools
+  pip:
+    name:
+      - azure-mgmt-resource
+      - azure-mgmt-compute
+      - azure-mgmt-network
+      - azure-identity
+      - azure-cli
+    virtualenv: /opt/cloud_sdk
+
+- name: create azure config directory
+  file:
+    path: "{{ slurm_elastic.azure.config_directory }}"
+    state: directory
+    owner: slurm
+    group: slurm
+    mode: 0755
+
+- name: install startnode support module
+  copy:
+    src: citc_azure.py
+    dest: /opt/cloud_sdk/lib/python3.8/site-packages/citc_cloud.py
+    mode: 0755
+
diff --git a/roles/slurm/tasks/main.yml b/roles/slurm/tasks/main.yml
index debd807c..f3422cc5 100644
--- a/roles/slurm/tasks/main.yml
+++ b/roles/slurm/tasks/main.yml
@@ -215,6 +215,9 @@
 - include_tasks: elastic_aws.yml
   when: slurm_role == "mgmt" and ansible_local.citc.csp == "aws"
 
+- include_tasks: elastic_azure.yml
+  when: slurm_role == "mgmt" and ansible_local.citc.csp == "azure"
+
 - name: start service slurmctld
   service:
     name: slurmctld