diff --git a/ssh-keys.yaml b/ssh-keys.yaml index 2ec3ad95..8629bcf1 100644 --- a/ssh-keys.yaml +++ b/ssh-keys.yaml @@ -7,3 +7,6 @@ flokli: - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPTVTXOutUZZjXLB0lUSgeKcSY/8mxKkC0ingGK1whD2 hrosten: - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHFuB+uEjhoSdakwiKLD3TbNpbjnlXerEfZQbtRgvdSz +jrautiola: + - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGlFqSQFoSSuAS1IjmWBFXie329I5Aqf71QhVOnLTBG+ joonas@x1 + - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIB3h/Aj66ndKFtqpQ8H53tE9KbbO0obThC0qbQQKFQRr joonas@zeus diff --git a/terraform/arm-builder.tf b/terraform/arm-builder.tf new file mode 100644 index 00000000..af06bd96 --- /dev/null +++ b/terraform/arm-builder.tf @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: 2022-2024 TII (SSRC) and the Ghaf contributors +# SPDX-License-Identifier: Apache-2.0 + +locals { + arm_num_builders = local.opts[local.conf].num_builders_aarch64 +} + +module "arm_builder_vm" { + source = "./modules/arm-builder-vm" + + count = local.arm_num_builders + + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location + virtual_machine_name = "ghaf-builder-aarch64-${count.index}-${local.ws}" + virtual_machine_size = local.opts[local.conf].vm_size_builder_aarch64 + virtual_machine_osdisk_size = local.opts[local.conf].osdisk_size_builder + + virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ + users = [{ + name = "remote-build" + ssh_authorized_keys = [ + "${data.azurerm_key_vault_secret.ssh_remote_build_pub.value}" + ] + }] + write_files = [ + { + content = "AZURE_STORAGE_ACCOUNT_NAME=${data.azurerm_storage_account.binary_cache.name}", + "path" = "/var/lib/rclone-http/env" + } + ], + })]) + + subnet_id = azurerm_subnet.builders.id +} + +# Allow inbound SSH from the jenkins subnet (only) +resource "azurerm_network_interface_security_group_association" "arm_builder_vm" { + count = local.arm_num_builders + + network_interface_id = 
module.arm_builder_vm[count.index].virtual_machine_network_interface_id + network_security_group_id = azurerm_network_security_group.arm_builder_vm[count.index].id +} + +resource "azurerm_network_security_group" "arm_builder_vm" { + count = local.arm_num_builders + + name = "arm-builder-vm-${count.index}" + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location + + security_rule { + name = "AllowSSHFromJenkins" + priority = 400 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_ranges = [22] + source_address_prefix = azurerm_subnet.jenkins.address_prefixes[0] + destination_address_prefix = "*" + } +} + +# Allow the VMs to read from the binary cache bucket +resource "azurerm_role_assignment" "arm_builder_access_binary_cache" { + count = local.arm_num_builders + scope = data.azurerm_storage_container.binary_cache_1.resource_manager_id + role_definition_name = "Storage Blob Data Reader" + principal_id = module.arm_builder_vm[count.index].virtual_machine_identity_principal_id +} diff --git a/terraform/binary-cache.tf b/terraform/binary-cache.tf index 563c333e..c692adef 100644 --- a/terraform/binary-cache.tf +++ b/terraform/binary-cache.tf @@ -26,7 +26,7 @@ module "binary_cache_vm" { virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ users = [ - for user in toset(["bmg", "flokli", "hrosten"]) : { + for user in toset(["bmg", "flokli", "hrosten", "jrautiola"]) : { name = user sudo = "ALL=(ALL) NOPASSWD:ALL" ssh_authorized_keys = local.ssh_keys[user] diff --git a/terraform/builder.tf b/terraform/builder.tf index 4362a741..5e5e1cb0 100644 --- a/terraform/builder.tf +++ b/terraform/builder.tf @@ -15,7 +15,7 @@ module "builder_image" { } locals { - num_builders = local.opts[local.conf].num_builders + num_builders = local.opts[local.conf].num_builders_x86 } module "builder_vm" { @@ -25,8 +25,8 @@ module "builder_vm" { resource_group_name = 
azurerm_resource_group.infra.name location = azurerm_resource_group.infra.location - virtual_machine_name = "ghaf-builder-${count.index}-${local.ws}" - virtual_machine_size = local.opts[local.conf].vm_size_builder + virtual_machine_name = "ghaf-builder-x86-${count.index}-${local.ws}" + virtual_machine_size = local.opts[local.conf].vm_size_builder_x86 virtual_machine_osdisk_size = local.opts[local.conf].osdisk_size_builder virtual_machine_source_image = module.builder_image.image_id diff --git a/terraform/jenkins-controller.tf b/terraform/jenkins-controller.tf index f18bb6f4..8cc979f4 100644 --- a/terraform/jenkins-controller.tf +++ b/terraform/jenkins-controller.tf @@ -28,7 +28,7 @@ module "jenkins_controller_vm" { virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ users = [ - for user in toset(["bmg", "flokli", "hrosten"]) : { + for user in toset(["bmg", "flokli", "hrosten", "jrautiola"]) : { name = user sudo = "ALL=(ALL) NOPASSWD:ALL" ssh_authorized_keys = local.ssh_keys[user] @@ -53,15 +53,19 @@ module "jenkins_controller_vm" { # rather than having to recreate the VM whenever the list of builders is # changed. { - content = join("\n", [ - for ip in toset(module.builder_vm[*].virtual_machine_private_ip_address) : "ssh://remote-build@${ip} x86_64-linux /etc/secrets/remote-build-ssh-key 10 10 kvm,big-parallel - -" - ]), + content = join("\n", concat( + [for ip in toset(module.builder_vm[*].virtual_machine_private_ip_address) : "ssh://remote-build@${ip} x86_64-linux /etc/secrets/remote-build-ssh-key 10 1 kvm,nixos-test,benchmark,big-parallel - -"], + [for ip in toset(module.arm_builder_vm[*].virtual_machine_private_ip_address) : "ssh://remote-build@${ip} aarch64-linux /etc/secrets/remote-build-ssh-key 8 1 kvm,nixos-test,benchmark,big-parallel - -"] + )), "path" = "/etc/nix/machines" }, # Render /var/lib/builder-keyscan/scanlist, so known_hosts can be populated. 
{ - content = join("\n", toset(module.builder_vm[*].virtual_machine_private_ip_address)) - "path" = "/var/lib/builder-keyscan/scanlist" + content = join("\n", toset(concat( + module.builder_vm[*].virtual_machine_private_ip_address, + module.arm_builder_vm[*].virtual_machine_private_ip_address + ))), + "path" = "/var/lib/builder-keyscan/scanlist" }, { content = "SITE_ADDRESS=ghaf-jenkins-controller-${local.ws}.${azurerm_resource_group.infra.location}.cloudapp.azure.com", diff --git a/terraform/main.tf b/terraform/main.tf index 51bb39f7..dc50b037 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -79,29 +79,35 @@ locals { priv = { vm_size_binarycache = "Standard_D2_v3" osdisk_size_binarycache = "50" - vm_size_builder = "Standard_D2_v3" + vm_size_builder_x86 = "Standard_D2_v3" + vm_size_builder_aarch64 = "Standard_D2ps_v5" osdisk_size_builder = "150" vm_size_controller = "Standard_E2_v5" osdisk_size_controller = "150" - num_builders = 1 + num_builders_x86 = 1 + num_builders_aarch64 = 1 } dev = { vm_size_binarycache = "Standard_D2_v3" osdisk_size_binarycache = "250" - vm_size_builder = "Standard_D4_v3" + vm_size_builder_x86 = "Standard_D4_v3" + vm_size_builder_aarch64 = "Standard_D4ps_v5" osdisk_size_builder = "250" vm_size_controller = "Standard_E4_v5" osdisk_size_controller = "500" - num_builders = 1 + num_builders_x86 = 1 + num_builders_aarch64 = 1 } prod = { vm_size_binarycache = "Standard_D2_v3" osdisk_size_binarycache = "250" - vm_size_builder = "Standard_D8_v3" + vm_size_builder_x86 = "Standard_D8_v3" + vm_size_builder_aarch64 = "Standard_D8ps_v5" osdisk_size_builder = "500" vm_size_controller = "Standard_E4_v5" osdisk_size_controller = "1000" - num_builders = 2 + num_builders_x86 = 2 + num_builders_aarch64 = 2 } } diff --git a/terraform/modules/arm-builder-vm/README.md b/terraform/modules/arm-builder-vm/README.md new file mode 100644 index 00000000..bee7a209 --- /dev/null +++ b/terraform/modules/arm-builder-vm/README.md @@ -0,0 +1,19 @@ + + +# 
arm-builder-vm + +Terraform module spinning up an Azure aarch64 VM with Ubuntu and Nix. + +Modified from `azurerm-linux-vm` + +## Why not NixOS Image? + +- `virtualisation.azure.agent` does not support anything that isn't x86, [quite explicitly](https://github.com/NixOS/nixpkgs/blob/master/nixos/modules/virtualisation/azure-agent.nix#L38) + +- aarch64 Azure VMs (Standard_D2ps_v5 etc.) are all v5, and as such only support [Generation 2 hypervisor images](https://learn.microsoft.com/en-us/azure/virtual-machines/generation-2), which Nix also lacks support for. +There is a [stale pull request](https://github.com/NixOS/nixpkgs/pull/236110) in nixpkgs that tries to fix this issue, but it has not been active since June 2023. Part of the problem is that Gen2 images use EFI boot. + +For these reasons, this ARM builder is using Ubuntu with Nix installed on top, configured to be similar to the x86 builder's NixOS configuration. diff --git a/terraform/modules/arm-builder-vm/ubuntu-builder.sh b/terraform/modules/arm-builder-vm/ubuntu-builder.sh new file mode 100755 index 00000000..925050a2 --- /dev/null +++ b/terraform/modules/arm-builder-vm/ubuntu-builder.sh @@ -0,0 +1,172 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: 2022-2024 TII (SSRC) and the Ghaf contributors +# SPDX-License-Identifier: Apache-2.0 + +set -x # debug + +################################################################################ + +# Assume root if HOME and USER are unset +[ -z "${HOME}" ] && export HOME="/root" +[ -z "${USER}" ] && export USER="root" + +################################################################################ + +apt_update() { + sudo apt-get update -y + sudo apt-get upgrade -y + sudo apt-get install -y ca-certificates curl xz-utils +} + +install_nix() { + type="$1" + if [ "$type" = "single" ]; then + # Single-user + sh <(curl -L https://nixos.org/nix/install) --yes --no-daemon + elif [ "$type" = "multi" ]; then + # Multi-user + sh <(curl -L https://nixos.org/nix/install)
--yes --daemon + else + echo "Error: unknown installation type: '$type'" + exit 1 + fi + # Fix https://github.com/nix-community/home-manager/issues/3734: + sudo mkdir -m 0755 -p /nix/var/nix/{profiles,gcroots}/per-user/"$USER" + sudo chown -R "$USER:nixbld" "/nix/var/nix/profiles/per-user/$USER" + # Enable flakes + extra_nix_conf="experimental-features = nix-command flakes" + sudo sh -c "printf '$extra_nix_conf\n'>>/etc/nix/nix.conf" + # https://github.com/NixOS/nix/issues/1078#issuecomment-1019327751 + for f in /nix/var/nix/profiles/default/bin/nix*; do + sudo ln -fs "$f" "/usr/bin/$(basename "$f")" + done +} + +configure_builder() { + # Add user: remote-build + # Extra nix config for the builder, + # for detailed description of each of the below options see: + # https://nixos.org/manual/nix/stable/command-ref/conf-file + extra_nix_conf=" +# 20 GB (20*1024*1024*1024) +min-free = 21474836480 +# 500 GB (500*1024*1024*1024) +# osdisk size for prod builders +max-free = 536870912000 +system-features = nixos-test benchmark big-parallel kvm +trusted-users = remote-build +substituters = http://localhost:8080 https://cache.vedenemo.dev https://cache.nixos.org +trusted-public-keys = ghaf-infra-dev:EdgcUJsErufZitluMOYmoJDMQE+HFyveI/D270Cr84I= cache.vedenemo.dev:8NhplARANhClUSWJyLVk4WMyy1Wb4rhmWW2u8AejH9E= cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY=" + sudo sh -c "printf '$extra_nix_conf\n' >> /etc/nix/nix.conf" +} + +configure_rclone() { + # The version of rclone in ubuntu repositories is too old to include --azureblob-env-auth + # https://rclone.org/install/ + sudo -v + curl https://rclone.org/install.sh | sudo bash + service_file=" +[Unit] +After=network.target +Requires=network.target + +[Service] +DynamicUser=true +EnvironmentFile=/var/lib/rclone-http/env +ExecStart=/usr/bin/rclone serve http --azureblob-env-auth --read-only --addr localhost:8080 :azureblob:binary-cache-v1 +Restart=always +RestartSec=2 +RuntimeDirectory=rclone-http +Type=notify" + sudo 
sh -c "printf '$service_file\n' > /etc/systemd/system/rclone-http.service" + sudo systemctl daemon-reload + sudo systemctl enable rclone-http.service + sudo systemctl start rclone-http.service +} + +restart_nix_daemon() { + # Re-start nix-daemon + if systemctl list-units | grep -iq "nix-daemon"; then + sudo systemctl restart nix-daemon + if ! systemctl status nix-daemon; then + echo "Error: nix-daemon failed to start" + exit 1 + fi + fi +} + +uninstall_nix() { + # https://github.com/NixOS/nix/issues/1402 + if grep -q nixbld /etc/passwd; then + grep nixbld /etc/passwd | awk -F ":" '{print $1}' | xargs -t -n 1 sudo userdel -r + fi + if grep -q nixbld /etc/group; then + sudo groupdel nixbld + fi + rm -rf "$HOME/"{.nix-channels,.nix-defexpr,.nix-profile,.config/nixpkgs,.config/nix,.config/home-manager,.local/state/nix,.local/state/home-manager} + sudo rm -rf /etc/profile.d/nix.sh + if [ -d "/nix" ]; then + sudo rm -rf /nix + fi + if [ -d "/etc/nix" ]; then + sudo rm -fr /etc/nix + fi + sudo find /etc -iname "*backup-before-nix*" -delete + sudo find -L /usr/bin -iname "nix*" -delete + [ -f "$HOME/.profile" ] && sed -i "/\/nix/d" "$HOME/.profile" + [ -f "$HOME/.bash_profile" ] && sed -i "/\/nix/d" "$HOME/.bash_profile" + [ -f "$HOME/.bashrc" ] && sed -i "/\/nix/d" "$HOME/.bashrc" + if systemctl list-units | grep -iq "nix-daemon"; then + sudo systemctl stop nix-daemon nix-daemon.socket + sudo systemctl disable nix-daemon nix-daemon.socket + sudo find /etc/systemd -iname "*nix-daemon*" -delete + sudo find /usr/lib/systemd -iname "*nix-daemon*" -delete + sudo systemctl daemon-reload + sudo systemctl reset-failed + fi + unset NIX_PATH +} + +outro() { + set +x + echo "" + nixpkgs_ver=$(nix-instantiate --eval -E '(import {}).lib.version' 2>/dev/null) + if [ -n "$nixpkgs_ver" ]; then + echo "Installed nixpkgs version: $nixpkgs_ver" + else + echo "Failed reading installed nixpkgs version" + exit 1 + fi + echo "" + echo "Open a new terminal for the changes to take impact" + echo 
"" +} + +exit_unless_command_exists() { + if ! command -v "$1" 2>/dev/null; then + echo "Error: command '$1' is not installed" >&2 + exit 1 + fi +} + +################################################################################ + +main() { + exit_unless_command_exists "apt-get" + exit_unless_command_exists "systemctl" + apt_update + uninstall_nix + install_nix "multi" + configure_builder + configure_rclone + restart_nix_daemon + exit_unless_command_exists "nix-shell" + outro +} + +################################################################################ + +main "$@" + +################################################################################ diff --git a/terraform/modules/arm-builder-vm/variables.tf b/terraform/modules/arm-builder-vm/variables.tf new file mode 100644 index 00000000..c45de13f --- /dev/null +++ b/terraform/modules/arm-builder-vm/variables.tf @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2022-2024 TII (SSRC) and the Ghaf contributors +# SPDX-License-Identifier: Apache-2.0 + +variable "resource_group_name" { + type = string +} + +variable "location" { + type = string +} + +variable "virtual_machine_name" { + type = string +} + +variable "virtual_machine_size" { + type = string +} + +variable "virtual_machine_osdisk_size" { + type = string +} + +variable "virtual_machine_custom_data" { + type = string + default = "" +} + +variable "allocate_public_ip" { + type = bool + default = false +} + +variable "subnet_id" { + type = string + description = "The subnet ID to attach to the VM and allocate an IP from" +} + +variable "data_disks" { + description = "List of dict containing keys of the storage_data_disk block" + default = [] +} diff --git a/terraform/modules/arm-builder-vm/virtual_machine.tf b/terraform/modules/arm-builder-vm/virtual_machine.tf new file mode 100644 index 00000000..856a9cfc --- /dev/null +++ b/terraform/modules/arm-builder-vm/virtual_machine.tf @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: 2022-2024 TII (SSRC) and the Ghaf 
contributors +# SPDX-License-Identifier: Apache-2.0 + +resource "azurerm_virtual_machine" "main" { + name = var.virtual_machine_name + resource_group_name = var.resource_group_name + location = var.location + vm_size = var.virtual_machine_size + + delete_os_disk_on_termination = true + delete_data_disks_on_termination = false + + network_interface_ids = [azurerm_network_interface.default.id] + + storage_image_reference { + publisher = "Canonical" + offer = "0001-com-ubuntu-server-jammy" + sku = "22_04-lts-arm64" + version = "latest" + } + + identity { + type = "SystemAssigned" + } + + os_profile { + computer_name = var.virtual_machine_name + # Unused, but required by the API. May not be root either + admin_username = "foo" + admin_password = "S00persecret" + + # We only set custom_data here, not user_data. + # user_data is more recent, and allows updates without recreating the machine, + # but at least cloud-init 23.1.2 blocks boot if custom_data is not set. + # (It logs about not being able to mount /dev/sr0 to /metadata). + # This can be worked around by setting custom_data to a static placeholder, + # but user_data is still ignored. + # TODO: check this again with a more recent cloud-init version. + custom_data = (var.virtual_machine_custom_data == "") ? null : base64encode(var.virtual_machine_custom_data) + } + + os_profile_linux_config { + # We *don't* support password auth, and this doesn't change anything. + # However, if we don't set this to false we need to + # specify additional pubkeys. + disable_password_authentication = false + # We can't use admin_ssh_key, as it only works for the admin_username. 
+ } + + boot_diagnostics { + enabled = true + # azurerm_virtual_machine doesn't support the managed storage account + storage_uri = azurerm_storage_account.boot_diag.primary_blob_endpoint + } + + storage_os_disk { + name = "${var.virtual_machine_name}-osdisk" # needs to be unique + caching = "ReadWrite" + create_option = "FromImage" + managed_disk_type = "Standard_LRS" + disk_size_gb = var.virtual_machine_osdisk_size + } + + dynamic "storage_data_disk" { + for_each = var.data_disks + + content { + # use lookup here, so keys can be set optionally + name = lookup(storage_data_disk.value, "name", null) + caching = lookup(storage_data_disk.value, "caching", null) + create_option = "Attach" + # This has to be passed, even for "Attach" + disk_size_gb = lookup(storage_data_disk.value, "disk_size_gb", null) + lun = lookup(storage_data_disk.value, "lun", null) + + managed_disk_type = lookup(storage_data_disk.value, "managed_disk_type", null) + managed_disk_id = lookup(storage_data_disk.value, "managed_disk_id", null) + } + } +} + +resource "azurerm_network_interface" "default" { + name = "${var.virtual_machine_name}-nic" + resource_group_name = var.resource_group_name + location = var.location + + ip_configuration { + name = "internal" + subnet_id = var.subnet_id + private_ip_address_allocation = "Dynamic" + public_ip_address_id = (var.allocate_public_ip) ? azurerm_public_ip.default[0].id : null + } +} + +resource "azurerm_public_ip" "default" { + count = (var.allocate_public_ip) ? 1 : 0 + + name = "${var.virtual_machine_name}-pub-ip" + domain_name_label = var.virtual_machine_name + resource_group_name = var.resource_group_name + location = var.location + allocation_method = "Static" +} + +# Create a random string, and a storage account using that random string. 
+resource "random_string" "boot_diag" { + length = "8" + special = "false" + upper = false +} + +resource "azurerm_storage_account" "boot_diag" { + name = "${random_string.boot_diag.result}bootdiag" + resource_group_name = var.resource_group_name + location = var.location + account_tier = "Standard" + account_replication_type = "GRS" +} + +resource "azurerm_virtual_machine_extension" "deploy_ubuntu_builder" { + name = "${var.virtual_machine_name}-vmext" + virtual_machine_id = azurerm_virtual_machine.main.id + publisher = "Microsoft.Azure.Extensions" + type = "CustomScript" + type_handler_version = "2.1" + settings = <