From 5c519a7dab7c0364305d26c67b378b974d21e4dd Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 30 Nov 2023 17:31:32 +0200 Subject: [PATCH 01/88] devshell.nix: add jq This is used by the nix_build.sh script used to build images with terraform. Signed-off-by: Florian Klink --- nix/devshell.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/nix/devshell.nix b/nix/devshell.nix index 5935bc71..ed5b777b 100644 --- a/nix/devshell.nix +++ b/nix/devshell.nix @@ -7,6 +7,7 @@ packages = with pkgs; [ azure-cli git + jq nix nixos-rebuild python3.pkgs.black From ae56cbfe97006f4b6ed4e9ad7e5f92f6be9ea6df Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 5 Dec 2023 13:47:33 +0200 Subject: [PATCH 02/88] tf-modules: init azurerm-nix-vm-image This introduces a terraform module that can be used to nix-build and upload VM images to Azure. nix-build.sh originates from https://cs.tvl.fyi/depot/-/blob/ops/terraform/deploy-nixos/nixos-eval.sh, which is why it inherits its copyright from there. Signed-off-by: Florian Klink --- tf-modules/azurerm-nix-vm-image/README.md | 9 +++++ tf-modules/azurerm-nix-vm-image/main.tf | 35 ++++++++++++++++++++ tf-modules/azurerm-nix-vm-image/nix-build.sh | 20 +++++++++++ tf-modules/azurerm-nix-vm-image/variables.tf | 35 ++++++++++++++++++++ 4 files changed, 99 insertions(+) create mode 100644 tf-modules/azurerm-nix-vm-image/README.md create mode 100644 tf-modules/azurerm-nix-vm-image/main.tf create mode 100755 tf-modules/azurerm-nix-vm-image/nix-build.sh create mode 100644 tf-modules/azurerm-nix-vm-image/variables.tf diff --git a/tf-modules/azurerm-nix-vm-image/README.md b/tf-modules/azurerm-nix-vm-image/README.md new file mode 100644 index 00000000..b610710c --- /dev/null +++ b/tf-modules/azurerm-nix-vm-image/README.md @@ -0,0 +1,9 @@ + + +# azurerm-nix-vm-image + +Tooling to build an Azure VM image (with Nix) and upload to Azure. diff --git a/tf-modules/azurerm-nix-vm-image/main.tf b/tf-modules/azurerm-nix-vm-image/main.tf new file mode 100644 index 00000000..f788ade7 --- /dev/null +++ b/tf-modules/azurerm-nix-vm-image/main.tf @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +resource "azurerm_storage_blob" "default" { + name = "${var.name}.vhd" + storage_account_name = var.storage_account_name + storage_container_name = var.storage_container_name + type = "Page" # necessary to be able to create an image out of it + source = "${data.external.nix_build.result.outPath}/disk.vhd" +} + +data "external" "nix_build" { + program = ["${path.module}/nix-build.sh"] + + query = { + attrpath = var.nix_attrpath + entrypoint = var.nix_entrypoint + } +} + +resource "azurerm_image" "default" { + name = var.name + resource_group_name = var.resource_group_name + location = var.location + os_disk { + blob_uri = azurerm_storage_blob.default.url + os_state = "Generalized" + os_type = "Linux" + } +} + +output "image_id" { + value = azurerm_image.default.id +} diff --git a/tf-modules/azurerm-nix-vm-image/nix-build.sh b/tf-modules/azurerm-nix-vm-image/nix-build.sh new file mode 100755 index 00000000..0e194f56 --- /dev/null +++ b/tf-modules/azurerm-nix-vm-image/nix-build.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: 2023 The TVL Authors +# +# SPDX-License-Identifier: MIT + +# +# Builds a derivation at the given attribute path. +set -ueo pipefail + +# Load input variables from Terraform. jq's @sh format takes care of +# escaping. 
+eval "$(jq -r '@sh "ATTRPATH=\(.attrpath) && ENTRYPOINT=\(.entrypoint)"')" + +# Evaluate and build the derivation. +[[ -z "$ENTRYPOINT" ]] && ENTRYPOINT=$(git rev-parse --show-toplevel) +OUTPATH=$(nix-build --no-out-link -A "${ATTRPATH}" "${ENTRYPOINT}") + +# Return the output path back to Terraform. +jq -n --arg outPath "$OUTPATH" '{"outPath":$outPath}' diff --git a/tf-modules/azurerm-nix-vm-image/variables.tf b/tf-modules/azurerm-nix-vm-image/variables.tf new file mode 100644 index 00000000..cf655ec3 --- /dev/null +++ b/tf-modules/azurerm-nix-vm-image/variables.tf @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +variable "nix_attrpath" { + type = string + description = "Nix attribute path building a directory containing a disk.vhd file" +} + +variable "nix_entrypoint" { + type = string + description = "Path to the .nix file exposing the attribute path" +} + +variable "resource_group_name" { + type = string +} + +variable "location" { + type = string +} + +variable "storage_account_name" { + type = string +} + +variable "storage_container_name" { + type = string +} + +variable "name" { + type = string + description = "Name of the VM image." +} + From 40d6f5af8d53f9d72e981ad872cc4cbf192dc75e Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 5 Dec 2023 13:51:21 +0200 Subject: [PATCH 03/88] tf-modules: init azurerm-linux-vm This groups some common together some resources to create a VM. We might introduce more flexibility at a later point. Signed-off-by: Florian Klink --- tf-modules/azurerm-linux-vm/README.md | 9 +++ tf-modules/azurerm-linux-vm/variables.tf | 29 ++++++++ .../azurerm-linux-vm/virtual_machine.tf | 70 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 tf-modules/azurerm-linux-vm/README.md create mode 100644 tf-modules/azurerm-linux-vm/variables.tf create mode 100644 tf-modules/azurerm-linux-vm/virtual_machine.tf diff --git a/tf-modules/azurerm-linux-vm/README.md b/tf-modules/azurerm-linux-vm/README.md new file mode 100644 index 00000000..7dde4a7b --- /dev/null +++ b/tf-modules/azurerm-linux-vm/README.md @@ -0,0 +1,9 @@ + + +# azurerm-linux-vm + +Terraform module spinning up a Azure VM. 
diff --git a/tf-modules/azurerm-linux-vm/variables.tf b/tf-modules/azurerm-linux-vm/variables.tf new file mode 100644 index 00000000..50977a37 --- /dev/null +++ b/tf-modules/azurerm-linux-vm/variables.tf @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +variable "resource_group_name" { + type = string +} + +variable "location" { + type = string +} + +variable "virtual_machine_name" { + type = string +} + +variable "virtual_machine_size" { + type = string +} + +variable "virtual_machine_source_image" { + type = string +} + +variable "subnet_id" { + type = string + description = "The subnet ID to attach to the VM and allocate an IP from" +} + diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf new file mode 100644 index 00000000..e5255aa5 --- /dev/null +++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +resource "azurerm_linux_virtual_machine" "main" { + name = var.virtual_machine_name + resource_group_name = var.resource_group_name + location = var.location + size = var.virtual_machine_size + + # Unused, but required by the API. May not be root either + admin_username = "foo" + admin_password = "S00persecret" + + # We *don't* support password auth, and this doesn't change anything. + # However, if we don't set this to false we need to + # specify additional pubkeys. + disable_password_authentication = false + # We can't use admin_ssh_key, as it only works for the admin_username. + + network_interface_ids = [azurerm_network_interface.default.id] + source_image_id = var.virtual_machine_source_image + + os_disk { + caching = "ReadWrite" + storage_account_type = "Standard_LRS" + } +} + +resource "azurerm_network_security_group" "ssh_inbound" { + name = "${var.virtual_machine_name}-nsg-ssh-inbound" + resource_group_name = var.resource_group_name + location = var.location + security_rule { + name = "AllowSSHInbound" + priority = 300 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = "22" + source_address_prefix = "*" + destination_address_prefix = "*" + } +} +resource "azurerm_network_interface_security_group_association" "apply_ssh_inbound" { + network_interface_id = azurerm_network_interface.default.id + network_security_group_id = azurerm_network_security_group.ssh_inbound.id +} + +resource "azurerm_network_interface" "default" { + name = "${var.virtual_machine_name}-nic" + resource_group_name = var.resource_group_name + location = var.location + + ip_configuration { + name = "internal" + subnet_id = var.subnet_id + private_ip_address_allocation = "Dynamic" + public_ip_address_id = azurerm_public_ip.default.id + } +} + +resource "azurerm_public_ip" "default" { + name = "${var.virtual_machine_name}-pub-ip" + domain_name_label = var.virtual_machine_name + resource_group_name = var.resource_group_name + location = var.location + allocation_method = "Static" +} From 8b728d152e2f9a0fba22e84ad7fc5531aa0ef254 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 22 Nov 2023 13:20:24 +0200 Subject: [PATCH 04/88] hosts/jenkins-controller: init We can just include azure-config.nix from nixpkgs. It pulls in azure- common.nix, which contains all necessary kernel config / udev rules. 
It also defines a `config.system.azureImage` attribute, which builds a
vhd that we can import into Azure using the `azurerm-nix-vm-image`
terraform module.

The resulting image IDs can be referred to from source_image_id in
Terraform (using azurerm-linux-vm, for example), allowing us to boot
the desired machine config out of the box, without having to do a
two-stage deploy.

Signed-off-by: Florian Klink
---
 hosts/default.nix                          | 12 ++++++++----
 hosts/jenkins-controller/configuration.nix | 14 ++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)
 create mode 100644 hosts/jenkins-controller/configuration.nix

diff --git a/hosts/default.nix b/hosts/default.nix
index 6ec1a614..a825a844 100644
--- a/hosts/default.nix
+++ b/hosts/default.nix
@@ -28,10 +28,6 @@
       inherit specialArgs;
       modules = [./binarycache/configuration.nix];
     };
-    monitoring = lib.nixosSystem {
-      inherit specialArgs;
-      modules = [./monitoring/configuration.nix];
-    };
     ficolobuild3 = lib.nixosSystem {
       inherit specialArgs;
       modules = [./ficolobuild/build3.nix];
@@ -40,9 +36,17 @@
       inherit specialArgs;
       modules = [./ficolobuild/build4.nix];
     };
+    jenkins-controller = lib.nixosSystem {
+      inherit specialArgs;
+      modules = [./jenkins-controller/configuration.nix];
+    };
     prbuilder = lib.nixosSystem {
       inherit specialArgs;
       modules = [./prbuilder/configuration.nix];
     };
+    monitoring = lib.nixosSystem {
+      inherit specialArgs;
+      modules = [./monitoring/configuration.nix];
+    };
   };
 }

diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix
new file mode 100644
index 00000000..d04afefe
--- /dev/null
+++ b/hosts/jenkins-controller/configuration.nix
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII)
+#
+# SPDX-License-Identifier: Apache-2.0
+{
+  lib,
+  modulesPath,
+  ...
+}: {
+  imports = ["${modulesPath}/virtualisation/azure-config.nix"];
+
+  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
+
+  system.stateVersion = "23.05";
+}

From e58d8e4f73d8bfe8868a5a8f8f62cb23e5d5c9c6 Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Wed, 6 Dec 2023 15:15:44 +0200
Subject: [PATCH 05/88] azurerm-linux-vm: add virtual_machine_custom_data

This allows injecting custom userdata into the VM at instance creation
time, which we can use to provision some config (like SSH pubkey
config) that's not part of the NixOS image.

Signed-off-by: Florian Klink
---
 tf-modules/azurerm-linux-vm/variables.tf       |  5 +++++
 tf-modules/azurerm-linux-vm/virtual_machine.tf | 14 ++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/tf-modules/azurerm-linux-vm/variables.tf b/tf-modules/azurerm-linux-vm/variables.tf
index 50977a37..4a6bd79c 100644
--- a/tf-modules/azurerm-linux-vm/variables.tf
+++ b/tf-modules/azurerm-linux-vm/variables.tf
@@ -22,6 +22,11 @@ variable "virtual_machine_source_image" {
   type = string
 }
 
+variable "virtual_machine_custom_data" {
+  type    = string
+  default = ""
+}
+
 variable "subnet_id" {
   type        = string
   description = "The subnet ID to attach to the VM and allocate an IP from"
 }
 
diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf
index e5255aa5..71b988fd 100644
--- a/tf-modules/azurerm-linux-vm/virtual_machine.tf
+++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf
@@ -21,6 +21,20 @@ resource "azurerm_linux_virtual_machine" "main" {
   network_interface_ids = [azurerm_network_interface.default.id]
   source_image_id       = var.virtual_machine_source_image
 
+  # We only set custom_data here, not user_data.
+ # user_data is more recent, and allows updates without recreating the machine, + # but at least cloud-init 23.1.2 blocks boot if custom_data is not set. + # (It logs about not being able to mount /dev/sr0 to /metadata). + # This can be worked around by setting custom_data to a static placeholder, + # but user_data is still ignored. + # TODO: check this again with a more recent cloud-init version. + custom_data = (var.virtual_machine_custom_data == "") ? null : base64encode(var.virtual_machine_custom_data) + + # Enable boot diagnostics, use the managed storage account to store them + boot_diagnostics { + storage_account_uri = null + } + os_disk { caching = "ReadWrite" storage_account_type = "Standard_LRS" From 3a0c52de1307df6a97f4a7bb3e0a73ef13cbe6ea Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 15:19:59 +0200 Subject: [PATCH 06/88] hosts/jenkins-controller: enable cloud-init Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index d04afefe..7071c784 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -10,5 +10,9 @@ nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; + # enable cloud-init, so instance metadata is set accordingly and we can use + # cloud-config for ssh key management. + services.cloud-init.enable = true; + system.stateVersion = "23.05"; } From 08a320f03660bffbf59ece68df39544ceecdd0f9 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 15:39:12 +0200 Subject: [PATCH 07/88] services/openssh: add kitty terminfo Signed-off-by: Florian Klink --- services/openssh/default.nix | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/services/openssh/default.nix b/services/openssh/default.nix index 066a02b6..868a6baf 100644 --- a/services/openssh/default.nix +++ b/services/openssh/default.nix @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 { + pkgs, +}: { services.openssh = { enable = true; @@ -30,4 +32,8 @@ # Ban brute force SSH services.fail2ban.enable = true; + + environment.systemPackages = [ + pkgs.kitty.terminfo + ]; } From f2120d6f139f5419d3384db437d9628c1ae3344b Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 15:42:46 +0200 Subject: [PATCH 08/88] hosts/jenkins-controller: include service-openssh module Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 7071c784..6c284a90 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -2,11 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 { + self, lib, modulesPath, ... }: { - imports = ["${modulesPath}/virtualisation/azure-config.nix"]; + imports = [ + "${modulesPath}/virtualisation/azure-config.nix" + self.nixosModules.service-openssh + ]; nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; From 65a2ba41ce4f1769d298583193163610da31dad8 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 16:00:34 +0200 Subject: [PATCH 09/88] services/openssh: set with priorities azure-common.nix already sets services.openssh.settings.{PermitRootLogin,ClientAliveInterval}, so we need to decide what wins. 
To keep the intended behaviour, we want to mkForce PermitRootLogin to "no" (azure-common.nix sets "prohibit-password"), and set the ClientAliveInterval with mkDefault - bumping that timeout probably makes sense for azure, and we don't want the setting in this file to take priority. Signed-off-by: Florian Klink --- services/openssh/default.nix | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/openssh/default.nix b/services/openssh/default.nix index 868a6baf..8a870618 100644 --- a/services/openssh/default.nix +++ b/services/openssh/default.nix @@ -3,15 +3,17 @@ # SPDX-License-Identifier: Apache-2.0 { pkgs, + lib, + ... }: { services.openssh = { enable = true; settings = { - PermitRootLogin = "no"; + PermitRootLogin = lib.mkForce "no"; KbdInteractiveAuthentication = false; PasswordAuthentication = false; - ClientAliveInterval = 60; + ClientAliveInterval = lib.mkDefault 60; }; # Only allow ed25519 keys From fc3d7e1343c4809ec01af5154133c285eccfc525 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 16:23:04 +0200 Subject: [PATCH 10/88] add ssh-keys.yaml This file contains all ssh public keys used by real humans. It's parsed from Terraform to inject into instance metadata. Signed-off-by: Florian Klink --- ssh-keys.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 ssh-keys.yaml diff --git a/ssh-keys.yaml b/ssh-keys.yaml new file mode 100644 index 00000000..2ec3ad95 --- /dev/null +++ b/ssh-keys.yaml @@ -0,0 +1,9 @@ +# This file contains all ssh public keys used by real humans. +# It's parsed from Terraform to inject into instance metadata. +bmg: + - sk-ssh-ed25519@openssh.com AAAAGnNrLXNzaC1lZDI1NTE5QG9wZW5zc2guY29tAAAAIEJ9ewKwo5FLj6zE30KnTn8+nw7aKdei9SeTwaAeRdJDAAAABHNzaDo= + - sk-ssh-ed25519@openssh.com AAAAGnNrLXNzaC1lZDI1NTE5QG9wZW5zc2guY29tAAAAIA/pwHnzGNM+ZU4lANGROTRe2ZHbes7cnZn72Oeun/MCAAAABHNzaDo= +flokli: + - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPTVTXOutUZZjXLB0lUSgeKcSY/8mxKkC0ingGK1whD2 +hrosten: + - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHFuB+uEjhoSdakwiKLD3TbNpbjnlXerEfZQbtRgvdSz From 3124047a384c31c7a5983bfc2247c565052c2785 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 16:23:33 +0200 Subject: [PATCH 11/88] terraform/jenkins: init This builds the jenkins-master Nix image, turns it into a bootable Azure image, and then boots an instance with the image. 
Signed-off-by: Florian Klink --- terraform/jenkins/image_storage.tf | 21 ++++++++++++ terraform/jenkins/jenkins_controller.tf | 43 +++++++++++++++++++++++++ terraform/jenkins/main.tf | 33 +++++++++++++++++++ terraform/jenkins/remote_state.tf | 18 +++++++++++ 4 files changed, 115 insertions(+) create mode 100644 terraform/jenkins/image_storage.tf create mode 100644 terraform/jenkins/jenkins_controller.tf create mode 100644 terraform/jenkins/main.tf create mode 100644 terraform/jenkins/remote_state.tf diff --git a/terraform/jenkins/image_storage.tf b/terraform/jenkins/image_storage.tf new file mode 100644 index 00000000..c5cd7c64 --- /dev/null +++ b/terraform/jenkins/image_storage.tf @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +# Storage account and storage container used to store VM images + +resource "azurerm_storage_account" "vm_images" { + name = "ghafinfravmimages" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + account_tier = "Standard" + account_replication_type = "LRS" + allow_nested_items_to_be_public = false +} + +resource "azurerm_storage_container" "vm_images" { + name = "ghaf-infra-vm-images" + storage_account_name = azurerm_storage_account.vm_images.name + container_access_type = "private" +} + diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf new file mode 100644 index 00000000..2d0e8d15 --- /dev/null +++ b/terraform/jenkins/jenkins_controller.tf @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +# Build the Jenkins controller image +module "jenkins_controller_image" { + source = "../../tf-modules/azurerm-nix-vm-image" + + nix_attrpath = "outputs.nixosConfigurations.jenkins-controller.config.system.build.azureImage" + nix_entrypoint = "${path.module}/../.." 
+ + + name = "jenkins-controller" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + storage_account_name = azurerm_storage_account.vm_images.name + storage_container_name = azurerm_storage_container.vm_images.name +} + +# Create a machine using this image +module "jenkins_controller_vm" { + source = "../../tf-modules/azurerm-linux-vm" + + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + virtual_machine_name = "ghaf-jenkins-controller" + virtual_machine_size = "Standard_D1_v2" + virtual_machine_source_image = module.jenkins_controller_image.image_id + + virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ + users = [ + for user in toset(["bmg", "flokli", "hrosten"]) : { + name = user + sudo = "ALL=(ALL) NOPASSWD:ALL" + ssh_authorized_keys = local.ssh_keys[user] + } + ] + })]) + + subnet_id = azurerm_subnet.jenkins.id +} diff --git a/terraform/jenkins/main.tf b/terraform/jenkins/main.tf new file mode 100644 index 00000000..71cb9dbe --- /dev/null +++ b/terraform/jenkins/main.tf @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +provider "azurerm" { + features {} +} + +# read ssh-keys.yaml into local.ssh_keys +locals { + ssh_keys = yamldecode(file("../../ssh-keys.yaml")) +} + +# The resource group everything in this terraform module lives in +resource "azurerm_resource_group" "default" { + name = "ghaf-infra-jenkins" + location = "northeurope" +} + +# Create a virtual network and slice out a subnet for jenkins. +resource "azurerm_virtual_network" "vnet" { + name = "ghaf-infra-vnet" + address_space = ["10.0.0.0/16"] + location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.default.name +} + +resource "azurerm_subnet" "jenkins" { + name = "ghaf-infra-jenkins" + resource_group_name = azurerm_resource_group.default.name + virtual_network_name = azurerm_virtual_network.vnet.name + address_prefixes = ["10.0.2.0/24"] +} diff --git a/terraform/jenkins/remote_state.tf b/terraform/jenkins/remote_state.tf new file mode 100644 index 00000000..4f68d998 --- /dev/null +++ b/terraform/jenkins/remote_state.tf @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + } + # Backend for storing tfstate (see ../azure-storage) + backend "azurerm" { + resource_group_name = "ghaf-infra-storage" + storage_account_name = "ghafinfrastatestorage" + container_name = "ghaf-infra-tfstate-container" + key = "jenkins.tfstate" + } +} From 0ecd3c9405748948a909b269416714d5de26838d Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 17:10:59 +0200 Subject: [PATCH 12/88] flake.nix: bump nixpkgs to 23.11 Signed-off-by: Florian Klink --- flake.lock | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/flake.lock b/flake.lock index f42d7fd4..1a6fa8a7 100644 --- a/flake.lock +++ b/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1704318910, - "narHash": "sha256-wOIJwAsnZhM0NlFRwYJRgO4Lldh8j9viyzwQXtrbNtM=", + "lastModified": 1701775991, + "narHash": "sha256-/51DaSTzoW+wQfj5P9EnTbSxixDFjjhfnGdMKcSp+is=", "owner": "nix-community", "repo": "disko", - "rev": "aef9a509db64a081186af2dc185654d78dc8e344", + "rev": 
"f84c3684900d11cf19f530070d32d55f0ed51374", "type": "github" }, "original": { @@ -57,11 +57,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1704152458, - "narHash": "sha256-DS+dGw7SKygIWf9w4eNBUZsK+4Ug27NwEWmn2tnbycg=", + "lastModified": 1701473968, + "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "88a2cd8166694ba0b6cb374700799cec53aef527", + "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5", "type": "github" }, "original": { @@ -94,11 +94,11 @@ "utils": "utils" }, "locked": { - "lastModified": 1702912615, - "narHash": "sha256-qseX+/8drgwxOb1I3LKqBYMkmyeI5d5gmHqbZccR660=", + "lastModified": 1688488021, + "narHash": "sha256-vn6xkx4g2q/qykU+jdQYyGSPKFmGePuhGujAdmlHx1Y=", "owner": "aristanetworks", "repo": "nix-serve-ng", - "rev": "21e65cb4c62b5c9e3acc11c3c5e8197248fa46a4", + "rev": "f3931b8120b1ca663da280e11659c745e2e9ad1b", "type": "github" }, "original": { @@ -109,11 +109,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1704295289, - "narHash": "sha256-9WZDRfpMqCYL6g/HNWVvXF0hxdaAgwgIGeLYiOhmes8=", + "lastModified": 1701539137, + "narHash": "sha256-nVO/5QYpf1GwjvtpXhyxx5M3U/WN0MwBro4Lsk+9mL0=", "owner": "nixos", "repo": "nixpkgs", - "rev": "b0b2c5445c64191fd8d0b31f2b1a34e45a64547d", + "rev": "933d7dc155096e7575d207be6fb7792bc9f34f6d", "type": "github" }, "original": { @@ -126,11 +126,11 @@ "nixpkgs-lib": { "locked": { "dir": "lib", - "lastModified": 1703961334, - "narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=", + "lastModified": 1701253981, + "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9", + "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58", "type": "github" }, "original": { @@ -163,11 +163,11 @@ ] }, "locked": { - "lastModified": 1703991717, - "narHash": "sha256-XfBg2dmDJXPQEB8EdNBnzybvnhswaiAkUeeDj7fa/hQ=", + "lastModified": 1701728052, + "narHash": "sha256-7lOMc3PtW5a55vFReBJLLLOnopsoi1W7MkjJ93jPV4E=", "owner": "mic92", "repo": "sops-nix", - "rev": "cfdbaf68d00bc2f9e071f17ae77be4b27ff72fa6", + "rev": "e91ece6d2cf5a0ae729796b8f0dedceab5107c3d", "type": "github" }, "original": { @@ -198,11 +198,11 @@ ] }, "locked": { - "lastModified": 1704233915, - "narHash": "sha256-GYDC4HjyVizxnyKRbkrh1GugGp8PP3+fJuh40RPCN7k=", + "lastModified": 1701682826, + "narHash": "sha256-2lxeTUGs8Jzz/wjLgWYmZoXn60BYNRMzwHFtxNFUDLU=", "owner": "numtide", "repo": "treefmt-nix", - "rev": "e434da615ef74187ba003b529cc72f425f5d941e", + "rev": "affe7fc3f5790e1d0b5ba51bcff0f7ebe465e92d", "type": "github" }, "original": { From 5b11d0fb2a5c7325c48b776866ece3cf31a912a8 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 17:34:37 +0200 Subject: [PATCH 13/88] hosts/jenkins-controller: use networkd That way, the VM survives reboots - the non-networkd configuration seems to be quite brittle. Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 6c284a90..a6b74d16 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -18,5 +18,11 @@ # cloud-config for ssh key management. services.cloud-init.enable = true; + # Use systemd-networkd for network configuration, but keep systemd-resolved disabled. 
+  services.cloud-init.network.enable = true;
+  networking.useDHCP = false;
+  networking.useNetworkd = true;
+  services.resolved.enable = false;
+
   system.stateVersion = "23.05";
 }

From a0e48ba1f1057141286d2f4c48c470d845775740 Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Wed, 6 Dec 2023 17:45:07 +0200
Subject: [PATCH 14/88] hosts/jenkins-controller: re-enable resolved

Ideally, we'd keep systemd-resolved disabled too, but the way nixpkgs
configures cloud-init prevents it from picking up DNS settings from
elsewhere.

Signed-off-by: Florian Klink
---
 hosts/jenkins-controller/configuration.nix | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix
index a6b74d16..635da99a 100644
--- a/hosts/jenkins-controller/configuration.nix
+++ b/hosts/jenkins-controller/configuration.nix
@@ -18,11 +18,14 @@
   # cloud-config for ssh key management.
   services.cloud-init.enable = true;
 
-  # Use systemd-networkd for network configuration, but keep systemd-resolved disabled.
+  # Use systemd-networkd for network configuration
   services.cloud-init.network.enable = true;
   networking.useDHCP = false;
   networking.useNetworkd = true;
-  services.resolved.enable = false;
+  # FUTUREWORK: Ideally, we'd keep systemd-resolved disabled too,
+  # but the way nixpkgs configures cloud-init prevents it from picking up DNS
+  # settings from elsewhere.
+  # services.resolved.enable = false;
 
   system.stateVersion = "23.05";
 }

From c9d77fb9a8456c748266e5218c7ffec99b87de1e Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Wed, 6 Dec 2023 20:42:25 +0200
Subject: [PATCH 15/88] hosts/azure-common-2: init

Move the azure-specific config snippet into its own file, so we can
import it from multiple configuration.nix files.

azure-common.nix is already used for the existing machine
configurations, and as we don't want to break these, it's using this
transient name.

Signed-off-by: Florian Klink
---
 hosts/azure-common-2.nix                   | 24 ++++++++++++++++++++++
 hosts/jenkins-controller/configuration.nix | 16 +--------------
 2 files changed, 25 insertions(+), 15 deletions(-)
 create mode 100644 hosts/azure-common-2.nix

diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix
new file mode 100644
index 00000000..41ba9dfb
--- /dev/null
+++ b/hosts/azure-common-2.nix
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII)
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Profile to import for Azure VMs. Imports azure-common.nix from nixpkgs,
+# and configures cloud-init.
+{modulesPath, ...}: {
+  imports = [
+    "${modulesPath}/virtualisation/azure-config.nix"
+  ];
+
+  # enable cloud-init, so instance metadata is set accordingly and we can use
+  # cloud-config for ssh key management.
+  services.cloud-init.enable = true;
+
+  # Use systemd-networkd for network configuration.
+  services.cloud-init.network.enable = true;
+  networking.useDHCP = false;
+  networking.useNetworkd = true;
+  # FUTUREWORK: Ideally, we'd keep systemd-resolved disabled too,
+  # but the way nixpkgs configures cloud-init prevents it from picking up DNS
+  # settings from elsewhere.
+  # services.resolved.enable = false;
+}
diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix
index 635da99a..1c6f431b 100644
--- a/hosts/jenkins-controller/configuration.nix
+++ b/hosts/jenkins-controller/configuration.nix
@@ -4,28 +4,14 @@
 {
   self,
   lib,
-  modulesPath,
   ...
 
}: { imports = [ - "${modulesPath}/virtualisation/azure-config.nix" + ../azure-common-2.nix self.nixosModules.service-openssh ]; nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; - # enable cloud-init, so instance metadata is set accordingly and we can use - # cloud-config for ssh key management. - services.cloud-init.enable = true; - - # Use systemd-networkd for network configuration - services.cloud-init.network.enable = true; - networking.useDHCP = false; - networking.useNetworkd = true; - # FUTUREWORK: Ideally, we'd keep systemd-resolved disabled too, - # but the way nixpkgs configures cloud-init prevents it from picking up DNS - # settings from elsewhere. - # services.resolved.enable = false; - system.stateVersion = "23.05"; } From 66f58f3b3a967a8d25665de84f2d32768f6e4136 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 19:46:41 +0200 Subject: [PATCH 16/88] tf-modules/azurerm-linux-vm: assign identity This gives each VM a system-assigned identity, and exposes the principal ID as a module output, allowing to grant access to certain resources. Signed-off-by: Florian Klink --- tf-modules/azurerm-linux-vm/virtual_machine.tf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf index 71b988fd..338988cc 100644 --- a/tf-modules/azurerm-linux-vm/virtual_machine.tf +++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf @@ -21,6 +21,10 @@ resource "azurerm_linux_virtual_machine" "main" { network_interface_ids = [azurerm_network_interface.default.id] source_image_id = var.virtual_machine_source_image + identity { + type = "SystemAssigned" + } + # We only set custom_data here, not user_data. # user_data is more recent, and allows updates without recreating the machine, # but at least cloud-init 23.1.2 blocks boot if custom_data is not set. @@ -82,3 +86,7 @@ resource "azurerm_public_ip" "default" { location = var.location allocation_method = "Static" } + +output "virtual_machine_identity_principal_id" { + value = azurerm_linux_virtual_machine.main.identity[0].principal_id +} From 1ede4a2db039da6ac9b710cf6c4b857af6955a5e Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 6 Dec 2023 20:39:10 +0200 Subject: [PATCH 17/88] hosts: add binary-cache config This exposes a read-only HTTP webserver for the contents in the storage container. `rclone serve http` takes care of exposing the storage container over HTTP. We disallow listing (by only allowing access to certain paths), and expose it over HTTP(S) with auto-ssl via caddy. This will work with whatever domain we route to it, so it's not part of the configuration. Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 76 ++++++++++++++++++++++++++++ hosts/default.nix | 4 ++ 2 files changed, 80 insertions(+) create mode 100644 hosts/binary-cache/configuration.nix diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix new file mode 100644 index 00000000..34dd594e --- /dev/null +++ b/hosts/binary-cache/configuration.nix @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 +{ + self, + pkgs, + lib, + ... +}: { + imports = [ + ../azure-common-2.nix + self.nixosModules.service-openssh + ]; + + nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; + + # Run a read-only HTTP webserver proxying to the "binary-cache-v1" storage + # container at a unix socket. 
+ # This relies on IAM to grant access to the storage container. + systemd.services.rclone-http = { + after = ["network.target"]; + requires = ["network.target"]; + wantedBy = ["multi-user.target"]; + serviceConfig = { + Type = "notify"; + Restart = "always"; + RestartSec = 2; + DynamicUser = true; + RuntimeDirectory = "rclone-http"; + # FUTUREWORK: set AZURE_STORAGE_ACCOUNT_NAME and storage container name through EnvironmentFile + ExecStart = + "${pkgs.rclone}/bin/rclone " + + "serve http " + + "--azureblob-env-auth " + + "--azureblob-account ghafbinarycache " + + "--read-only " + + "--addr unix://%t/rclone-http/socket " + + ":azureblob:binary-cache-v1"; + # On successful startup, grant caddy write permissions to the socket. + ExecStartPost = "${pkgs.acl.bin}/bin/setfacl -m u:caddy:rw %t/rclone-http/socket"; + }; + }; + + # Expose the rclone-http unix socket over a HTTPS, limiting to certain + # keys only, disallowing listing too. + services.caddy = { + enable = true; + configFile = pkgs.writeTextDir "Caddyfile" '' + # Disable the admin API, we don't want to reconfigure Caddy at runtime. + { + admin off + } + + # Proxy a subset of requests to rclone. + * { + handle /nix-cache-info { + reverse_proxy unix///run/rclone-http/socket + } + handle /*.narinfo { + reverse_proxy unix///run/rclone-http/socket + } + handle /nar/*.nar { + reverse_proxy unix///run/rclone-http/socket + } + handle /nar/*.nar.* { + reverse_proxy unix///run/rclone-http/socket + } + } + ''; + }; + + # Expose the HTTP and HTTPS port. + networking.firewall.allowedTCPPorts = [80 443]; + + system.stateVersion = "23.05"; +} diff --git a/hosts/default.nix b/hosts/default.nix index a825a844..fcc2057a 100644 --- a/hosts/default.nix +++ b/hosts/default.nix @@ -28,6 +28,10 @@ inherit specialArgs; modules = [./binarycache/configuration.nix]; }; + binary-cache = lib.nixosSystem { + inherit specialArgs; + modules = [./binary-cache/configuration.nix]; + }; ficolobuild3 = lib.nixosSystem { inherit specialArgs; modules = [./ficolobuild/build3.nix]; From 9a96c0dd4056336fc427c235080759f07652bd2f Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 7 Dec 2023 09:17:47 +0200 Subject: [PATCH 18/88] hosts/binary-cache: apply caddy workaround This works around https://github.com/NixOS/nixpkgs/issues/272532, we can revert this once https://github.com/NixOS/nixpkgs/pull/272617 has landed here. Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index 34dd594e..5f3f7255 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 { self, + config, pkgs, lib, ... @@ -69,6 +70,14 @@ ''; }; + # workaround for https://github.com/NixOS/nixpkgs/issues/272532 + # FUTUREWORK: rebase once https://github.com/NixOS/nixpkgs/pull/272617 landed + services.caddy.enableReload = false; + systemd.services.caddy.serviceConfig.ExecStart = lib.mkForce [ + "" + "${pkgs.caddy}/bin/caddy run --environ --config ${config.services.caddy.configFile}/Caddyfile" + ]; + # Expose the HTTP and HTTPS port. networking.firewall.allowedTCPPorts = [80 443]; From 22f5054f8eeb63ac3ba2407a09f6e06923710f6a Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 7 Dec 2023 12:33:34 +0200 Subject: [PATCH 19/88] hosts/binary-cache: hardcode domain for now We don't want to blindly issue certs for all domains, but make this configurable. 
This should be config coming from the environment, via cloud-init.

Signed-off-by: Florian Klink
---
 hosts/binary-cache/configuration.nix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix
index 5f3f7255..48234b71 100644
--- a/hosts/binary-cache/configuration.nix
+++ b/hosts/binary-cache/configuration.nix
@@ -44,6 +44,7 @@
 
   # Expose the rclone-http unix socket over a HTTPS, limiting to certain
   # keys only, disallowing listing too.
+  # TODO: use https://caddyserver.com/docs/caddyfile-tutorial#environment-variables for domain
   services.caddy = {
     enable = true;
     configFile = pkgs.writeTextDir "Caddyfile" ''
@@ -53,7 +54,7 @@
       }
 
       # Proxy a subset of requests to rclone.
-      * {
+      ghaf-binary-cache.northeurope.cloudapp.azure.com {
         handle /nix-cache-info {
           reverse_proxy unix///run/rclone-http/socket
         }

From a3b4024365c93387e6cf79c145183549e8b4fd2a Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Thu, 7 Dec 2023 12:35:06 +0200
Subject: [PATCH 20/88] tf-modules/azurerm-linux-vm: move out security group config

Define this for each machine outside the VM module, and describe
everything in a single security group.

Attaching multiple security groups caused confusing duplicate errors;
this might be a Terraform Azure provider bug.

Signed-off-by: Florian Klink
---
 terraform/jenkins/jenkins_controller.tf       | 23 +++++++++++++++
 .../azurerm-linux-vm/virtual_machine.tf       | 29 +++++--------------
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf
index 2d0e8d15..0cecf2c8 100644
--- a/terraform/jenkins/jenkins_controller.tf
+++ b/terraform/jenkins/jenkins_controller.tf
@@ -41,3 +41,26 @@ module "jenkins_controller_vm" {
 
   subnet_id = azurerm_subnet.jenkins.id
 }
+
+resource "azurerm_network_interface_security_group_association" "jenkins_controller_vm" {
+  network_interface_id      = module.jenkins_controller_vm.virtual_machine_network_interface_id
+  network_security_group_id = azurerm_network_security_group.jenkins_controller_vm.id
+}
+
+resource "azurerm_network_security_group" "jenkins_controller_vm" {
+  name                = "jenkins-controller-vm"
+  resource_group_name = azurerm_resource_group.default.name
+  location            = azurerm_resource_group.default.location
+
+  security_rule {
+    name                       = "AllowSSHInbound"
+    priority                   = 400
+    direction                  = "Inbound"
+    access                     = "Allow"
+    protocol                   = "Tcp"
+    source_port_range          = "*"
+    destination_port_ranges    = [22]
+    source_address_prefix      = "*"
+    destination_address_prefix = "*"
+  }
+}
diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf
index 338988cc..321168b9 100644
--- a/tf-modules/azurerm-linux-vm/virtual_machine.tf
+++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf
@@ -45,27 +45,6 @@ resource "azurerm_linux_virtual_machine" "main" {
   }
 }
 
-resource "azurerm_network_security_group" "ssh_inbound" {
-  name                = "${var.virtual_machine_name}-nsg-ssh-inbound"
-  resource_group_name = var.resource_group_name
-  location            = var.location
-  security_rule {
-    name                       = "AllowSSHInbound"
-    priority                   = 300
-    direction                  = "Inbound"
-    access                     = "Allow"
-    protocol                   = "Tcp"
-    source_port_range          = "*"
-    destination_port_range     = "22"
-    source_address_prefix      = "*"
-    destination_address_prefix = "*"
-  }
-}
-resource "azurerm_network_interface_security_group_association" "apply_ssh_inbound" {
-  network_interface_id      = azurerm_network_interface.default.id
-  network_security_group_id = azurerm_network_security_group.ssh_inbound.id
-}
-
 resource "azurerm_network_interface" "default" {
   name                = "${var.virtual_machine_name}-nic"
   resource_group_name = var.resource_group_name
   location            = var.location
@@ -87,6 +66,14 @@ resource "azurerm_public_ip" "default" {
   allocation_method = "Static"
 }
 
+output "virtual_machine_id" {
+  value = azurerm_linux_virtual_machine.main.id
+}
+
 output "virtual_machine_identity_principal_id" {
   value = azurerm_linux_virtual_machine.main.identity[0].principal_id
 }
+
+output "virtual_machine_network_interface_id" {
+  value = azurerm_network_interface.default.id
+}

From 2f87e7ff670a0cf13a126b054804cca396cdba0e Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Thu, 7 Dec 2023 12:39:49 +0200
Subject: [PATCH 21/88] hosts/azure-common-2: add filesystem tools

This adds filesystem-related tools to the $PATH of cloud-init, so it
can format disks with its disk_setup (and fs_setup) config keys.

This will be used to format data volumes attached to VMs.

Signed-off-by: Florian Klink
---
 hosts/azure-common-2.nix | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix
index 41ba9dfb..6a07fc74 100644
--- a/hosts/azure-common-2.nix
+++ b/hosts/azure-common-2.nix
@@ -21,4 +21,9 @@
   # but the way nixpkgs configures cloud-init prevents it from picking up DNS
   # settings from elsewhere.
   # services.resolved.enable = false;
+
+  # Add filesystem-related tools to cloud-init's path, so it can format data disks.
+  services.cloud-init.btrfs.enable = true;
+  services.cloud-init.ext4.enable = true;
+  services.cloud-init.xfs.enable = true;
 }

From 07785bcf221bea5a1ddf7581b66b306db1c44282 Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Thu, 7 Dec 2023 17:44:27 +0200
Subject: [PATCH 22/88] azure-common: support timeout in disk_setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to use cloud-init to format and mount data volumes in Azure;
we can't use systemd for it.

Due to https://github.com/hashicorp/terraform-provider-azurerm/issues/6117,
disks in Azure get attached late at boot, so any
dev-disk-by-….device units created via systemd-fstab-generator might
not exist yet at the time the graph for multi-user.target is created,
causing systemd to fail starting downstream services due to a missing
dependency.

Once the volume is attached, the .device unit pops up via udev, and
then a manual restart of services depending on data disks would work,
but it's messy.

Letting cloud-init take care of data disk mounting (and formatting) is
the right choice; that way systemd doesn't need to do any dependency
tracking of it.

Signed-off-by: Florian Klink
---
 hosts/azure-common-2.nix | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix
index 6a07fc74..3e76ac83 100644
--- a/hosts/azure-common-2.nix
+++ b/hosts/azure-common-2.nix
@@ -4,11 +4,32 @@
 #
 # Profile to import for Azure VMs. Imports azure-common.nix from nixpkgs,
 # and configures cloud-init.
-{modulesPath, ...}: {
+{
+  modulesPath,
+  pkgs,
+  ...
+}: { imports = [ "${modulesPath}/virtualisation/azure-config.nix" ]; + nixpkgs.overlays = [ + (_self: super: { + cloud-init = super.cloud-init.overrideAttrs (old: { + patches = + old.patches + or [] + ++ [ + # Add support for timeout in disk_setup: https://github.com/canonical/cloud-init/pull/4673 + (pkgs.fetchpatch { + url = "https://github.com/canonical/cloud-init/pull/4673/commits/9b2e3dc907dc06d0a2abdaae6f0b1f0612c5c5dc.patch"; + hash = "sha256-KAd+4YT+dgzIoEq5qZj6y4peclIb3rvnuY6QIQObAiY="; + }) + ]; + }); + }) + ]; + # enable cloud-init, so instance metadata is set accordingly and we can use # cloud-config for ssh key management. services.cloud-init.enable = true; From 80b928e5f6833ac18dd878395e38769a9b493221 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 7 Dec 2023 12:37:30 +0200 Subject: [PATCH 23/88] terraform/jenkins: add binary cache storage This adds the ghafbinarycache storage account, and a binary-cache-v1 storage container inside of it. It's used to serve artifacts from (via the binary-cache) VM, and Nix build artifacts are also uploaded to it. Signed-off-by: Florian Klink --- terraform/jenkins/binary_cache_storage.tf | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 terraform/jenkins/binary_cache_storage.tf diff --git a/terraform/jenkins/binary_cache_storage.tf b/terraform/jenkins/binary_cache_storage.tf new file mode 100644 index 00000000..2c490273 --- /dev/null +++ b/terraform/jenkins/binary_cache_storage.tf @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +# Create the storage account and storage container +resource "azurerm_storage_account" "binary_cache" { + name = "ghafbinarycache" + resource_group_name = azurerm_resource_group.default.name # TODO: separate resource group? + location = azurerm_resource_group.default.location + account_tier = "Standard" + account_replication_type = "LRS" + allow_nested_items_to_be_public = false +} + +resource "azurerm_storage_container" "binary_cache_1" { + name = "binary-cache-v1" + storage_account_name = azurerm_storage_account.binary_cache.name + container_access_type = "private" +} From 2550dbd25d863f93d95bc3b0629029cfa7a204b5 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 7 Dec 2023 12:38:37 +0200 Subject: [PATCH 24/88] terraform/jenkins: deploy binary cache vm This deploys the VM defined at binary-cache. Attaching the data disks is still a bit messy (requires one reboot, or manual reverse proxy restart). Fixing this requires some more debugging. Signed-off-by: Florian Klink --- terraform/jenkins/binary_cache.tf | 115 ++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 terraform/jenkins/binary_cache.tf diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf new file mode 100644 index 00000000..8b867699 --- /dev/null +++ b/terraform/jenkins/binary_cache.tf @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +module "binary_cache_image" { + source = "../../tf-modules/azurerm-nix-vm-image" + + nix_attrpath = "outputs.nixosConfigurations.binary-cache.config.system.build.azureImage" + nix_entrypoint = "${path.module}/../.." 
+ + + name = "binary-cache" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + storage_account_name = azurerm_storage_account.vm_images.name + storage_container_name = azurerm_storage_container.vm_images.name +} + +module "binary_cache_vm" { + source = "../../tf-modules/azurerm-linux-vm" + + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + virtual_machine_name = "ghaf-binary-cache" + virtual_machine_size = "Standard_D1_v2" + virtual_machine_source_image = module.binary_cache_image.image_id + + virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ + users = [ + for user in toset(["bmg", "flokli", "hrosten"]) : { + name = user + sudo = "ALL=(ALL) NOPASSWD:ALL" + ssh_authorized_keys = local.ssh_keys[user] + } + ] + # mount /dev/disk/by-lun/10 to /var/lib/caddy + disk_setup = { + "/dev/disk/by-lun/10" = { + layout = false # don't partition + timeout = 60 # wait for device to appear + } + } + fs_setup = [ + { + filesystem = "ext4" + partition = "auto" + device = "/dev/disk/by-lun/10" + label = "caddy" + } + ] + mounts = [ + ["/dev/disk/by-label/caddy", "/var/lib/caddy"] + ] + })]) + + subnet_id = azurerm_subnet.binary_cache.id +} + +resource "azurerm_subnet" "binary_cache" { + name = "ghaf-infra-binary-cache" + resource_group_name = azurerm_resource_group.default.name + virtual_network_name = azurerm_virtual_network.vnet.name + address_prefixes = ["10.0.3.0/28"] +} + +# Allow inbound HTTP(S) +resource "azurerm_network_interface_security_group_association" "binary_cache_vm" { + network_interface_id = module.binary_cache_vm.virtual_machine_network_interface_id + network_security_group_id = azurerm_network_security_group.binary_cache_vm.id +} + +resource "azurerm_network_security_group" "binary_cache_vm" { + name = "binary-cache-vm" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + security_rule { + name = "AllowSSHHTTPSInbound" + priority = 400 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_ranges = [22, 80, 443] + source_address_prefix = "*" + destination_address_prefix = "*" + } +} + +# Allow the VM to read from the binary cache bucket +resource "azurerm_role_assignment" "binary_cache_access_storage" { + scope = azurerm_storage_container.binary_cache_1.resource_manager_id + role_definition_name = "Storage Blob Data Reader" + principal_id = module.binary_cache_vm.virtual_machine_identity_principal_id +} + +# Create a data disk +resource "azurerm_managed_disk" "binary_cache_caddy_state" { + name = "binary-cache-vm-caddy-state" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + storage_account_type = "Standard_LRS" + create_option = "Empty" + disk_size_gb = 1 +} + +# Attach to the VM +resource "azurerm_virtual_machine_data_disk_attachment" "binary_cache_vm_caddy_state" { + managed_disk_id = azurerm_managed_disk.binary_cache_caddy_state.id + virtual_machine_id = module.binary_cache_vm.virtual_machine_id + lun = "10" + caching = "None" +} From cfa58d94143f557b75d74722ee780fa81e799350 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 7 Dec 2023 20:43:55 +0200 Subject: [PATCH 25/88] hosts/jenkins-controller: give jenkins state Signed-off-by: Florian Klink --- flake.lock | 34 ++++++++++++++------ flake.nix | 3 +- hosts/ghafhydra/configuration.nix | 1 - 
hosts/jenkins-controller/configuration.nix | 13 ++++++++ hosts/monitoring/configuration.nix | 1 - services/binary-cache/default.nix | 8 ++++- terraform/jenkins/jenkins_controller.tf | 36 ++++++++++++++++++++++ 7 files changed, 82 insertions(+), 14 deletions(-) diff --git a/flake.lock b/flake.lock index 1a6fa8a7..9572253e 100644 --- a/flake.lock +++ b/flake.lock @@ -88,9 +88,7 @@ "nix-serve-ng": { "inputs": { "flake-compat": "flake-compat_2", - "nixpkgs": [ - "nixpkgs" - ], + "nixpkgs": "nixpkgs", "utils": "utils" }, "locked": { @@ -109,16 +107,16 @@ }, "nixpkgs": { "locked": { - "lastModified": 1701539137, - "narHash": "sha256-nVO/5QYpf1GwjvtpXhyxx5M3U/WN0MwBro4Lsk+9mL0=", - "owner": "nixos", + "lastModified": 1688403656, + "narHash": "sha256-zmNai3dKWUCKpKubPWsEJ1Q7od96KebWVDJNCnk+fr0=", + "owner": "NixOS", "repo": "nixpkgs", - "rev": "933d7dc155096e7575d207be6fb7792bc9f34f6d", + "rev": "453da3c28f7a95374b73d1f3fd665dd40e6049e9", "type": "github" }, "original": { - "owner": "nixos", - "ref": "nixos-23.11", + "owner": "NixOS", + "ref": "nixpkgs-unstable", "repo": "nixpkgs", "type": "github" } @@ -141,6 +139,22 @@ "type": "github" } }, + "nixpkgs_2": { + "locked": { + "lastModified": 1701539137, + "narHash": "sha256-nVO/5QYpf1GwjvtpXhyxx5M3U/WN0MwBro4Lsk+9mL0=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "933d7dc155096e7575d207be6fb7792bc9f34f6d", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-23.11", + "repo": "nixpkgs", + "type": "github" + } + }, "root": { "inputs": { "disko": "disko", @@ -148,7 +162,7 @@ "flake-parts": "flake-parts", "flake-root": "flake-root", "nix-serve-ng": "nix-serve-ng", - "nixpkgs": "nixpkgs", + "nixpkgs": "nixpkgs_2", "sops-nix": "sops-nix", "treefmt-nix": "treefmt-nix" } diff --git a/flake.nix b/flake.nix index 15a30b87..75e918a0 100644 --- a/flake.nix +++ b/flake.nix @@ -19,7 +19,8 @@ # Binary cache with nix-serve-ng nix-serve-ng = { url = "github:aristanetworks/nix-serve-ng"; - inputs.nixpkgs.follows = "nixpkgs"; + # Broken with 23.11, base32 misses text >=2.0 && <2.1 + # inputs.nixpkgs.follows = "nixpkgs"; }; # Disko for disk partitioning disko = { diff --git a/hosts/ghafhydra/configuration.nix b/hosts/ghafhydra/configuration.nix index 42dd9176..f4b0afdb 100644 --- a/hosts/ghafhydra/configuration.nix +++ b/hosts/ghafhydra/configuration.nix @@ -16,7 +16,6 @@ imports = lib.flatten [ (with inputs; [ - nix-serve-ng.nixosModules.default sops-nix.nixosModules.sops disko.nixosModules.disko ]) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 1c6f431b..3d6c4620 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -11,6 +11,19 @@ self.nixosModules.service-openssh ]; + services.jenkins = { + enable = true; + listenAddress = "localhost"; + port = 8080; + withCLI = true; + }; + + # set StateDirectory=jenkins, so state volume has the right permissions + # https://github.com/NixOS/nixpkgs/pull/272679 + systemd.services.jenkins.serviceConfig.StateDirectory = "jenkins"; + + # TODO: deploy reverse proxy, sort out authentication (SSO?) 
+ nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; system.stateVersion = "23.05"; diff --git a/hosts/monitoring/configuration.nix b/hosts/monitoring/configuration.nix index 3d5dd742..6ac6a32f 100644 --- a/hosts/monitoring/configuration.nix +++ b/hosts/monitoring/configuration.nix @@ -18,7 +18,6 @@ in { imports = lib.flatten [ (with inputs; [ - nix-serve-ng.nixosModules.default sops-nix.nixosModules.sops disko.nixosModules.disko ]) diff --git a/services/binary-cache/default.nix b/services/binary-cache/default.nix index d8913d2f..17bee84b 100644 --- a/services/binary-cache/default.nix +++ b/services/binary-cache/default.nix @@ -1,10 +1,16 @@ # SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) # # SPDX-License-Identifier: Apache-2.0 -{config, ...}: { +{ + config, + inputs, + pkgs, + ... +}: { services = { nix-serve = { enable = true; + package = inputs.nix-serve-ng.packages.${pkgs.system}.default; secretKeyFile = config.sops.secrets.cache-sig-key.path; }; }; diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 0cecf2c8..0d53b914 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -37,6 +37,24 @@ module "jenkins_controller_vm" { ssh_authorized_keys = local.ssh_keys[user] } ] + # mount /dev/disk/by-lun/10 to /var/lib/jenkins + disk_setup = { + "/dev/disk/by-lun/10" = { + layout = false # don't partition + timeout = 60 # wait for device to appear + } + } + fs_setup = [ + { + filesystem = "ext4" + partition = "auto" + device = "/dev/disk/by-lun/10" + label = "jenkins" + } + ] + mounts = [ + ["/dev/disk/by-label/jenkins", "/var/lib/jenkins"] + ] })]) subnet_id = azurerm_subnet.jenkins.id @@ -64,3 +82,21 @@ resource "azurerm_network_security_group" "jenkins_controller_vm" { destination_address_prefix = "*" } } + +# Create a data disk +resource "azurerm_managed_disk" "jenkins_controller_jenkins_state" { + name = "jenkins-controller-vm-jenkins-state" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + storage_account_type = "Standard_LRS" + create_option = "Empty" + disk_size_gb = 10 +} + +# Attach to the VM +resource "azurerm_virtual_machine_data_disk_attachment" "jenkins_controller_vm_jenkins_state" { + managed_disk_id = azurerm_managed_disk.jenkins_controller_jenkins_state.id + virtual_machine_id = module.jenkins_controller_vm.virtual_machine_id + lun = "10" + caching = "None" +} From dd6a218311b70f4d753d31c0bc5fa83042050aa5 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Fri, 8 Dec 2023 16:46:21 +0200 Subject: [PATCH 26/88] docs, hosts: drop more nix-serve-ng module usages The service-binary-cache module is all the specific hosts need. 
Signed-off-by: Florian Klink --- docs/adapting-to-new-environments.md | 1 - hosts/binarycache/configuration.nix | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/adapting-to-new-environments.md b/docs/adapting-to-new-environments.md index 81fb3d30..eba9acd2 100644 --- a/docs/adapting-to-new-environments.md +++ b/docs/adapting-to-new-environments.md @@ -164,7 +164,6 @@ $ cat hosts/mytarget/configuration.nix # Define the services you want to run on your target, as well as the users # who can access the target with ssh: imports = [ - inputs.nix-serve-ng.nixosModules.default inputs.sops-nix.nixosModules.sops inputs.disko.nixosModules.disko ../generic-disk-config.nix diff --git a/hosts/binarycache/configuration.nix b/hosts/binarycache/configuration.nix index 7e66a000..72ce6341 100644 --- a/hosts/binarycache/configuration.nix +++ b/hosts/binarycache/configuration.nix @@ -13,7 +13,6 @@ imports = lib.flatten [ (with inputs; [ - nix-serve-ng.nixosModules.default sops-nix.nixosModules.sops disko.nixosModules.disko ]) From 4f59ca7dbc5430b6d6305fee76bbd4d0123f2f61 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Sat, 9 Dec 2023 09:46:35 +0200 Subject: [PATCH 27/88] hosts: explicitly wait for cloud-init.service Otherwise, cloud-init.service might still be running while we start up services expecting the mount to happen. Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 4 ++++ hosts/jenkins-controller/configuration.nix | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index 48234b71..1f258d5b 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -79,6 +79,10 @@ "${pkgs.caddy}/bin/caddy run --environ --config ${config.services.caddy.configFile}/Caddyfile" ]; + # Wait for cloud-init mounting before we start caddy. + systemd.services.caddy.after = ["cloud-init.service"]; + systemd.services.caddy.requires = ["cloud-init.service"]; + # Expose the HTTP and HTTPS port. networking.firewall.allowedTCPPorts = [80 443]; diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 3d6c4620..b71c68e9 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -22,6 +22,10 @@ # https://github.com/NixOS/nixpkgs/pull/272679 systemd.services.jenkins.serviceConfig.StateDirectory = "jenkins"; + # Wait for cloud-init mounting before we start jenkins. + systemd.services.jenkins.after = ["cloud-init.service"]; + systemd.services.jenkins.requires = ["cloud-init.service"]; + # TODO: deploy reverse proxy, sort out authentication (SSO?) nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; From 690ef2bb703fcfa711295048b9282a83354f12ff Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 13 Dec 2023 13:37:43 +0200 Subject: [PATCH 28/88] binary-cache: configure params with cloudinit Configure the domain and storage account name with cloud-init. This allows keeping the same NixOS image across multiple deployments of this image, serving another bucket at another domain. Also, switch to listening on port 443 only, caddy can use the TLS-ALPN-01 challenge just fine. 
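For reference, once cloud-init has processed the write_files below, the
rendered drop-in on the VM should look roughly like this (a sketch, assuming
the unit file paths land exactly as written):

    $ cat /run/systemd/system/caddy.service.d/cloud-init.conf
    [Service]
    Environment=SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com

Caddy then substitutes {$SITE_ADDRESS} from its environment when parsing the
Caddyfile.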
Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 8 +++----- terraform/jenkins/binary_cache.tf | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index 1f258d5b..a3f56f56 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -28,12 +28,10 @@ RestartSec = 2; DynamicUser = true; RuntimeDirectory = "rclone-http"; - # FUTUREWORK: set AZURE_STORAGE_ACCOUNT_NAME and storage container name through EnvironmentFile ExecStart = "${pkgs.rclone}/bin/rclone " + "serve http " + "--azureblob-env-auth " - + "--azureblob-account ghafbinarycache " + "--read-only " + "--addr unix://%t/rclone-http/socket " + ":azureblob:binary-cache-v1"; @@ -54,7 +52,7 @@ } # Proxy a subset of requests to rclone. - ghaf-binary-cache.northeurope.cloudapp.azure.com { + https://{$SITE_ADDRESS} { handle /nix-cache-info { reverse_proxy unix///run/rclone-http/socket } @@ -83,8 +81,8 @@ systemd.services.caddy.after = ["cloud-init.service"]; systemd.services.caddy.requires = ["cloud-init.service"]; - # Expose the HTTP and HTTPS port. - networking.firewall.allowedTCPPorts = [80 443]; + # Expose the HTTPS port. No need for HTTP, as caddy can use TLS-ALPN-01. + networking.firewall.allowedTCPPorts = [443]; system.stateVersion = "23.05"; } diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index 8b867699..dbce90c6 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -53,6 +53,22 @@ module "binary_cache_vm" { mounts = [ ["/dev/disk/by-label/caddy", "/var/lib/caddy"] ] + # TODO: this should be EnvironmentFile, so we don't need to restart + write_files = [ + { + content = "[Service]\nEnvironment=AZURE_STORAGE_ACCOUNT_NAME=ghafbinarycache", + "path" = "/run/systemd/system/rclone-http.service.d/cloud-init.conf" + }, + { + content = "[Service]\nEnvironment=SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com", + "path" = "/run/systemd/system/caddy.service.d/cloud-init.conf" + }, + ], + runcmd = [ + "systemctl daemon-reload", # pick up drop-ins + "systemctl restart caddy.service", + "systemctl restart rclone-http.service" + ] })]) subnet_id = azurerm_subnet.binary_cache.id From 4c46f50f8447b3ea9b27405c1cdd5893476b4934 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 13 Dec 2023 13:39:41 +0200 Subject: [PATCH 29/88] terraform/jenkins: don't listen on port 80 This should use tls-alpn-01 on port 443 just fine. Signed-off-by: Florian Klink --- terraform/jenkins/binary_cache.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index dbce90c6..f2d79714 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -99,7 +99,7 @@ resource "azurerm_network_security_group" "binary_cache_vm" { access = "Allow" protocol = "Tcp" source_port_range = "*" - destination_port_ranges = [22, 80, 443] + destination_port_ranges = [22, 443] source_address_prefix = "*" destination_address_prefix = "*" } From 51860e1ad3b119d2301eb5825e2abf2d48456009 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 13 Dec 2023 15:13:17 +0200 Subject: [PATCH 30/88] hosts: use x-systemd.device-timeout=5min option Apparently https://github.com/canonical/cloud-init/pull/4673 and more hacks are not needed, we can simply ramp up the timeout that systemd is willing to wait for the .device unit to appear. 
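Roughly, the fileSystems entries below render to an fstab line like the
following (a sketch; the exact fields NixOS generates may differ):

    $ grep jenkins /etc/fstab
    /dev/disk/by-lun/10 /var/lib/jenkins ext4 x-systemd.device-timeout=5min,x-systemd.makefs,x-systemd.growfs 0 2

With that option, systemd waits up to five minutes for the backing .device
unit to appear instead of the default 90 seconds.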
Signed-off-by: Florian Klink --- hosts/azure-common-2.nix | 28 +--------------------- hosts/binary-cache/configuration.nix | 16 +++++++++++++ hosts/jenkins-controller/configuration.nix | 21 ++++++++++++---- terraform/jenkins/binary_cache.tf | 18 -------------- terraform/jenkins/jenkins_controller.tf | 18 -------------- 5 files changed, 34 insertions(+), 67 deletions(-) diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix index 3e76ac83..41ba9dfb 100644 --- a/hosts/azure-common-2.nix +++ b/hosts/azure-common-2.nix @@ -4,32 +4,11 @@ # # Profile to import for Azure VMs. Imports azure-common.nix from nixpkgs, # and configures cloud-init. -{ - modulesPath, - pkgs, - ... -}: { +{modulesPath, ...}: { imports = [ "${modulesPath}/virtualisation/azure-config.nix" ]; - nixpkgs.overlays = [ - (_self: super: { - cloud-init = super.cloud-init.overrideAttrs (old: { - patches = - old.patches - or [] - ++ [ - # Add support for timeout in disk_setup: https://github.com/canonical/cloud-init/pull/4673 - (pkgs.fetchpatch { - url = "https://github.com/canonical/cloud-init/pull/4673/commits/9b2e3dc907dc06d0a2abdaae6f0b1f0612c5c5dc.patch"; - hash = "sha256-KAd+4YT+dgzIoEq5qZj6y4peclIb3rvnuY6QIQObAiY="; - }) - ]; - }); - }) - ]; - # enable cloud-init, so instance metadata is set accordingly and we can use # cloud-config for ssh key management. services.cloud-init.enable = true; @@ -42,9 +21,4 @@ # but the way nixpkgs configures cloud-init prevents it from picking up DNS # settings from elsewhere. # services.resolved.enable = false; - - # Add filesystem-related tools to cloud-inits path, so it can format data disks. - services.cloud-init.btrfs.enable = true; - services.cloud-init.ext4.enable = true; - services.cloud-init.xfs.enable = true; } diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index a3f56f56..018b4508 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -15,6 +15,22 @@ nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; + # Configure /var/lib/caddy in /etc/fstab. + # Due to an implicit RequiresMountsFor=$state-dir, systemd + # will block starting the service until this mounted. + fileSystems."/var/lib/caddy" = { + device = "/dev/disk/by-lun/10"; + fsType = "ext4"; + options = [ + # Due to https://github.com/hashicorp/terraform-provider-azurerm/issues/6117 + # disks get attached later during boot. + # The default of 90s doesn't seem to be sufficient. + "x-systemd.device-timeout=5min" + "x-systemd.makefs" + "x-systemd.growfs" + ]; + }; + # Run a read-only HTTP webserver proxying to the "binary-cache-v1" storage # container at a unix socket. # This relies on IAM to grant access to the storage container. diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index b71c68e9..4d0c7566 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -11,6 +11,22 @@ self.nixosModules.service-openssh ]; + # Configure /var/lib/jenkins in /etc/fstab. + # Due to an implicit RequiresMountsFor=$state-dir, systemd + # will block starting the service until this mounted. + fileSystems."/var/lib/jenkins" = { + device = "/dev/disk/by-lun/10"; + fsType = "ext4"; + options = [ + # Due to https://github.com/hashicorp/terraform-provider-azurerm/issues/6117 + # disks get attached later during boot. + # The default of 90s doesn't seem to be sufficient. 
+ "x-systemd.device-timeout=5min" + "x-systemd.makefs" + "x-systemd.growfs" + ]; + }; + services.jenkins = { enable = true; listenAddress = "localhost"; @@ -19,13 +35,10 @@ }; # set StateDirectory=jenkins, so state volume has the right permissions + # and we wait on the mountpoint to appear. # https://github.com/NixOS/nixpkgs/pull/272679 systemd.services.jenkins.serviceConfig.StateDirectory = "jenkins"; - # Wait for cloud-init mounting before we start jenkins. - systemd.services.jenkins.after = ["cloud-init.service"]; - systemd.services.jenkins.requires = ["cloud-init.service"]; - # TODO: deploy reverse proxy, sort out authentication (SSO?) nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index f2d79714..0cc5c285 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -35,24 +35,6 @@ module "binary_cache_vm" { ssh_authorized_keys = local.ssh_keys[user] } ] - # mount /dev/disk/by-lun/10 to /var/lib/caddy - disk_setup = { - "/dev/disk/by-lun/10" = { - layout = false # don't partition - timeout = 60 # wait for device to appear - } - } - fs_setup = [ - { - filesystem = "ext4" - partition = "auto" - device = "/dev/disk/by-lun/10" - label = "caddy" - } - ] - mounts = [ - ["/dev/disk/by-label/caddy", "/var/lib/caddy"] - ] # TODO: this should be EnvironmentFile, so we don't need to restart write_files = [ { diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 0d53b914..1b780f5b 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -37,24 +37,6 @@ module "jenkins_controller_vm" { ssh_authorized_keys = local.ssh_keys[user] } ] - # mount /dev/disk/by-lun/10 to /var/lib/jenkins - disk_setup = { - "/dev/disk/by-lun/10" = { - layout = false # don't partition - timeout = 60 # wait for device to appear - } - } - fs_setup = [ - { - filesystem = "ext4" - partition = "auto" - device = "/dev/disk/by-lun/10" - label = "jenkins" - } - ] - mounts = [ - ["/dev/disk/by-label/jenkins", "/var/lib/jenkins"] - ] })]) subnet_id = azurerm_subnet.jenkins.id From d34a3ff6b135db392816e6ce29b893802af5705d Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 13 Dec 2023 17:49:56 +0200 Subject: [PATCH 31/88] binary-cache: move to EnvironmentFile= Load the environment via EnvironmentFile=. This avoids the service crashing the first time, because the initial transaction didn't see the service drop-ins created by cloud-init yet. Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 2 ++ terraform/jenkins/binary_cache.tf | 15 +++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index 018b4508..4f882347 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -53,6 +53,7 @@ + ":azureblob:binary-cache-v1"; # On successful startup, grant caddy write permissions to the socket. ExecStartPost = "${pkgs.acl.bin}/bin/setfacl -m u:caddy:rw %t/rclone-http/socket"; + EnvironmentFile = "/run/rclone-http.env"; }; }; @@ -92,6 +93,7 @@ "" "${pkgs.caddy}/bin/caddy run --environ --config ${config.services.caddy.configFile}/Caddyfile" ]; + systemd.services.caddy.serviceConfig.EnvironmentFile = "/run/caddy.env"; # Wait for cloud-init mounting before we start caddy. 
systemd.services.caddy.after = ["cloud-init.service"]; diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index 0cc5c285..fa1a6280 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -35,22 +35,17 @@ module "binary_cache_vm" { ssh_authorized_keys = local.ssh_keys[user] } ] - # TODO: this should be EnvironmentFile, so we don't need to restart + # See corresponding EnvironmentFile= directives in services write_files = [ { - content = "[Service]\nEnvironment=AZURE_STORAGE_ACCOUNT_NAME=ghafbinarycache", - "path" = "/run/systemd/system/rclone-http.service.d/cloud-init.conf" + content = "AZURE_STORAGE_ACCOUNT_NAME=ghafbinarycache", + "path" = "/run/rclone-http.env" }, { - content = "[Service]\nEnvironment=SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com", - "path" = "/run/systemd/system/caddy.service.d/cloud-init.conf" + content = "SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com", + "path" = "/run/caddy.env" }, ], - runcmd = [ - "systemctl daemon-reload", # pick up drop-ins - "systemctl restart caddy.service", - "systemctl restart rclone-http.service" - ] })]) subnet_id = azurerm_subnet.binary_cache.id From e5fa53e2312a991eb171ba183e84911893797c65 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 13 Dec 2023 20:02:15 +0200 Subject: [PATCH 32/88] azurerm-linux-vm: use azurerm_virtual_machine We have to fall back to using this resource until https://github.com/hashicorp/terraform-provider-azurerm/issues/6117 is fixed. With `azurerm_linux_virtual_machine` and `azurerm_virtual_machine_data_disk_attachment` the disk only gets attached once the VM is booted up, and the VM can't boot up if it waits for the data disk to appear. Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 4 - hosts/jenkins-controller/configuration.nix | 4 - nix/devshell.nix | 1 + terraform/jenkins/binary_cache.tf | 19 ++-- terraform/jenkins/jenkins_controller.tf | 18 ++-- tf-modules/azurerm-linux-vm/README.md | 10 ++ tf-modules/azurerm-linux-vm/variables.tf | 3 + .../azurerm-linux-vm/virtual_machine.tf | 101 +++++++++++++----- 8 files changed, 109 insertions(+), 51 deletions(-) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index 4f882347..b190fd15 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -22,10 +22,6 @@ device = "/dev/disk/by-lun/10"; fsType = "ext4"; options = [ - # Due to https://github.com/hashicorp/terraform-provider-azurerm/issues/6117 - # disks get attached later during boot. - # The default of 90s doesn't seem to be sufficient. - "x-systemd.device-timeout=5min" "x-systemd.makefs" "x-systemd.growfs" ]; diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 4d0c7566..30c12963 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -18,10 +18,6 @@ device = "/dev/disk/by-lun/10"; fsType = "ext4"; options = [ - # Due to https://github.com/hashicorp/terraform-provider-azurerm/issues/6117 - # disks get attached later during boot. - # The default of 90s doesn't seem to be sufficient. 
- "x-systemd.device-timeout=5min" "x-systemd.makefs" "x-systemd.growfs" ]; diff --git a/nix/devshell.nix b/nix/devshell.nix index ed5b777b..cd6c6060 100644 --- a/nix/devshell.nix +++ b/nix/devshell.nix @@ -24,6 +24,7 @@ p.azurerm p.external p.null + p.random p.sops ])) ]; diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index fa1a6280..7a26ee37 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -49,6 +49,17 @@ module "binary_cache_vm" { })]) subnet_id = azurerm_subnet.binary_cache.id + + # Attach disk to the VM + data_disks = [{ + name = azurerm_managed_disk.binary_cache_caddy_state.name + managed_disk_id = azurerm_managed_disk.binary_cache_caddy_state.id + virtual_machine_id = module.jenkins_controller_vm.virtual_machine_id + lun = "10" + create_option = "Attach" + caching = "None" + disk_size_gb = azurerm_managed_disk.binary_cache_caddy_state.disk_size_gb + }] } resource "azurerm_subnet" "binary_cache" { @@ -98,11 +109,3 @@ resource "azurerm_managed_disk" "binary_cache_caddy_state" { create_option = "Empty" disk_size_gb = 1 } - -# Attach to the VM -resource "azurerm_virtual_machine_data_disk_attachment" "binary_cache_vm_caddy_state" { - managed_disk_id = azurerm_managed_disk.binary_cache_caddy_state.id - virtual_machine_id = module.binary_cache_vm.virtual_machine_id - lun = "10" - caching = "None" -} diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 1b780f5b..4d4a7a39 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -40,6 +40,16 @@ module "jenkins_controller_vm" { })]) subnet_id = azurerm_subnet.jenkins.id + + # Attach disk to the VM + data_disks = [{ + name = azurerm_managed_disk.jenkins_controller_jenkins_state.name + managed_disk_id = azurerm_managed_disk.jenkins_controller_jenkins_state.id + lun = "10" + # create_option = "Attach" + caching = "None" + disk_size_gb = azurerm_managed_disk.jenkins_controller_jenkins_state.disk_size_gb + }] } resource "azurerm_network_interface_security_group_association" "jenkins_controller_vm" { @@ -74,11 +84,3 @@ resource "azurerm_managed_disk" "jenkins_controller_jenkins_state" { create_option = "Empty" disk_size_gb = 10 } - -# Attach to the VM -resource "azurerm_virtual_machine_data_disk_attachment" "jenkins_controller_vm_jenkins_state" { - managed_disk_id = azurerm_managed_disk.jenkins_controller_jenkins_state.id - virtual_machine_id = module.jenkins_controller_vm.virtual_machine_id - lun = "10" - caching = "None" -} diff --git a/tf-modules/azurerm-linux-vm/README.md b/tf-modules/azurerm-linux-vm/README.md index 7dde4a7b..c0cc5aee 100644 --- a/tf-modules/azurerm-linux-vm/README.md +++ b/tf-modules/azurerm-linux-vm/README.md @@ -7,3 +7,13 @@ SPDX-License-Identifier: Apache-2.0 # azurerm-linux-vm Terraform module spinning up a Azure VM. + +This uses the `azurerm_virtual_machine` resource to spin up the VM, as it allows +data disks to be attached on boot. + +This is due to +https://github.com/hashicorp/terraform-provider-azurerm/issues/6117 +- with `azurerm_linux_virtual_machine` and +`azurerm_virtual_machine_data_disk_attachment` the disk only gets attached once +the VM is booted up, and the VM can't boot up if it waits for the data disk +to appear. 
diff --git a/tf-modules/azurerm-linux-vm/variables.tf b/tf-modules/azurerm-linux-vm/variables.tf index 4a6bd79c..3ae57201 100644 --- a/tf-modules/azurerm-linux-vm/variables.tf +++ b/tf-modules/azurerm-linux-vm/variables.tf @@ -32,3 +32,6 @@ variable "subnet_id" { description = "The subnet ID to attach to the VM and allocate an IP from" } +variable "data_disks" { + description = "List of dict containing keys of the storage_data_disk block" +} diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf index 321168b9..45cecb22 100644 --- a/tf-modules/azurerm-linux-vm/virtual_machine.tf +++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf @@ -2,46 +2,77 @@ # # SPDX-License-Identifier: Apache-2.0 -resource "azurerm_linux_virtual_machine" "main" { +resource "azurerm_virtual_machine" "main" { name = var.virtual_machine_name resource_group_name = var.resource_group_name location = var.location - size = var.virtual_machine_size + vm_size = var.virtual_machine_size - # Unused, but required by the API. May not be root either - admin_username = "foo" - admin_password = "S00persecret" - - # We *don't* support password auth, and this doesn't change anything. - # However, if we don't set this to false we need to - # specify additional pubkeys. - disable_password_authentication = false - # We can't use admin_ssh_key, as it only works for the admin_username. + delete_os_disk_on_termination = true + delete_data_disks_on_termination = false network_interface_ids = [azurerm_network_interface.default.id] - source_image_id = var.virtual_machine_source_image + + storage_image_reference { + id = var.virtual_machine_source_image + } identity { type = "SystemAssigned" } - # We only set custom_data here, not user_data. - # user_data is more recent, and allows updates without recreating the machine, - # but at least cloud-init 23.1.2 blocks boot if custom_data is not set. - # (It logs about not being able to mount /dev/sr0 to /metadata). - # This can be worked around by setting custom_data to a static placeholder, - # but user_data is still ignored. - # TODO: check this again with a more recent cloud-init version. - custom_data = (var.virtual_machine_custom_data == "") ? null : base64encode(var.virtual_machine_custom_data) + os_profile { + computer_name = var.virtual_machine_name + # Unused, but required by the API. May not be root either + admin_username = "foo" + admin_password = "S00persecret" + + # We only set custom_data here, not user_data. + # user_data is more recent, and allows updates without recreating the machine, + # but at least cloud-init 23.1.2 blocks boot if custom_data is not set. + # (It logs about not being able to mount /dev/sr0 to /metadata). + # This can be worked around by setting custom_data to a static placeholder, + # but user_data is still ignored. + # TODO: check this again with a more recent cloud-init version. + custom_data = (var.virtual_machine_custom_data == "") ? null : base64encode(var.virtual_machine_custom_data) + } + + os_profile_linux_config { + # We *don't* support password auth, and this doesn't change anything. + # However, if we don't set this to false we need to + # specify additional pubkeys. + disable_password_authentication = false + # We can't use admin_ssh_key, as it only works for the admin_username. 
+ } - # Enable boot diagnostics, use the managed storage account to store them boot_diagnostics { - storage_account_uri = null + enabled = true + # azurerm_virtual_machine doesn't support the managed storage account + storage_uri = azurerm_storage_account.boot_diag.primary_blob_endpoint + } + + storage_os_disk { + name = "${var.virtual_machine_name}-osdisk" # needs to be unique + caching = "ReadWrite" + create_option = "FromImage" + managed_disk_type = "Standard_LRS" } - os_disk { - caching = "ReadWrite" - storage_account_type = "Standard_LRS" + dynamic "storage_data_disk" { + for_each = var.data_disks + + content { + # use lookup here, so keys can be set optionally + name = lookup(storage_data_disk.value, "name", null) + caching = lookup(storage_data_disk.value, "caching", null) + create_option = "Attach" + # This has to be passed, even for "Attach" + disk_size_gb = lookup(storage_data_disk.value, "disk_size_gb", null) + lun = lookup(storage_data_disk.value, "lun", null) + + managed_disk_type = lookup(storage_data_disk.value, "managed_disk_type", null) + managed_disk_id = lookup(storage_data_disk.value, "managed_disk_id", null) + } } } @@ -66,12 +97,28 @@ resource "azurerm_public_ip" "default" { allocation_method = "Static" } +# Create a random string, and a storage account using that random string. +resource "random_string" "boot_diag" { + length = "8" + special = "false" + upper = false +} + +resource "azurerm_storage_account" "boot_diag" { + name = "${random_string.boot_diag.result}bootdiag" + resource_group_name = var.resource_group_name + location = var.location + account_tier = "Standard" + account_replication_type = "GRS" +} + + output "virtual_machine_id" { - value = azurerm_linux_virtual_machine.main.id + value = azurerm_virtual_machine.main.id } output "virtual_machine_identity_principal_id" { - value = azurerm_linux_virtual_machine.main.identity[0].principal_id + value = azurerm_virtual_machine.main.identity[0].principal_id } output "virtual_machine_network_interface_id" { From 3ba044e45ad0a0e7667de89d8cab87fb5dc8215b Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 14 Dec 2023 16:02:43 +0200 Subject: [PATCH 33/88] flake: switch to nixpkgs master Temporarily move to a nixpkgs master, as that contains waagent fixes needed for in-initrd resource disk management. 
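After pointing the nixpkgs input at master in flake.nix, the lock file can be
refreshed with, e.g. (the exact invocation depends on the Nix version):

    $ nix flake lock --update-input nixpkgs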
Signed-off-by: Florian Klink --- flake.lock | 8 ++++---- flake.nix | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flake.lock b/flake.lock index 9572253e..ad4ffbba 100644 --- a/flake.lock +++ b/flake.lock @@ -141,16 +141,16 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1701539137, - "narHash": "sha256-nVO/5QYpf1GwjvtpXhyxx5M3U/WN0MwBro4Lsk+9mL0=", + "lastModified": 1703092220, + "narHash": "sha256-O1W4RXGNCXVOOfFr6AyOZKS+2gAviatUBZwBzZEPeFc=", "owner": "nixos", "repo": "nixpkgs", - "rev": "933d7dc155096e7575d207be6fb7792bc9f34f6d", + "rev": "e3f251c662bc525e4bae3edfa3fc67e52d690d4f", "type": "github" }, "original": { "owner": "nixos", - "ref": "nixos-23.11", + "ref": "master", "repo": "nixpkgs", "type": "github" } diff --git a/flake.nix b/flake.nix index 75e918a0..eabe93ee 100644 --- a/flake.nix +++ b/flake.nix @@ -6,7 +6,7 @@ inputs = { # Nixpkgs - nixpkgs.url = "github:nixos/nixpkgs/nixos-23.11"; + nixpkgs.url = "github:nixos/nixpkgs/master"; # Allows us to structure the flake with the NixOS module system flake-parts.url = "github:hercules-ci/flake-parts"; flake-root.url = "github:srid/flake-root"; From 162a406df234db2cf31c1d778efa059a38276bb4 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Fri, 15 Dec 2023 14:15:49 +0200 Subject: [PATCH 34/88] azure-scratch-store-common.nix: init This configures systemd-in-initrd, and overlays /nix/store with a place on the local scratch disk available at /dev/disk/azure/resource-part1. The general concept of this should probably be factored out into nixpkgs, so we only need to enable that option and take care of the azure-specific bits here. Signed-off-by: Florian Klink --- hosts/azure-scratch-store-common.nix | 86 ++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 hosts/azure-scratch-store-common.nix diff --git a/hosts/azure-scratch-store-common.nix b/hosts/azure-scratch-store-common.nix new file mode 100644 index 00000000..5dcfea78 --- /dev/null +++ b/hosts/azure-scratch-store-common.nix @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 +{ + pkgs, + utils, + ... +}: { + # Disable explicit resource disk handling in waagent. + # We want to take control over it in initrd already. + virtualisation.azure.agent.mountResourceDisk = false; + + boot.initrd.systemd = { + # This requires systemd-in-initrd. + enable = true; + + # We need the wipefs binary available in the initrd + extraBin = { + "wipefs" = "${pkgs.util-linux}/bin/wipefs"; + }; + + # The resource disk comes pre-formatted with NTFS, not ext4. + # Wipe the superblock if it's NTFS (and only then, to not wipe on every reboot). + # Once we get `filesystems`-syntax to work again, we could delegate the mkfs + # part to systemd-makefs (and make this `wantedBy` and `before` that makefs + # unit). 
+ services.wipe-resource-disk = { + description = "Wipe resource disk before makefs"; + requires = ["${utils.escapeSystemdPath "dev/disk/azure/resource-part1"}.device"]; + after = ["${utils.escapeSystemdPath "dev/disk/azure/resource-part1"}.device"]; + wantedBy = ["${utils.escapeSystemdPath "sysroot/mnt/resource"}.mount"]; + before = ["${utils.escapeSystemdPath "sysroot/mnt/resource"}.mount"]; + + script = '' + if [[ $(wipefs --output=TYPE -p /dev/disk/azure/resource-part1) == "ntfs" ]]; then + echo "wiping resource disk (was ntfs)" + wipefs -a /dev/disk/azure/resource-part1 + mkfs.ext4 /dev/disk/azure/resource-part1 + else + echo "skip wiping resource disk (not ntfs)" + fi + ''; + }; + + # Once /sysroot/mnt/resource is mounted, ensure the two .rw-store/ + # {work,store} directories that overlayfs is using are present. + # The kernel doesn't create them on its own and fails the mount if they're + # not present, so we set `wantedBy` and `before` to the .mount unit. + services.setup-resource-disk = { + description = "Setup resource disk after it's mounted"; + unitConfig.RequiresMountsFor = "/sysroot/mnt/resource"; + wantedBy = ["${utils.escapeSystemdPath "sysroot/nix/store"}.mount"]; + before = ["${utils.escapeSystemdPath "sysroot/nix/store"}.mount"]; + + script = '' + mkdir -p /sysroot/mnt/resource/.rw-store/{work,store} + ''; + }; + + # These describe the mountpoints inside the initrd + # (/sysroot/mnt/resource, /sysroot/nix/store). + # In the future, this should be moved to `filesystems`-syntax, so we can + # make use of systemd-makefs and can write some things more concisely. + mounts = [ + { + where = "/sysroot/mnt/resource"; + what = "/dev/disk/azure/resource-part1"; + type = "ext4"; + } + # describe the overlay mount + { + where = "/sysroot/nix/store"; + what = "overlay"; + type = "overlay"; + options = "lowerdir=/sysroot/nix/store,upperdir=/sysroot/mnt/resource/.rw-store/store,workdir=/sysroot/mnt/resource/.rw-store/work"; + wantedBy = ["initrd-fs.target"]; + before = ["initrd-fs.target"]; + requires = ["setup-resource-disk.service"]; + after = ["setup-resource-disk.service"]; + unitConfig.RequiresMountsFor = ["/sysroot" "/sysroot/mnt/resource"]; + } + ]; + }; + # load the overlay kernel module + boot.initrd.kernelModules = ["overlay"]; +} From a3ca9e0f83f1d02c7b50bdd53d340c25cff74f2e Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Fri, 15 Dec 2023 14:18:01 +0200 Subject: [PATCH 35/88] hosts: enable scratch /nix/store Load the snippet to overlay /nix/store with the scratch disk. 
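On a booted VM the overlay can be verified with findmnt; expected output is
roughly the following (a sketch; flags and option order may differ):

    $ findmnt -n /nix/store
    /nix/store overlay overlay rw,lowerdir=/nix/store,upperdir=/mnt/resource/.rw-store/store,workdir=/mnt/resource/.rw-store/work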
Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 1 + hosts/jenkins-controller/configuration.nix | 1 + 2 files changed, 2 insertions(+) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index b190fd15..a2c4e099 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -10,6 +10,7 @@ }: { imports = [ ../azure-common-2.nix + ../azure-scratch-store-common.nix self.nixosModules.service-openssh ]; diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 30c12963..b98fb139 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -8,6 +8,7 @@ }: { imports = [ ../azure-common-2.nix + ../azure-scratch-store-common.nix self.nixosModules.service-openssh ]; From 5ad2e74795f31c94b1cd35a264e7ebaff91bc33b Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 15:23:52 +0200 Subject: [PATCH 36/88] terraform/jenkins: interpolate storageaccount name No change in rendered config, but properly describes the dependency inside Terraform. Signed-off-by: Florian Klink --- terraform/jenkins/binary_cache.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index 7a26ee37..b6a606d4 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -38,7 +38,7 @@ module "binary_cache_vm" { # See corresponding EnvironmentFile= directives in services write_files = [ { - content = "AZURE_STORAGE_ACCOUNT_NAME=ghafbinarycache", + content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}", "path" = "/run/rclone-http.env" }, { From 03d49cdcc611a6b02bad63a4d4d3025af9b9cbf1 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 16:33:48 +0200 Subject: [PATCH 37/88] binary-cache: rclone env file: move to /var/lib It seems /run/rclone-http is a bit racy under some circumstances. Signed-off-by: Florian Klink --- hosts/binary-cache/configuration.nix | 2 +- terraform/jenkins/binary_cache.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix index a2c4e099..dc28e679 100644 --- a/hosts/binary-cache/configuration.nix +++ b/hosts/binary-cache/configuration.nix @@ -50,7 +50,7 @@ + ":azureblob:binary-cache-v1"; # On successful startup, grant caddy write permissions to the socket. ExecStartPost = "${pkgs.acl.bin}/bin/setfacl -m u:caddy:rw %t/rclone-http/socket"; - EnvironmentFile = "/run/rclone-http.env"; + EnvironmentFile = "/var/lib/rclone-http/env"; }; }; diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index b6a606d4..d8fc06bd 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -39,7 +39,7 @@ module "binary_cache_vm" { write_files = [ { content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}", - "path" = "/run/rclone-http.env" + "path" = "/var/lib/rclone-http/env" }, { content = "SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com", From 46a45fac027b158b2e41f1ff5d1e119022d28f2a Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 15:19:29 +0200 Subject: [PATCH 38/88] services: add remote-build module This adds an additional "remote-build" ssh user. The Jenkins controller will use this as user to do remote Nix builds. 
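Later patches in this series point the controller at these builders via
/etc/nix/machines; each builder ends up as a line like the following (the IP
is an example from the builders subnet):

    $ head -n1 /etc/nix/machines
    ssh://remote-build@10.0.4.5 x86_64-linux /etc/secrets/remote-build-ssh-key 10 10 kvm,big-parallel - -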
Signed-off-by: Florian Klink --- services/default.nix | 3 ++- services/remote-build/default.nix | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 services/remote-build/default.nix diff --git a/services/default.nix b/services/default.nix index d5003410..63fea607 100644 --- a/services/default.nix +++ b/services/default.nix @@ -6,7 +6,8 @@ service-binary-cache = import ./binary-cache; service-hydra = import ./hydra; service-nginx = import ./nginx; - service-openssh = import ./openssh; service-node-exporter = import ./node-exporter; + service-openssh = import ./openssh; + service-remote-build = import ./remote-build; }; } diff --git a/services/remote-build/default.nix b/services/remote-build/default.nix new file mode 100644 index 00000000..e7ba619b --- /dev/null +++ b/services/remote-build/default.nix @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 +_: { + # Adds a "remote-build" ssh user, which can trigger nix builds. + # TODO: once they all use a common binary cache, we can drop the trusted user + # statement, so jenkins can't copy store paths, but builders can only + # substitute. + nix.settings.trusted-users = ["remote-build"]; + users.users.remote-build = { + isNormalUser = true; + name = "remote-build"; + }; +} From 6af53c204c6bf950adbbc13dcc97cfaa4f50d316 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 15:24:57 +0200 Subject: [PATCH 39/88] hosts: add builder node Signed-off-by: Florian Klink --- hosts/builder/configuration.nix | 51 +++++++++++++++++++++++++++++++++ hosts/default.nix | 4 +++ 2 files changed, 55 insertions(+) create mode 100644 hosts/builder/configuration.nix diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix new file mode 100644 index 00000000..8b90b312 --- /dev/null +++ b/hosts/builder/configuration.nix @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 +{ + self, + pkgs, + lib, + ... +}: { + imports = [ + ../azure-common-2.nix + ../azure-scratch-store-common.nix + self.nixosModules.service-openssh + self.nixosModules.service-remote-build + ]; + + nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; + + # Run a read-only HTTP webserver proxying to the "binary-cache-v1" storage + # container via http://localhost:8080. + # This relies on IAM to grant access to the storage container. + systemd.services.rclone-http = { + after = ["network.target"]; + requires = ["network.target"]; + wantedBy = ["multi-user.target"]; + serviceConfig = { + Type = "notify"; + Restart = "always"; + RestartSec = 2; + DynamicUser = true; + RuntimeDirectory = "rclone-http"; + ExecStart = + "${pkgs.rclone}/bin/rclone " + + "serve http " + + "--azureblob-env-auth " + + "--read-only " + + "--addr localhost:8080 " + + ":azureblob:binary-cache-v1"; + EnvironmentFile = "/var/lib/rclone-http/env"; + }; + }; + + # Configure Nix to use this as a substitutor. + # TODO: add the public key used to sign on the controller here. 
+ nix.settings.trusted-public-keys = []; + nix.settings.substituters = [ + "http://localhost:8080" + ]; + + system.stateVersion = "23.05"; +} diff --git a/hosts/default.nix b/hosts/default.nix index fcc2057a..19f5d4b2 100644 --- a/hosts/default.nix +++ b/hosts/default.nix @@ -32,6 +32,10 @@ inherit specialArgs; modules = [./binary-cache/configuration.nix]; }; + builder = lib.nixosSystem { + inherit specialArgs; + modules = [./builder/configuration.nix]; + }; ficolobuild3 = lib.nixosSystem { inherit specialArgs; modules = [./ficolobuild/build3.nix]; From efb967c4c4eaa1fa42b75884bc6e9a71c9b31b8b Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 15:28:08 +0200 Subject: [PATCH 40/88] tf-modules: linux-vm: allow no data disks Signed-off-by: Florian Klink --- tf-modules/azurerm-linux-vm/variables.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/tf-modules/azurerm-linux-vm/variables.tf b/tf-modules/azurerm-linux-vm/variables.tf index 3ae57201..0e4bfe13 100644 --- a/tf-modules/azurerm-linux-vm/variables.tf +++ b/tf-modules/azurerm-linux-vm/variables.tf @@ -34,4 +34,5 @@ variable "subnet_id" { variable "data_disks" { description = "List of dict containing keys of the storage_data_disk block" + default = [] } From 849e76a42685da8ce5b959c3106d4dbb8ac57be1 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 18:46:20 +0200 Subject: [PATCH 41/88] tf-modules/azurerm-linux-vm allow non-public ips This adds a allocate_public_ip boolean variable (defaulting to false), and will only create a public ip if it's set to true. Signed-off-by: Florian Klink --- terraform/jenkins/binary_cache.tf | 3 ++- terraform/jenkins/jenkins_controller.tf | 3 ++- tf-modules/azurerm-linux-vm/variables.tf | 5 +++++ tf-modules/azurerm-linux-vm/virtual_machine.tf | 4 +++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index d8fc06bd..07246b89 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -48,7 +48,8 @@ module "binary_cache_vm" { ], })]) - subnet_id = azurerm_subnet.binary_cache.id + allocate_public_ip = true + subnet_id = azurerm_subnet.binary_cache.id # Attach disk to the VM data_disks = [{ diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 4d4a7a39..4c766029 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -39,7 +39,8 @@ module "jenkins_controller_vm" { ] })]) - subnet_id = azurerm_subnet.jenkins.id + allocate_public_ip = true + subnet_id = azurerm_subnet.jenkins.id # Attach disk to the VM data_disks = [{ diff --git a/tf-modules/azurerm-linux-vm/variables.tf b/tf-modules/azurerm-linux-vm/variables.tf index 0e4bfe13..88be7e64 100644 --- a/tf-modules/azurerm-linux-vm/variables.tf +++ b/tf-modules/azurerm-linux-vm/variables.tf @@ -27,6 +27,11 @@ variable "virtual_machine_custom_data" { default = "" } +variable "allocate_public_ip" { + type = bool + default = false +} + variable "subnet_id" { type = string description = "The subnet ID to attach to the VM and allocate an IP from" diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf index 45cecb22..a587aa2e 100644 --- a/tf-modules/azurerm-linux-vm/virtual_machine.tf +++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf @@ -85,11 +85,13 @@ resource "azurerm_network_interface" "default" { name = "internal" subnet_id = var.subnet_id private_ip_address_allocation = 
"Dynamic" - public_ip_address_id = azurerm_public_ip.default.id + public_ip_address_id = (var.allocate_public_ip) ? azurerm_public_ip.default[0].id : null } } resource "azurerm_public_ip" "default" { + count = (var.allocate_public_ip) ? 1 : 0 + name = "${var.virtual_machine_name}-pub-ip" domain_name_label = var.virtual_machine_name resource_group_name = var.resource_group_name From 2ddd2e607129fc01dc7902e531cb6e9f792526a2 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Mon, 18 Dec 2023 16:36:20 +0200 Subject: [PATCH 42/88] terraform: deploy builders This deploys two builders in a new subnet. Signed-off-by: Florian Klink --- terraform/jenkins/builder.tf | 96 ++++++++++++++++++++++++++++++++++++ terraform/jenkins/main.tf | 11 ++++- 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 terraform/jenkins/builder.tf diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf new file mode 100644 index 00000000..bd8dcf6d --- /dev/null +++ b/terraform/jenkins/builder.tf @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +module "builder_image" { + source = "../../tf-modules/azurerm-nix-vm-image" + + nix_attrpath = "outputs.nixosConfigurations.builder.config.system.build.azureImage" + nix_entrypoint = "${path.module}/../.." + + + name = "builder" + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + storage_account_name = azurerm_storage_account.vm_images.name + storage_container_name = azurerm_storage_container.vm_images.name +} + +locals { + num_builders = 2 +} + +module "builder_vm" { + source = "../../tf-modules/azurerm-linux-vm" + + count = local.num_builders + + resource_group_name = azurerm_resource_group.default.name + location = azurerm_resource_group.default.location + + virtual_machine_name = "ghaf-builder-${count.index}" + virtual_machine_size = "Standard_D16_v3" + virtual_machine_source_image = module.builder_image.image_id + + virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ + users = concat([ + # TODO: drop once this is known to work. 
+    for user in toset(["bmg", "flokli", "hrosten"]) : {
+      name                = user
+      sudo                = "ALL=(ALL) NOPASSWD:ALL"
+      ssh_authorized_keys = local.ssh_keys[user]
+    }
+    ], [{
+      name = "remote-build"
+      ssh_authorized_keys = [
+        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAFdY1Nnn5JkkrvwoDfqtBFfn7oZaW2YJPwtsh/UzRZO remote-build"
+      ]
+    }])
+    write_files = [
+      {
+        content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}",
+        "path"  = "/var/lib/rclone-http/env"
+      }
+    ],
+  })])
+
+  subnet_id = azurerm_subnet.builders.id
+}
+
+# Allow inbound SSH from the jenkins subnet (only)
+resource "azurerm_network_interface_security_group_association" "builder_vm" {
+  count = local.num_builders
+
+  network_interface_id      = module.builder_vm[count.index].virtual_machine_network_interface_id
+  network_security_group_id = azurerm_network_security_group.binary_cache_vm.id
+}
+
+resource "azurerm_network_security_group" "builder_vm" {
+  count = local.num_builders
+
+  name                = "builder-vm-${count.index}"
+  resource_group_name = azurerm_resource_group.default.name
+  location            = azurerm_resource_group.default.location
+
+  security_rule {
+    name                       = "AllowSSHFromJenkins"
+    priority                   = 400
+    direction                  = "Inbound"
+    access                     = "Allow"
+    protocol                   = "Tcp"
+    source_port_range          = "*"
+    destination_port_ranges    = [22]
+    source_address_prefix      = azurerm_subnet.jenkins.address_prefixes[0]
+    destination_address_prefix = "*"
+  }
+}
+
+# Allow the VMs to read from the binary cache bucket
+resource "azurerm_role_assignment" "builder_access_binary_cache" {
+  count = local.num_builders
+
+  scope                = azurerm_storage_container.binary_cache_1.resource_manager_id
+  role_definition_name = "Storage Blob Data Reader"
+  principal_id         = module.builder_vm[count.index].virtual_machine_identity_principal_id
+}
diff --git a/terraform/jenkins/main.tf b/terraform/jenkins/main.tf
index 71cb9dbe..16bdafec 100644
--- a/terraform/jenkins/main.tf
+++ b/terraform/jenkins/main.tf
@@ -17,7 +17,7 @@ resource "azurerm_resource_group" "default" {
   location = "northeurope"
 }
 
-# Create a virtual network and slice out a subnet for jenkins.
+# Create a virtual network
 resource "azurerm_virtual_network" "vnet" {
   name                = "ghaf-infra-vnet"
   address_space       = ["10.0.0.0/16"]
@@ -25,9 +25,18 @@ resource "azurerm_virtual_network" "vnet" {
   resource_group_name = azurerm_resource_group.default.name
 }
 
+# Slice out a subnet for jenkins.
 resource "azurerm_subnet" "jenkins" {
   name                 = "ghaf-infra-jenkins"
   resource_group_name  = azurerm_resource_group.default.name
   virtual_network_name = azurerm_virtual_network.vnet.name
   address_prefixes     = ["10.0.2.0/24"]
 }
+
+# Slice out a subnet for the builders.
+resource "azurerm_subnet" "builders" {
+  name                 = "ghaf-infra-builders"
+  resource_group_name  = azurerm_resource_group.default.name
+  virtual_network_name = azurerm_virtual_network.vnet.name
+  address_prefixes     = ["10.0.4.0/28"]
+}

From 5acda094a30f0877a5538a252f8452b4b9f0dcd1 Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Mon, 18 Dec 2023 19:29:01 +0200
Subject: [PATCH 43/88] terraform/jenkins: create ed25519 key with terraform

Signed-off-by: Florian Klink
---
 .reuse/dep5                                   |  2 +-
 nix/devshell.nix                              |  2 ++
 terraform/jenkins/builder.tf                  |  2 +-
 terraform/jenkins/id_ed25519_remote_build.pub |  1 +
 terraform/jenkins/remote_build_ssh.tf         | 16 ++++++++++++++++
 5 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 terraform/jenkins/id_ed25519_remote_build.pub
 create mode 100644 terraform/jenkins/remote_build_ssh.tf

diff --git a/.reuse/dep5 b/.reuse/dep5
index 3dbec680..9bc6023b 100644
--- a/.reuse/dep5
+++ b/.reuse/dep5
@@ -2,4 +2,4 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
 Copyright: 2023 Technology Innovation Institute (TII)
 License: Apache-2.0
 
-Files: *.lock *.png *.svg *.csv *.yaml
\ No newline at end of file
+Files: *.lock *.png *.svg *.csv *.yaml *.pub
diff --git a/nix/devshell.nix b/nix/devshell.nix
index cd6c6060..427c4cde 100644
--- a/nix/devshell.nix
+++ b/nix/devshell.nix
@@ -23,9 +23,11 @@
     (terraform.withPlugins (p: [
       p.azurerm
       p.external
+      p.local
       p.null
       p.random
       p.sops
+      p.tls
     ]))
   ];
 };
diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf
index bd8dcf6d..91041fcd 100644
--- a/terraform/jenkins/builder.tf
+++ b/terraform/jenkins/builder.tf
@@ -44,7 +44,7 @@ module "builder_vm" {
     ], [{
       name = "remote-build"
       ssh_authorized_keys = [
-        "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAFdY1Nnn5JkkrvwoDfqtBFfn7oZaW2YJPwtsh/UzRZO remote-build"
+        tls_private_key.ed25519_remote_build.public_key_openssh
       ]
     }])
     write_files = [
diff --git a/terraform/jenkins/id_ed25519_remote_build.pub b/terraform/jenkins/id_ed25519_remote_build.pub
new file mode 100644
index 00000000..2e1b54e4
--- /dev/null
+++ b/terraform/jenkins/id_ed25519_remote_build.pub
@@ -0,0 +1 @@
+ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBlHMthtFYhBK6WsZCNXeJcOUx6xeVLNAHhWI6zFofiD
diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf
new file mode 100644
index 00000000..e9e46c14
--- /dev/null
+++ b/terraform/jenkins/remote_build_ssh.tf
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII)
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Create an ED25519 key, which the jenkins master will use to authenticate
+# with builders.
+resource "tls_private_key" "ed25519_remote_build" {
+  algorithm = "ED25519"
+}
+
+# Dump the ed25519 public key to disk
+resource "local_file" "ed25519_remote_build_pubkey" {
+  filename        = "${path.module}/id_ed25519_remote_build.pub"
+  file_permission = "0644"
+  content         = tls_private_key.ed25519_remote_build.public_key_openssh
+}

From a396ced1b1736edc8cfea8edf6fbee6ebe31a3d2 Mon Sep 17 00:00:00 2001
From: Florian Klink
Date: Tue, 19 Dec 2023 10:21:16 +0200
Subject: [PATCH 44/88] terraform/jenkins: put privkey in azure key vault

This creates an azure key vault, adds the private key as a secret there,
and then grants the jenkins-controller VM access to read that secret.
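Once applied, access can be sanity-checked with the Azure CLI, assuming the
caller is covered by one of the access policies:

    $ az keyvault secret show --vault-name ghaf-ssh-remote-build \
        --name remote-build-ssh-private-key --query value -o tsv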
Signed-off-by: Florian Klink --- terraform/jenkins/jenkins_controller.tf | 12 +++++++ terraform/jenkins/remote_build_ssh.tf | 42 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 4c766029..59595d16 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -85,3 +85,15 @@ resource "azurerm_managed_disk" "jenkins_controller_jenkins_state" { create_option = "Empty" disk_size_gb = 10 } + +# Grant the VM read-only access to the Azure Key Vault Secret containing the +# ed25519 private key used to connect to remote builders. +resource "azurerm_key_vault_access_policy" "ssh_remote_build_jenkins_controller" { + key_vault_id = azurerm_key_vault.ssh_remote_build.id + tenant_id = data.azurerm_client_config.current.tenant_id + object_id = module.jenkins_controller_vm.virtual_machine_identity_principal_id + + secret_permissions = [ + "Get", + ] +} diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf index e9e46c14..77eb73cd 100644 --- a/terraform/jenkins/remote_build_ssh.tf +++ b/terraform/jenkins/remote_build_ssh.tf @@ -14,3 +14,45 @@ resource "local_file" "ed25519_remote_build_pubkey" { file_permission = "0644" content = tls_private_key.ed25519_remote_build.public_key_openssh } + +# Create an Azure key vault. +resource "azurerm_key_vault" "ssh_remote_build" { + # this must be globally unique + name = "ghaf-ssh-remote-build" + location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.default.name + sku_name = "standard" + # The Azure Active Directory tenant ID that should be used for authenticating + # requests to the key vault. + tenant_id = data.azurerm_client_config.current.tenant_id +} + +data "azurerm_client_config" "current" {} + +# Put the ed25519 private key used for ssh as a secret. +resource "azurerm_key_vault_secret" "ssh_remote_build" { + name = "remote-build-ssh-private-key" + value = tls_private_key.ed25519_remote_build.private_key_openssh + key_vault_id = azurerm_key_vault.ssh_remote_build.id + + # Each of the secrets needs an explicit dependency on the access policy. + # Otherwise, Terraform may attempt to create the secret before creating the + # access policy. + # https://stackoverflow.com/a/74747333 + depends_on = [ + azurerm_key_vault_access_policy.ssh_remote_build_terraform + ] +} + +resource "azurerm_key_vault_access_policy" "ssh_remote_build_terraform" { + key_vault_id = azurerm_key_vault.ssh_remote_build.id + tenant_id = data.azurerm_client_config.current.tenant_id + # TODO: set some common group as object_id? + object_id = data.azurerm_client_config.current.object_id + + secret_permissions = [ + "Get", + "List", + "Set" + ] +} From 2fd51692359df08aac72da2889c1ba6676db5fc7 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 12:12:18 +0200 Subject: [PATCH 45/88] terraform/jenkins: use TerraformAdminsGHAFInfra Use the common group, instead of the current client object id. 
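The hardcoded object id below can be cross-checked with something like this
(the property is named id or objectId depending on the Azure CLI version):

    $ az ad group show --group TerraformAdminsGHAFInfra --query id -o tsv
    f80c2488-2301-4de8-89d6-4954b77f453e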
Signed-off-by: Florian Klink --- terraform/jenkins/remote_build_ssh.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf index 77eb73cd..a8acaae7 100644 --- a/terraform/jenkins/remote_build_ssh.tf +++ b/terraform/jenkins/remote_build_ssh.tf @@ -47,8 +47,8 @@ resource "azurerm_key_vault_secret" "ssh_remote_build" { resource "azurerm_key_vault_access_policy" "ssh_remote_build_terraform" { key_vault_id = azurerm_key_vault.ssh_remote_build.id tenant_id = data.azurerm_client_config.current.tenant_id - # TODO: set some common group as object_id? - object_id = data.azurerm_client_config.current.object_id + # "TerraformAdminsGHAFInfra" group + object_id = "f80c2488-2301-4de8-89d6-4954b77f453e" secret_permissions = [ "Get", From a696bc7e593128ff2629b76952bd3b853a2a43bd Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 10:49:34 +0200 Subject: [PATCH 46/88] hosts/jenkins-controller: fetch secret from vault This adds a fetch-build-ssh-key systemd service that fetches the ssh private key into /etc/secrets/remote-build-ssh-key (owned by root), and orders itself before nix-daemon. Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 32 ++++++++++++++++++++++ hosts/jenkins-controller/get_secret.py | 24 ++++++++++++++++ terraform/jenkins/jenkins_controller.tf | 7 +++++ 3 files changed, 63 insertions(+) create mode 100644 hosts/jenkins-controller/get_secret.py diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index b98fb139..514f6638 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 { + pkgs, self, lib, ... @@ -36,6 +37,37 @@ # https://github.com/NixOS/nixpkgs/pull/272679 systemd.services.jenkins.serviceConfig.StateDirectory = "jenkins"; + # Define a fetch-remote-build-ssh-key unit populating + # /etc/secrets/remote-build-ssh-key from Azure Key Vault. + # Make it before and requiredBy nix-daemon.service. + systemd.services.fetch-build-ssh-key = { + after = ["network.target"]; + before = ["nix-daemon.service"]; + requires = ["network.target"]; + wantedBy = [ + # nix-daemon is socket-activated, and having it here should be sufficient + # to fetch the keys whenever a jenkins job connects to the daemon first. + # This means this service will effectively get socket-activated on the + # first nix-daemon connection. + "nix-daemon.service" + ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + EnvironmentFile = "/var/lib/fetch-build-ssh-key/env"; + Restart = "on-failure"; + }; + script = let + get-secret = pkgs.writers.writePython3 "get-secret" { + libraries = with pkgs.python3.pkgs; [azure-keyvault-secrets azure-identity]; + } (builtins.readFile ./get_secret.py); + in '' + umask 077 + mkdir -p /etc/secrets/ + ${get-secret} > /etc/secrets/remote-build-ssh-key + ''; + }; + # TODO: deploy reverse proxy, sort out authentication (SSO?) nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; diff --git a/hosts/jenkins-controller/get_secret.py b/hosts/jenkins-controller/get_secret.py new file mode 100644 index 00000000..ba59ea60 --- /dev/null +++ b/hosts/jenkins-controller/get_secret.py @@ -0,0 +1,24 @@ +""" +This script retrieves a secret specified in $SECRET_NAME +from an Azure Key Vault in $KEY_VAULT_NAME +and prints it to stdout. + +It uses the default Azure credential client. 
+""" + +from azure.keyvault.secrets import SecretClient +from azure.identity import DefaultAzureCredential + +import os + +key_vault_name = os.environ["KEY_VAULT_NAME"] +secret_name = os.environ["SECRET_NAME"] + +credential = DefaultAzureCredential() +client = SecretClient( + vault_url=f"https://{key_vault_name}.vault.azure.net", + credential=credential +) + +s = client.get_secret(secret_name) +print(s.value) diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 59595d16..beb5cea2 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -37,6 +37,13 @@ module "jenkins_controller_vm" { ssh_authorized_keys = local.ssh_keys[user] } ] + # See corresponding EnvironmentFile= directives in services + write_files = [ + { + content = "KEY_VAULT_NAME=${azurerm_key_vault.ssh_remote_build.name}\nSECRET_NAME=${azurerm_key_vault_secret.ssh_remote_build.name}", + "path" = "/var/lib/fetch-build-ssh-key/env" + } + ] })]) allocate_public_ip = true From cce8d775bba57c44257f9f38387a630bef4a7e14 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 12:11:52 +0200 Subject: [PATCH 47/88] tf-modules/linux-vm: expose private ip Signed-off-by: Florian Klink --- tf-modules/azurerm-linux-vm/virtual_machine.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf index a587aa2e..90209ae7 100644 --- a/tf-modules/azurerm-linux-vm/virtual_machine.tf +++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf @@ -126,3 +126,8 @@ output "virtual_machine_identity_principal_id" { output "virtual_machine_network_interface_id" { value = azurerm_network_interface.default.id } + +output "virtual_machine_private_ip_address" { + description = "The first private IP address of the network interface." + value = azurerm_network_interface.default.private_ip_address +} From 73560b0de9d20fefa237035ceedc0cc73ec82e36 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 12:12:53 +0200 Subject: [PATCH 48/88] terraform/jenkins: render /etc/nix/machines Render /etc/nix/machines with terraform. In the future, we might want to autodiscover this, or better, have agents register with the controller, rather than having to recreate the VM whenever the list of builders is changed. Signed-off-by: Florian Klink --- terraform/jenkins/jenkins_controller.tf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index beb5cea2..570a5022 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -42,6 +42,16 @@ module "jenkins_controller_vm" { { content = "KEY_VAULT_NAME=${azurerm_key_vault.ssh_remote_build.name}\nSECRET_NAME=${azurerm_key_vault_secret.ssh_remote_build.name}", "path" = "/var/lib/fetch-build-ssh-key/env" + }, + # Render /etc/nix/machines with terraform. In the future, we might want to + # autodiscover this, or better, have agents register with the controller, + # rather than having to recreate the VM whenever the list of builders is + # changed. 
+ { + content = join("\n", [ + for ip in toset(module.builder_vm[*].virtual_machine_private_ip_address) : "ssh://remote-build@${ip} x86_64-linux /etc/secrets/remote-build-ssh-key 10 10 kvm,big-parallel - -" + ]), + "path" = "/etc/nix/machines" } ] })]) From 9e85b3d0a62e3ef0d33bef7cbce049969413f9ed Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 15:10:03 +0200 Subject: [PATCH 49/88] terraform: add terraform-provider-secret Signed-off-by: Florian Klink --- nix/devshell.nix | 1 + terraform/jenkins/main.tf | 11 +++++++++++ terraform/jenkins/remote_state.tf | 5 ----- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/nix/devshell.nix b/nix/devshell.nix index 427c4cde..75c6b136 100644 --- a/nix/devshell.nix +++ b/nix/devshell.nix @@ -26,6 +26,7 @@ p.local p.null p.random + p.secret p.sops p.tls ])) diff --git a/terraform/jenkins/main.tf b/terraform/jenkins/main.tf index 16bdafec..130a5107 100644 --- a/terraform/jenkins/main.tf +++ b/terraform/jenkins/main.tf @@ -6,6 +6,17 @@ provider "azurerm" { features {} } +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + secret = { + source = "numtide/secret" + } + } +} + # read ssh-keys.yaml into local.ssh_keys locals { ssh_keys = yamldecode(file("../../ssh-keys.yaml")) diff --git a/terraform/jenkins/remote_state.tf b/terraform/jenkins/remote_state.tf index 4f68d998..3fdf2ce8 100644 --- a/terraform/jenkins/remote_state.tf +++ b/terraform/jenkins/remote_state.tf @@ -3,11 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 terraform { - required_providers { - azurerm = { - source = "hashicorp/azurerm" - } - } # Backend for storing tfstate (see ../azure-storage) backend "azurerm" { resource_group_name = "ghaf-infra-storage" From 0e03f3d5652390fe5a6bd511dc2e0f319179d48b Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 15:10:31 +0200 Subject: [PATCH 50/88] terraform/jenkins: add post-build-hook and signing This creates a Nix signing key, and uses terraform-provider-secret to hold it in the terraform state. It's then uploaded into an Azure key vault. The jenkins-controller VM has access to it, and puts it at /etc/secrets/ nix-signing-key. A post-build-hook is configured, uploading every build to the binary cache bucket, with the signature. Signed-off-by: Florian Klink --- hosts/builder/configuration.nix | 7 +- hosts/jenkins-controller/configuration.nix | 80 +++++++++++++++++++++- terraform/jenkins/binary_cache_signing.tf | 51 ++++++++++++++ terraform/jenkins/jenkins_controller.tf | 29 +++++++- 4 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 terraform/jenkins/binary_cache_signing.tf diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix index 8b90b312..76761243 100644 --- a/hosts/builder/configuration.nix +++ b/hosts/builder/configuration.nix @@ -40,9 +40,10 @@ }; }; - # Configure Nix to use this as a substitutor. - # TODO: add the public key used to sign on the controller here. - nix.settings.trusted-public-keys = []; + # Configure Nix to use this as a substitutor, and the public key used for signing. + nix.settings.trusted-public-keys = [ + "ghaf-jenkins:5OXpzoevBwH4sBR0S0HaIQCik2adrOrGawIXO+WADCk=" + ]; nix.settings.substituters = [ "http://localhost:8080" ]; diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 514f6638..1410f973 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -6,7 +6,16 @@ self, lib, ... 
-}: { +}: let + post-build-hook = pkgs.writeScript "upload" '' + set -eu + set -f # disable globbing + export IFS=' ' + + echo "Uploading paths" $OUT_PATHS + exec nix --extra-experimental-features nix-command copy --to 'http://localhost:8080?secret-key=/etc/secrets/nix-signing-key&compression=zstd' $OUT_PATHS + ''; +in { imports = [ ../azure-common-2.nix ../azure-scratch-store-common.nix @@ -68,6 +77,75 @@ ''; }; + # Define a fetch-binary-cache-signing-key unit populating + # /etc/secrets/nix-signing-key from Azure Key Vault. + # Make it before and requiredBy nix-daemon.service. + systemd.services.fetch-binary-cache-signing-key = { + after = ["network.target"]; + before = ["nix-daemon.service"]; + requires = ["network.target"]; + wantedBy = [ + # nix-daemon is socket-activated, and having it here should be sufficient + # to fetch the keys whenever a jenkins job connects to the daemon first. + # This means this service will effectively get socket-activated on the + # first nix-daemon connection. + "nix-daemon.service" + ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + EnvironmentFile = "/var/lib/fetch-binary-cache-signing-key/env"; + Restart = "on-failure"; + }; + script = let + get-secret = pkgs.writers.writePython3 "get-secret" { + libraries = with pkgs.python3.pkgs; [azure-keyvault-secrets azure-identity]; + } (builtins.readFile ./get_secret.py); + in '' + umask 077 + mkdir -p /etc/secrets/ + ${get-secret} > /etc/secrets/nix-signing-key + ''; + }; + + # Run a read-write HTTP webserver proxying to the "binary-cache-v1" storage + # This is used by the post-build-hook to upload to the binary cache. + # This relies on IAM to grant access to the storage container. + systemd.services.rclone-http = { + after = ["network.target"]; + requires = ["network.target"]; + wantedBy = ["multi-user.target"]; + serviceConfig = { + Type = "notify"; + Restart = "always"; + RestartSec = 2; + DynamicUser = true; + RuntimeDirectory = "rclone-http"; + ExecStart = + "${pkgs.rclone}/bin/rclone " + + "serve webdav " + + "--azureblob-env-auth " + + "--addr localhost:8080 " + + ":azureblob:binary-cache-v1"; + EnvironmentFile = "/var/lib/rclone-http/env"; + }; + }; + + # Configure Nix to use this as a substitutor, and the public key used for signing. + nix.settings.trusted-public-keys = [ + "ghaf-jenkins:5OXpzoevBwH4sBR0S0HaIQCik2adrOrGawIXO+WADCk=" + ]; + nix.settings.substituters = [ + "http://localhost:8080" + ]; + nix.extraOptions = '' + builders-use-substitutes = true + builders = @/etc/nix/machines + # Build remote by default + max-jobs = 0 + post-build-hook = ${post-build-hook} + ''; + # TODO: deploy reverse proxy, sort out authentication (SSO?) nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; diff --git a/terraform/jenkins/binary_cache_signing.tf b/terraform/jenkins/binary_cache_signing.tf new file mode 100644 index 00000000..87c64e56 --- /dev/null +++ b/terraform/jenkins/binary_cache_signing.tf @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +# nix-store --generate-binary-cache-key foo secret-key public-key +# terraform import secret_resource.binary_cache_signing_key "$(< ./secret-key)" +resource "secret_resource" "binary_cache_signing_key" { + lifecycle { + prevent_destroy = true + } +} + +# Create an Azure key vault. 
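+# It holds the binary cache signing key; the jenkins-controller VM is granted
+# read access to the secret via an access policy in jenkins_controller.tf.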
+resource "azurerm_key_vault" "binary_cache_signing_key" { + # this must be globally unique + name = "ghaf-binarycache-signing" + location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.default.name + sku_name = "standard" + # The Azure Active Directory tenant ID that should be used for authenticating + # requests to the key vault. + tenant_id = data.azurerm_client_config.current.tenant_id +} + +# Upload the binary cache signing key as a vault secret +resource "azurerm_key_vault_secret" "binary_cache_signing_key" { + name = "binary-cache-signing-key" + value = secret_resource.binary_cache_signing_key.value + key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + + # Each of the secrets needs an explicit dependency on the access policy. + # Otherwise, Terraform may attempt to create the secret before creating the + # access policy. + # https://stackoverflow.com/a/74747333 + depends_on = [ + azurerm_key_vault_access_policy.binary_cache_signing_key_terraform + ] +} + +resource "azurerm_key_vault_access_policy" "binary_cache_signing_key_terraform" { + key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + tenant_id = data.azurerm_client_config.current.tenant_id + # "TerraformAdminsGHAFInfra" group + object_id = "f80c2488-2301-4de8-89d6-4954b77f453e" + + secret_permissions = [ + "Get", + "List", + "Set" + ] +} diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 570a5022..3aeb31b6 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -37,12 +37,20 @@ module "jenkins_controller_vm" { ssh_authorized_keys = local.ssh_keys[user] } ] - # See corresponding EnvironmentFile= directives in services write_files = [ + # See corresponding EnvironmentFile= directives in services { content = "KEY_VAULT_NAME=${azurerm_key_vault.ssh_remote_build.name}\nSECRET_NAME=${azurerm_key_vault_secret.ssh_remote_build.name}", "path" = "/var/lib/fetch-build-ssh-key/env" }, + { + content = "KEY_VAULT_NAME=${azurerm_key_vault.binary_cache_signing_key.name}\nSECRET_NAME=${azurerm_key_vault_secret.binary_cache_signing_key.name}", + "path" = "/var/lib/fetch-binary-cache-signing-key/env" + }, + { + content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}", + "path" = "/var/lib/rclone-http/env" + }, # Render /etc/nix/machines with terraform. In the future, we might want to # autodiscover this, or better, have agents register with the controller, # rather than having to recreate the VM whenever the list of builders is @@ -114,3 +122,22 @@ resource "azurerm_key_vault_access_policy" "ssh_remote_build_jenkins_controller" "Get", ] } + +# Allow the VM to *write* to (and read from) the binary cache bucket +resource "azurerm_role_assignment" "jenkins_controller_access_storage" { + scope = azurerm_storage_container.binary_cache_1.resource_manager_id + role_definition_name = "Storage Blob Data Contributor" + principal_id = module.jenkins_controller_vm.virtual_machine_identity_principal_id +} + +# Grant the VM read-only access to the Azure Key Vault Secret containing the +# binary cache signing key. 
+resource "azurerm_key_vault_access_policy" "binary_cache_signing_key_jenkins_controller" { + key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + tenant_id = data.azurerm_client_config.current.tenant_id + object_id = module.jenkins_controller_vm.virtual_machine_identity_principal_id + + secret_permissions = [ + "Get", + ] +} From 4f6769afd4e676f380a69f1b90a1c6126a48ab7d Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 16:21:33 +0200 Subject: [PATCH 51/88] terraform/jenkins: ensure nar/ exists Signed-off-by: Florian Klink --- terraform/jenkins/binary_cache_storage.tf | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/terraform/jenkins/binary_cache_storage.tf b/terraform/jenkins/binary_cache_storage.tf index 2c490273..dae9706f 100644 --- a/terraform/jenkins/binary_cache_storage.tf +++ b/terraform/jenkins/binary_cache_storage.tf @@ -17,3 +17,14 @@ resource "azurerm_storage_container" "binary_cache_1" { storage_account_name = azurerm_storage_account.binary_cache.name container_access_type = "private" } + +# Create a file inside the nar/ subdir. +# It seems rclone doesn't create the parent directory and fails to upload the +# first NAR otherwise. +resource "azurerm_storage_blob" "nar_keep" { + name = "nar/.keep" + storage_account_name = azurerm_storage_account.binary_cache.name + storage_container_name = azurerm_storage_container.binary_cache_1.name + type = "Block" + source_content = "" +} From 3d5ac0d397aa035643ccea61734abd90106b6bee Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 16:41:16 +0200 Subject: [PATCH 52/88] terraform/jenkins: drop user ssh on builders There's no need for any user to ssh into builders, this can be dropped. Signed-off-by: Florian Klink --- terraform/jenkins/builder.tf | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf index 91041fcd..588b9399 100644 --- a/terraform/jenkins/builder.tf +++ b/terraform/jenkins/builder.tf @@ -34,19 +34,12 @@ module "builder_vm" { virtual_machine_source_image = module.builder_image.image_id virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ - users = concat([ - # TODO: drop once this is known to work. - for user in toset(["bmg", "flokli", "hrosten"]) : { - name = user - sudo = "ALL=(ALL) NOPASSWD:ALL" - ssh_authorized_keys = local.ssh_keys[user] - } - ], [{ - name = "remote-build" - ssh_authorized_keys = [ - tls_private_key.ed25519_remote_build.public_key_openssh - ] - }]) + users = [{ + name = "remote-build" + ssh_authorized_keys = [ + tls_private_key.ed25519_remote_build.public_key_openssh + ] + }] write_files = [ { content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}", From 8ef0c76e482b6d5ad136c5a132e57d9a2214ccf9 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 16:41:59 +0200 Subject: [PATCH 53/88] jenkins-controller: populate known_hosts The consumes a list of IPs to ssh-keycan once, on startup. In the future, we might want to add support for dynamic discovery, as additional (longer-lived) static hosts. 
Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 25 ++++++++++++++++++++++ terraform/jenkins/jenkins_controller.tf | 5 +++++ 2 files changed, 30 insertions(+) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 1410f973..3df5632c 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -77,6 +77,31 @@ in { ''; }; + # populate-known-hosts populates /root/.ssh/known_hosts with all hosts in the + # builder subnet. + systemd.services.populate-known-hosts = { + after = ["network.target"]; + before = ["nix-daemon.service"]; + requires = ["network.target"]; + wantedBy = [ + # nix-daemon is socket-activated, and having it here should be sufficient + # to fetch the keys whenever a jenkins job connects to the daemon first. + # This means this service will effectively get socket-activated on the + # first nix-daemon connection. + "nix-daemon.service" + ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + Restart = "on-failure"; + }; + script = '' + umask 077 + mkdir -p /root/.ssh + ${pkgs.openssh}/bin/ssh-keyscan -f /var/lib/builder-keyscan/scanlist -v -t ed25519 > /root/.ssh/known_hosts + ''; + }; + # Define a fetch-binary-cache-signing-key unit populating # /etc/secrets/nix-signing-key from Azure Key Vault. # Make it before and requiredBy nix-daemon.service. diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 3aeb31b6..c5072154 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -60,6 +60,11 @@ module "jenkins_controller_vm" { for ip in toset(module.builder_vm[*].virtual_machine_private_ip_address) : "ssh://remote-build@${ip} x86_64-linux /etc/secrets/remote-build-ssh-key 10 10 kvm,big-parallel - -" ]), "path" = "/etc/nix/machines" + }, + # Render /var/lib/builder-keyscan/scanlist, so known_hosts can be populated. + { + content = join("\n", toset(module.builder_vm[*].virtual_machine_private_ip_address)) + "path" = "/var/lib/builder-keyscan/scanlist" } ] })]) From 7d5d4e072f4b9b13cd18c9ef537e1075578ec63d Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Tue, 19 Dec 2023 17:48:44 +0200 Subject: [PATCH 54/88] jenkins-controller: move jenkins itself to port 8081 Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 3df5632c..ff72f952 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -37,7 +37,7 @@ in { services.jenkins = { enable = true; listenAddress = "localhost"; - port = 8080; + port = 8081; withCLI = true; }; From 56289f07916fc71f9fba68bfaf2c69fe7b47c2d3 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 20 Dec 2023 17:16:18 +0200 Subject: [PATCH 55/88] hosts/jenkins-controller: document url params Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index ff72f952..3db8c7f3 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -7,6 +7,11 @@ lib, ... }: let + # whenever a build is done, upload it to the blob storage via http (going + # through the rclone proxy). 
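+  # (The rclone proxy is the rclone-http unit defined in this file: it
+  # serves WebDAV on localhost:8080 and forwards to the binary-cache-v1
+  # storage container.)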
+ # The secret-key= URL parameter configures the store, and which signing key it + # should use while uploading, but neither the key nor its location is sent + # over HTTP. post-build-hook = pkgs.writeScript "upload" '' set -eu set -f # disable globbing From e4d17a6d9dd9a7bbffdd530423d91f257fc2cff4 Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 20 Dec 2023 17:18:27 +0200 Subject: [PATCH 56/88] hosts/jenkins-controller: inline get_secret.py Prevent the repo and nixpkgs linter from fighting each other about formatting. Signed-off-by: Florian Klink --- hosts/jenkins-controller/configuration.nix | 42 ++++++++++++++++------ hosts/jenkins-controller/get_secret.py | 24 ------------- 2 files changed, 32 insertions(+), 34 deletions(-) delete mode 100644 hosts/jenkins-controller/get_secret.py diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 3db8c7f3..a1f70455 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -20,6 +20,36 @@ echo "Uploading paths" $OUT_PATHS exec nix --extra-experimental-features nix-command copy --to 'http://localhost:8080?secret-key=/etc/secrets/nix-signing-key&compression=zstd' $OUT_PATHS ''; + + get-secret = + pkgs.writers.writePython3 "get-secret" { + libraries = with pkgs.python3.pkgs; [azure-keyvault-secrets azure-identity]; + } '' + """ + This script retrieves a secret specified in $SECRET_NAME + from an Azure Key Vault in $KEY_VAULT_NAME + and prints it to stdout. + + It uses the default Azure credential client. + """ + + from azure.keyvault.secrets import SecretClient + from azure.identity import DefaultAzureCredential + + import os + + key_vault_name = os.environ["KEY_VAULT_NAME"] + secret_name = os.environ["SECRET_NAME"] + + credential = DefaultAzureCredential() + client = SecretClient( + vault_url=f"https://{key_vault_name}.vault.azure.net", + credential=credential + ) + + s = client.get_secret(secret_name) + print(s.value) + ''; in { imports = [ ../azure-common-2.nix @@ -71,11 +101,7 @@ in { EnvironmentFile = "/var/lib/fetch-build-ssh-key/env"; Restart = "on-failure"; }; - script = let - get-secret = pkgs.writers.writePython3 "get-secret" { - libraries = with pkgs.python3.pkgs; [azure-keyvault-secrets azure-identity]; - } (builtins.readFile ./get_secret.py); - in '' + script = '' umask 077 mkdir -p /etc/secrets/ ${get-secret} > /etc/secrets/remote-build-ssh-key @@ -127,11 +153,7 @@ in { EnvironmentFile = "/var/lib/fetch-binary-cache-signing-key/env"; Restart = "on-failure"; }; - script = let - get-secret = pkgs.writers.writePython3 "get-secret" { - libraries = with pkgs.python3.pkgs; [azure-keyvault-secrets azure-identity]; - } (builtins.readFile ./get_secret.py); - in '' + script = '' umask 077 mkdir -p /etc/secrets/ ${get-secret} > /etc/secrets/nix-signing-key diff --git a/hosts/jenkins-controller/get_secret.py b/hosts/jenkins-controller/get_secret.py deleted file mode 100644 index ba59ea60..00000000 --- a/hosts/jenkins-controller/get_secret.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -This script retrieves a secret specified in $SECRET_NAME -from an Azure Key Vault in $KEY_VAULT_NAME -and prints it to stdout. - -It uses the default Azure credential client. 
-""" - -from azure.keyvault.secrets import SecretClient -from azure.identity import DefaultAzureCredential - -import os - -key_vault_name = os.environ["KEY_VAULT_NAME"] -secret_name = os.environ["SECRET_NAME"] - -credential = DefaultAzureCredential() -client = SecretClient( - vault_url=f"https://{key_vault_name}.vault.azure.net", - credential=credential -) - -s = client.get_secret(secret_name) -print(s.value) From 6ab24d6adbc0ae2fc91a68f8617002faed82e93c Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Wed, 20 Dec 2023 18:43:26 +0200 Subject: [PATCH 57/88] terraform/jenkins: add README This describes the current concepts and components in this PR with more prose. It also describes some of the known issues / compromises. --- terraform/jenkins/README.md | 185 ++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 terraform/jenkins/README.md diff --git a/terraform/jenkins/README.md b/terraform/jenkins/README.md new file mode 100644 index 00000000..b2794baf --- /dev/null +++ b/terraform/jenkins/README.md @@ -0,0 +1,185 @@ + + +# terraform/jenkins + +This directory contains the root terraform module describing the image-based CI +setup in Azure. + +The Azure Setup uses: + + - Azure Blob Storage for the Nix binary cache + - Azure Key Vault to retrieve (two) secrets onto the jenkins-controller VM + - the local scratch disk as an overlay mount to /nix/store + - `rclone serve {http,webdav}` as a proxy/translator from azure blob storage to + plain http requests + - cloud-init for environment-specific configuration + +## Image-based builds +The setup uses Nix to build disk images, uploads them to Azure, and then boots +virtual machines off of them. + +Images are considered "appliance images", meant the Nix code describing their +configuration describes the exact same purpose of the machine (no two-staged +deployment process, the machine does the thing it's supposed to do after +bootup), allowing to remove the need for e.g. ssh access as much as possible. + +Machines are considered ephemeral, every change in the appliance image / nixos +configuration causes a new image to be built, and a new VM to be booted with +that new image. + +State that needs to be kept persistent for longer needs to be saved by attaching +managed data volumes, and mounting them to the state directories of the specific +service. + +### Environment-agnostic +Images are supposed to be *environment-agnostic*, allowing multiple deployments +to share the same image / Nix code to build it. + +Environment-specific configuration is injected into the machine on bootup via +cloud-init, which writes things like domain names, allowed ssh keys or bucket +names to text files, which are read in by systemd during later stages in +startup. + +### Platform-agnostic +The images are environment-agnostic, but not cloud-provider/platform-agnostic. + +Azure has a different block storage provider than other clouds, different secret +handling, and requires different agents and VM configuration. + +However, the setup itself is meant to be cloud-provider agnostic, allowing the +different components and concepts to be recombined differently to work on +another cloud provider, or bare metal. + + +## Components +The setup consists of the following components: + +### Jenkins Controller +The main machine that's gonna evaluate Nix code and *trigger* Nix builds. 
+
+It uses ssh to connect to the builders (it currently pulls an ssh ed25519 key
+from an Azure Key Vault on bootup, which is used for authentication to the
+builders).
+
+Once the build has happened, the results are copied back from the builders,
+signed with the signing key (currently living on-disk and pulled from the Azure
+Key Vault) and uploaded to the binary cache bucket (via a read-write `rclone
+serve` service).
+
+Only the root user, which the nix-daemon runs as, has access to the binary
+cache signing key and the ed25519 private ssh key used for remote builds.
+Even if a build would run locally, it'd only run as a `nixbld*` user.
+
+The machine also has a Jenkins service running, which is supposed to trigger
+nix builds. However, the fact that it's Jenkins is an implementation detail;
+the setup is CI-agnostic.
+
+*State*: Managed disk for Jenkins state
+
+### Builder
+The builder allows ssh login as the `remote-build` user from the
+`jenkins-controller` VM IP range. It substitutes build inputs from the binary
+cache (via a local read-only `rclone serve` service), builds the derivation it's
+requested to build, and sends the results back to the `jenkins-controller` VM
+over the same connection.
+
+It has no state, no public IP addresses, and no secrets.
+
+### Binary cache VM
+The binary cache VM has a read-only `rclone serve` service deployed, and exposes
+a subset of these paths (essentially, without a listing) publicly over HTTPS.
+
+For this, caddy is deployed as a reverse proxy, obtaining an LE certificate via
+the TLS-ALPN-01 challenge, so port 80 can stay closed.
+
+*State*: Managed disk for caddy certificates and LE account data
+
+## Future Work
+
+This tracks some known issues / compromises in the current design, and describes
+possible ways to solve them.
+
+### Configurable `trusted-public-keys`
+An annoyance in the current image process: it's currently not possible to
+(re-)configure the list of trusted public keys with something like cloud-init,
+as it's baked into the /etc/nix/nix.conf that's generated into the image
+filesystem.
+
+This causes our environment-agnostic images to still be specific to a set of
+public keys.
+
+We can probably fix this by extending `nix.conf` on bootup, and pointing
+`nix-daemon.service` to the extended config file, as sketched below.
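+
+A rough sketch of that approach (untested; the paths are illustrative, but
+`NIX_CONF_DIR` is honored by Nix as the location of `nix.conf`):
+
+```bash
+# run before nix-daemon starts, e.g. from an ExecStartPre:
+mkdir -p /run/nix-conf
+# merge the baked-in config with keys written by cloud-init on bootup
+cat /etc/nix/nix.conf /var/lib/nix-extra-conf/trusted-keys.conf \
+  > /run/nix-conf/nix.conf
+# point the daemon at the merged config directory
+export NIX_CONF_DIR=/run/nix-conf
+```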
+
+### Jenkins configuration and authentication
+Currently, Jenkins is configured purely imperatively, using its state volume for
+pipeline config. It also has no user setup (yet), but logs an admin password
+that's supposed to be used for login.
+
+We should configure some Jenkins pipelines, probably via cloud-init (so we don't
+need to bake new images all the time), configure SSO for login and properly
+expose this via a domain and HTTPS.
+
+### More dynamic nix builder registration
+Nix reads the list of available builders from `/etc/nix/machines`.
+
+This file is assembled with terraform, creating a strong coupling between this
+file and all builder machines.
+
+It is written to disk by cloud-init once on startup, and the same list of
+builders is scanned for ssh host keys once on bootup.
+
+This makes registering new builders quite a chore:
+
+ - It requires an update of the cloud-init userdata and VM recreation to update
+   that list (or manual tinkering over ssh), stopping all builds.
+ - It (currently) doesn't allow explicitly specifying host keys. While we don't
+   know the host keys until the target machine has booted up, we might know
+   them for longer-running builders.
+ - Redeploying a builder causes its host key to change, requiring the known
+   host keys to be updated.
+ - Managing ssh private keys in general is annoying, and there's little reason
+   to allow incoming ssh.
+
+There should be a more dynamic "agent-based" registration process.
+
+Ideally, the builders could register themselves with the controller, advertise
+their capabilities (number of cores, architectures), and keep the connection
+alive.
+
+`/etc/nix/machines` on the controller could then provide a "live view" into
+the currently connected builders, and give a unix domain socket (using a
+`unix://` URL) that'll connect to a `nix daemon --stdio` on the other side of
+the connection.
+
+Authentication TBD. At least in Azure, this could use Machine Identity.
+
+### Offload signing
+We have the Nix binary cache signing key as a literal file (only accessible to
+root), and let `nix copy` take care of creating signatures with it.
+
+Obviously this is very bad, as leaving the private key on the host makes it at
+least possible to steal the private key.
+
+It seems Azure Key Vault supports dealing with ed25519 key material and doing
+signatures, rather than keeping the key as a file on disk.
+
+A much cleaner solution would be to allow offloading the signing mechanism to
+Azure Key Vault, or "external signers" in general.
+
+In terms of complexity, there's already
+[go-nix](https://github.com/nix-community/go-nix) (Go) and
+[nix-compat](https://cs.tvl.fyi/depot/-/tree/tvix/nix-compat/src) (Rust) client
+libraries available that can deal with NARs, NARInfo files, signatures and
+fingerprints, so we could replace the post-build-hook command with another
+version that produces the same files, but offloads the signing operation to an
+external signer.
+
+We could also integrate with https://github.com/NixOS/nix/pull/9076, which
+defines a "Nix remote signing API", and provide a "server counterpart"
+translating these requests into communication with Azure Key Vault.

From 60d1ffd9f93c3a6af3f091e34afa531f978d90b3 Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Mon, 15 Jan 2024 16:34:01 +0200
Subject: [PATCH 58/88] binary-cache: hardcode caddy domain

Signed-off-by: Henri Rosten
---
 hosts/binary-cache/configuration.nix | 4 +---
 terraform/jenkins/binary_cache.tf    | 4 ----
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix
index dc28e679..6d5f3c75 100644
--- a/hosts/binary-cache/configuration.nix
+++ b/hosts/binary-cache/configuration.nix
@@ -66,7 +66,7 @@
       }

       # Proxy a subset of requests to rclone.
-      https://{$SITE_ADDRESS} {
+      https://ghaf-binary-cache.northeurope.cloudapp.azure.com {
         handle /nix-cache-info {
           reverse_proxy unix///run/rclone-http/socket
         }
@@ -90,8 +90,6 @@
     ""
     "${pkgs.caddy}/bin/caddy run --environ --config ${config.services.caddy.configFile}/Caddyfile"
   ];
-  systemd.services.caddy.serviceConfig.EnvironmentFile = "/run/caddy.env";
-
   # Wait for cloud-init mounting before we start caddy.
systemd.services.caddy.after = ["cloud-init.service"]; systemd.services.caddy.requires = ["cloud-init.service"]; diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index 07246b89..25e46015 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -41,10 +41,6 @@ module "binary_cache_vm" { content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}", "path" = "/var/lib/rclone-http/env" }, - { - content = "SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com", - "path" = "/run/caddy.env" - }, ], })]) From edbd397f9f87d813e8db366166544cc067739d23 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 16 Jan 2024 07:59:35 +0200 Subject: [PATCH 59/88] azure-common: install system packages Signed-off-by: Henri Rosten --- hosts/azure-common-2.nix | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix index 41ba9dfb..ff30ff86 100644 --- a/hosts/azure-common-2.nix +++ b/hosts/azure-common-2.nix @@ -4,7 +4,11 @@ # # Profile to import for Azure VMs. Imports azure-common.nix from nixpkgs, # and configures cloud-init. -{modulesPath, ...}: { +{ + modulesPath, + pkgs, + ... +}: { imports = [ "${modulesPath}/virtualisation/azure-config.nix" ]; @@ -21,4 +25,11 @@ # but the way nixpkgs configures cloud-init prevents it from picking up DNS # settings from elsewhere. # services.resolved.enable = false; + + # List packages installed in system profile + environment.systemPackages = with pkgs; [ + git + vim + htop + ]; } From 2fbbe2198f0957ba8315e24f26a2a6ed48747545 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 16 Jan 2024 08:01:17 +0200 Subject: [PATCH 60/88] azure-common: enable flakes and nix Signed-off-by: Henri Rosten --- hosts/azure-common-2.nix | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix index ff30ff86..73a4e520 100644 --- a/hosts/azure-common-2.nix +++ b/hosts/azure-common-2.nix @@ -13,6 +13,13 @@ "${modulesPath}/virtualisation/azure-config.nix" ]; + nix = { + settings = { + # Enable flakes and 'nix' command + experimental-features = "nix-command flakes"; + }; + }; + # enable cloud-init, so instance metadata is set accordingly and we can use # cloud-config for ssh key management. 
services.cloud-init.enable = true; From 8ad2f5398bdbd405219ba0cf94e4ea59761009fb Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 16 Jan 2024 08:07:58 +0200 Subject: [PATCH 61/88] builder: downgrade and reduce VMs Signed-off-by: Henri Rosten --- terraform/jenkins/builder.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf index 588b9399..d3de6ffe 100644 --- a/terraform/jenkins/builder.tf +++ b/terraform/jenkins/builder.tf @@ -18,7 +18,7 @@ module "builder_image" { } locals { - num_builders = 2 + num_builders = 1 } module "builder_vm" { @@ -30,7 +30,7 @@ module "builder_vm" { location = azurerm_resource_group.default.location virtual_machine_name = "ghaf-builder-${count.index}" - virtual_machine_size = "Standard_D16_v3" + virtual_machine_size = "Standard_D4_v3" virtual_machine_source_image = module.builder_image.image_id virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ From 9dc1f3f1fbd4e879f2346eb5795246a16d994aa2 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 16 Jan 2024 08:49:09 +0200 Subject: [PATCH 62/88] jenkins-controller: beef up the VM Signed-off-by: Henri Rosten --- terraform/jenkins/jenkins_controller.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index c5072154..c9e81c41 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -26,7 +26,7 @@ module "jenkins_controller_vm" { location = azurerm_resource_group.default.location virtual_machine_name = "ghaf-jenkins-controller" - virtual_machine_size = "Standard_D1_v2" + virtual_machine_size = "Standard_D2_v2" virtual_machine_source_image = module.jenkins_controller_image.image_id virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ From 1d7e6bb9ba6a216922d27dcbfb48d4770d38de29 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Sat, 20 Jan 2024 10:04:59 +0200 Subject: [PATCH 63/88] playground: initial version Support using terraform workspaces to setup private development environments on Azure. Adds `playground/terraform-playground.sh` to facilitate the developer usage of such workspaces. Adds `playground/test-infra.tf` to demonstrate the usage of workspaces in a test infrastructure. Signed-off-by: Henri Rosten --- terraform/playground/terraform-playground.sh | 172 +++++++++++++++++++ terraform/playground/test-infra.tf | 154 +++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100755 terraform/playground/terraform-playground.sh create mode 100644 terraform/playground/test-infra.tf diff --git a/terraform/playground/terraform-playground.sh b/terraform/playground/terraform-playground.sh new file mode 100755 index 00000000..765d8a77 --- /dev/null +++ b/terraform/playground/terraform-playground.sh @@ -0,0 +1,172 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +set -e # exit immediately if a command fails +set -u # treat unset variables as an error and exit +set -o pipefail # exit if any pipeline command fails + +################################################################################ + +MYNAME=$(basename "$0") +usage () { + echo "Usage: $MYNAME [activate|destroy|list]" + echo "" + echo "This script is a thin wrapper around terraform workspaces to enable private" + echo "development environment setup for testing Azure infra changes." 
+ echo "" + echo "COMMANDS" + echo " activate Activate private infra development environment" + echo " destroy Destroy private infra development environment" + echo " list List current terraform workspaces" + echo "" + echo "" + echo " EXAMPLE:" + echo " ./$MYNAME activate" + echo "" + echo " Activate and - unless already created - create a new terraform workspace" + echo " to allow testing the infra setup in a private development environment." + echo "" + echo "" + echo " EXAMPLE:" + echo " ./$MYNAME destroy" + echo " " + echo " Deactivate and destroy the private development infra that was previously" + echo " created with the 'activate' command. This command deletes all the infra" + echo " resources and removes the terraform workspace." + echo "" +} + +################################################################################ + +exit_unless_command_exists () { + if ! command -v "$1" &> /dev/null; then + echo "Error: command '$1' is not installed" >&2 + exit 1 + fi +} + +generate_azure_private_workspace_name () { + # Generate workspace name based on azure signed-in-user: + # - .userPrincipalName returns the signed-in azure username + # - cut removes everything up until the first '@' + # - sed keeps only letter and number characters + # - final cut keeps at most 20 characters + # Thus, given a signed-in user 'foo.bar@baz.com', the workspace name + # becomes 'foobar'. + # Below command errors out with the azure error message if the azure user + # is not signed-in. + WORKSPACE=$(az ad signed-in-user show | jq -cr .userPrincipalName | cut -d'@' -f1 | sed 's/[^a-zA-Z0-9]//g' | cut -c 1-20) + # Check WORKSPACE is non-empty and not 'default' + if [ -z "$WORKSPACE" ] || [ "$WORKSPACE" = "default" ]; then + echo "Error: invalid workspace name: '$WORKSPACE'" + exit 1 + fi +} + +import_sigkey () { + # This function is a hack to automatically generate the binary cache + # signing key for the (ghaf-infra) private dev environment. + + # No need to import anything if the below key isn't defined in the infra + if ! grep -q secret_resource.binary_cache_signing_key -- *.tf; then + return + fi + + # Skip import if signing key is imported already + if terraform state list | grep -q secret_resource.binary_cache_signing_key ; then + return + fi + + # Generate and import the key + nix-store --generate-binary-cache-key "$WORKSPACE" sigkey-secret.tmp sigkey-public.tmp + terraform import secret_resource.binary_cache_signing_key "$(< ./sigkey-secret.tmp)" +} + +delete_keyvault () { + # This function is a hack to automatically delete keyvaults + # from the (ghaf-infra) private dev environment. + set +e + if grep -qP "sig-.*name_postfix" -- *.tf; then + az keyvault delete --name "sig-$WORKSPACE" 2>/dev/null + az keyvault purge --name "sig-$WORKSPACE" 2>/dev/null + fi + if grep -qP "ssh-.*name_postfix" -- *.tf; then + az keyvault delete --name "ssh-$WORKSPACE" 2>/dev/null + az keyvault purge --name "ssh-$WORKSPACE" 2>/dev/null + fi + set -e +} + +activate () { + echo "[+] Activating workspace: '$WORKSPACE'" + if terraform workspace list | grep -q "$WORKSPACE"; then + terraform workspace select "$WORKSPACE" + else + terraform workspace new "$WORKSPACE" + terraform workspace select "$WORKSPACE" + fi + import_sigkey + echo "[+] Done, use terraform [validate|plan|apply] to work with your dev infra" +} + +destroy () { + if ! 
terraform workspace list | grep -q "$WORKSPACE"; then + echo "[+] Devenv workspace '$WORKSPACE' does not exist, nothing to destroy" + exit 0 + fi + echo "[+] Destroying workspace: '$WORKSPACE'" + terraform workspace select "$WORKSPACE" + delete_keyvault + terraform apply -destroy -auto-approve + terraform workspace select default +} + +list () { + echo "Terraform workspaces:" + terraform workspace list +} + +################################################################################ + +main () { + if [ $# -ne 1 ]; then + usage + exit 0 + fi + if [ "$1" != "activate" ] && [ "$1" != "destroy" ] && [ "$1" != "list" ]; then + echo "Error: invalid command: '$1'" + usage + exit 1 + fi + + exit_unless_command_exists az + exit_unless_command_exists terraform + exit_unless_command_exists nix-store + exit_unless_command_exists jq + exit_unless_command_exists sed + exit_unless_command_exists cut + + # Assigns $WORKSPACE variable + generate_azure_private_workspace_name + + # It is safe to run terraform init multiple times + terraform init &> /dev/null + + # Run the given command + if [ "$1" == "activate" ]; then + activate + fi + if [ "$1" == "destroy" ]; then + destroy + fi + if [ "$1" == "list" ]; then + list + fi +} + +main "$@" + +################################################################################ diff --git a/terraform/playground/test-infra.tf b/terraform/playground/test-infra.tf new file mode 100644 index 00000000..e5f67a94 --- /dev/null +++ b/terraform/playground/test-infra.tf @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + } + # Backend for storing tfstate (see ./azure-storage) + backend "azurerm" { + resource_group_name = "ghaf-infra-storage" + storage_account_name = "ghafinfrastatestorage" + container_name = "ghaf-infra-tfstate-container" + key = "ghaf-infra-playground.tfstate" + } +} +provider "azurerm" { + features {} +} +# Resource group +resource "azurerm_resource_group" "playground_rg" { + name = "ghaf-infra-playground-${terraform.workspace}" + location = "northeurope" +} +# Virtual Network +resource "azurerm_virtual_network" "ghaf_infra_tf_vnet" { + name = "ghaf-infra-tf-vnet" + address_space = ["10.0.0.0/16"] + location = azurerm_resource_group.playground_rg.location + resource_group_name = azurerm_resource_group.playground_rg.name +} +# Subnet +resource "azurerm_subnet" "playground_subnet" { + name = "ghaf-infra-tf-subnet" + resource_group_name = azurerm_resource_group.playground_rg.name + virtual_network_name = azurerm_virtual_network.ghaf_infra_tf_vnet.name + address_prefixes = ["10.0.5.0/24"] +} +# read ssh-keys.yaml into local.ssh_keys +locals { + ssh_keys = yamldecode(file("../../ssh-keys.yaml")) +} + +################################################################################ + +# Image storage + +# Create a random string +resource "random_string" "imgstr" { + length = "12" + special = "false" + upper = false +} + +resource "azurerm_storage_account" "vm_images" { + name = "nixosimages${random_string.imgstr.result}" + resource_group_name = azurerm_resource_group.playground_rg.name + location = azurerm_resource_group.playground_rg.location + account_tier = "Standard" + account_replication_type = "LRS" + allow_nested_items_to_be_public = false +} + +resource "azurerm_storage_container" "vm_images" { + name = "ghaf-test-vm-images" + storage_account_name = 
azurerm_storage_account.vm_images.name + container_access_type = "private" +} + +################################################################################ + +# VM + +module "test_image" { + source = "../../tf-modules/azurerm-nix-vm-image" + + nix_attrpath = "outputs.nixosConfigurations.builder.config.system.build.azureImage" + nix_entrypoint = "${path.module}/../.." + + name = "playground_vm_img" + resource_group_name = azurerm_resource_group.playground_rg.name + location = azurerm_resource_group.playground_rg.location + + storage_account_name = azurerm_storage_account.vm_images.name + storage_container_name = azurerm_storage_container.vm_images.name +} + +locals { + num_vms = 1 +} + +module "test_vm" { + source = "../../tf-modules/azurerm-linux-vm" + + count = local.num_vms + + resource_group_name = azurerm_resource_group.playground_rg.name + location = azurerm_resource_group.playground_rg.location + + virtual_machine_name = "ghaf-playground-${count.index}-${terraform.workspace}" + # Demonstrate a way to use different configurations in different workspaces. + # Here, we define the following image sizes: + # - Use 'Standard_D2_v2' if the workspace is 'default' (2 vCPUs, 7 GiB RAM) + # - Use 'Standard_D1_v2' if the workspace is anything but 'default' (1 vCPU, 3.5 GiB RAM) + # The idea is based on the following article: + # https://blog.gruntwork.io/how-to-manage-multiple-environments-with-terraform-using-workspaces-98680d89a03e#2bc6 + # + # Full list of Azure image sizes are available in: + # https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/#pricing + virtual_machine_size = terraform.workspace == "default" ? "Standard_D2_v2" : "Standard_D1_v2" + virtual_machine_source_image = module.test_image.image_id + + virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ + users = [ + { + name = "hrosten" + sudo = "ALL=(ALL) NOPASSWD:ALL" + ssh_authorized_keys = local.ssh_keys["hrosten"] + }, + ] + })]) + + allocate_public_ip = true + subnet_id = azurerm_subnet.playground_subnet.id +} + +# Allow inbound SSH +resource "azurerm_network_security_group" "test_vm" { + count = local.num_vms + name = "test-vm-${count.index}" + resource_group_name = azurerm_resource_group.playground_rg.name + location = azurerm_resource_group.playground_rg.location + security_rule { + name = "AllowSSH" + priority = 400 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_ranges = [22] + source_address_prefix = "*" + destination_address_prefix = "*" + } +} +resource "azurerm_network_interface_security_group_association" "test_vm" { + count = local.num_vms + network_interface_id = module.test_vm[count.index].virtual_machine_network_interface_id + network_security_group_id = azurerm_network_security_group.test_vm[count.index].id +} + +################################################################################ + From d996ae0c63b45644c22d0503d412635d6b0bc564 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Sat, 20 Jan 2024 10:11:01 +0200 Subject: [PATCH 64/88] ghaf-infra-jenkins: support workspaces Add terraform workspace support for ghaf-infra-jenkins to make it possible to use the playground tooling to support setting-up temporary development environments of the ghaf-infra-jenkins infra. 
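For illustration, with a workspace named "henrirosten" the naming scheme
in this change yields resources like:

    resource group: ghaf-infra-jenkins-henrirosten
    builder VM:     ghaf-builder-0-henrirosten
    key vaults:     sig-henrirosten, ssh-henrirosten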
Signed-off-by: Henri Rosten --- terraform/jenkins/binary_cache.tf | 6 ++++-- terraform/jenkins/binary_cache_signing.tf | 7 +++++-- terraform/jenkins/binary_cache_storage.tf | 2 +- terraform/jenkins/builder.tf | 6 ++++-- terraform/jenkins/image_storage.tf | 2 +- terraform/jenkins/jenkins_controller.tf | 2 +- terraform/jenkins/main.tf | 8 ++++++-- terraform/jenkins/remote_build_ssh.tf | 7 +++++-- 8 files changed, 27 insertions(+), 13 deletions(-) diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf index 25e46015..f166b770 100644 --- a/terraform/jenkins/binary_cache.tf +++ b/terraform/jenkins/binary_cache.tf @@ -23,8 +23,10 @@ module "binary_cache_vm" { resource_group_name = azurerm_resource_group.default.name location = azurerm_resource_group.default.location - virtual_machine_name = "ghaf-binary-cache" - virtual_machine_size = "Standard_D1_v2" + virtual_machine_name = "ghaf-binary-cache-${local.name_postfix}" + # Use 'Standard_D2_v2' if the workspace is 'default' (2 vCPUs, 7 GiB RAM) + # Use 'Standard_D1_v2' if the workspace is anything but 'default' (1 vCPU, 3.5 GiB RAM) + virtual_machine_size = terraform.workspace == "default" ? "Standard_D2_v2" : "Standard_D1_v2" virtual_machine_source_image = module.binary_cache_image.image_id virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ diff --git a/terraform/jenkins/binary_cache_signing.tf b/terraform/jenkins/binary_cache_signing.tf index 87c64e56..0016dd49 100644 --- a/terraform/jenkins/binary_cache_signing.tf +++ b/terraform/jenkins/binary_cache_signing.tf @@ -6,14 +6,17 @@ # terraform import secret_resource.binary_cache_signing_key "$(< ./secret-key)" resource "secret_resource" "binary_cache_signing_key" { lifecycle { - prevent_destroy = true + # To support automatically generating and destroying temp signing keys + # with playground/terraform-az-dev.sh, `prevent_destroy` needs to be set + # to `false`: + prevent_destroy = false } } # Create an Azure key vault. resource "azurerm_key_vault" "binary_cache_signing_key" { # this must be globally unique - name = "ghaf-binarycache-signing" + name = "sig-${local.name_postfix}" location = azurerm_resource_group.default.location resource_group_name = azurerm_resource_group.default.name sku_name = "standard" diff --git a/terraform/jenkins/binary_cache_storage.tf b/terraform/jenkins/binary_cache_storage.tf index dae9706f..52797c9a 100644 --- a/terraform/jenkins/binary_cache_storage.tf +++ b/terraform/jenkins/binary_cache_storage.tf @@ -4,7 +4,7 @@ # Create the storage account and storage container resource "azurerm_storage_account" "binary_cache" { - name = "ghafbinarycache" + name = "bche${local.name_postfix}" resource_group_name = azurerm_resource_group.default.name # TODO: separate resource group? 
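+  # NOTE: Azure storage account names are limited to 3-24 lowercase
+  # alphanumeric characters and must be globally unique, hence the short
+  # "bche" prefix above.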
location = azurerm_resource_group.default.location account_tier = "Standard" diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf index d3de6ffe..f9a6d052 100644 --- a/terraform/jenkins/builder.tf +++ b/terraform/jenkins/builder.tf @@ -29,8 +29,10 @@ module "builder_vm" { resource_group_name = azurerm_resource_group.default.name location = azurerm_resource_group.default.location - virtual_machine_name = "ghaf-builder-${count.index}" - virtual_machine_size = "Standard_D4_v3" + virtual_machine_name = "ghaf-builder-${count.index}-${local.name_postfix}" + # Use 'Standard_D4_v3' if the workspace is 'default' (4 vCPUs, 16 GiB RAM) + # Use 'Standard_D2_v3' if the workspace is anything but 'default' (2 vCPU, 8 GiB RAM) + virtual_machine_size = terraform.workspace == "default" ? "Standard_D4_v3" : "Standard_D2_v3" virtual_machine_source_image = module.builder_image.image_id virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ diff --git a/terraform/jenkins/image_storage.tf b/terraform/jenkins/image_storage.tf index c5cd7c64..ffad74b1 100644 --- a/terraform/jenkins/image_storage.tf +++ b/terraform/jenkins/image_storage.tf @@ -5,7 +5,7 @@ # Storage account and storage container used to store VM images resource "azurerm_storage_account" "vm_images" { - name = "ghafinfravmimages" + name = "vimg${local.name_postfix}" resource_group_name = azurerm_resource_group.default.name location = azurerm_resource_group.default.location account_tier = "Standard" diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index c9e81c41..06054ee0 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -25,7 +25,7 @@ module "jenkins_controller_vm" { resource_group_name = azurerm_resource_group.default.name location = azurerm_resource_group.default.location - virtual_machine_name = "ghaf-jenkins-controller" + virtual_machine_name = "ghaf-jenkins-controller-${local.name_postfix}" virtual_machine_size = "Standard_D2_v2" virtual_machine_source_image = module.jenkins_controller_image.image_id diff --git a/terraform/jenkins/main.tf b/terraform/jenkins/main.tf index 130a5107..e11134f7 100644 --- a/terraform/jenkins/main.tf +++ b/terraform/jenkins/main.tf @@ -17,14 +17,18 @@ terraform { } } -# read ssh-keys.yaml into local.ssh_keys locals { + # read ssh-keys.yaml into local.ssh_keys ssh_keys = yamldecode(file("../../ssh-keys.yaml")) + # postfix used in the resource group name + rg_postfix = terraform.workspace == "default" ? "prod" : terraform.workspace + # postfix used in various resource names + name_postfix = terraform.workspace == "default" ? "ghafprod" : terraform.workspace } # The resource group everything in this terraform module lives in resource "azurerm_resource_group" "default" { - name = "ghaf-infra-jenkins" + name = "ghaf-infra-jenkins-${local.rg_postfix}" location = "northeurope" } diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf index a8acaae7..d9106134 100644 --- a/terraform/jenkins/remote_build_ssh.tf +++ b/terraform/jenkins/remote_build_ssh.tf @@ -9,8 +9,11 @@ resource "tls_private_key" "ed25519_remote_build" { } # Dump the ed25519 public key to disk +# TODO: why do we need to dump the builder public key to disk and store it +# in git? 
resource "local_file" "ed25519_remote_build_pubkey" { - filename = "${path.module}/id_ed25519_remote_build.pub" + # For non-default workspaces, add extension .tmp to the filename + filename = terraform.workspace == "default" ? "${path.module}/id_ed25519_remote_build.pub" : "${path.module}/id_ed25519_remote_build.pub.tmp" file_permission = "0644" content = tls_private_key.ed25519_remote_build.public_key_openssh } @@ -18,7 +21,7 @@ resource "local_file" "ed25519_remote_build_pubkey" { # Create an Azure key vault. resource "azurerm_key_vault" "ssh_remote_build" { # this must be globally unique - name = "ghaf-ssh-remote-build" + name = "ssh-${local.name_postfix}" location = azurerm_resource_group.default.location resource_group_name = azurerm_resource_group.default.name sku_name = "standard" From 47ad617041323f51bfbb46e183be9f7af74e9776 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Mon, 22 Jan 2024 11:09:55 +0200 Subject: [PATCH 65/88] playground: add documentation Signed-off-by: Henri Rosten --- terraform/playground/README.md | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 terraform/playground/README.md diff --git a/terraform/playground/README.md b/terraform/playground/README.md new file mode 100644 index 00000000..c70b98a2 --- /dev/null +++ b/terraform/playground/README.md @@ -0,0 +1,93 @@ + + +# Terraform Playground + +This project uses terraform to automate the creation of infrastructure resources. +To support infrastructure development in isolated development environments, we use [terraform workspaces](https://developer.hashicorp.com/terraform/cli/workspaces). + +The tooling under this `playground` directory is provided to facilitate the usage of terraform workspaces in setting-up a distinct copy of the target infrastructure to test a set of changes before modifying shared (dev/prod) infrastructure. + +This page documents the usage of `terraform-playground.sh` to help facilitate the usage of private development environments for testing infra changes. + +**Note**: the environments created with `terraform-playground.sh` are supposed to be temporary and short-lived. Each active (non-destroyed) playground instance will cost some real money, so be sure to destroy the playground instances as soon as they are no longer needed. It's easy to spin-up a new playground environment using `terraform-playground.sh`, so there's no need to keep them alive '*just in case*'. + +## Usage + +If you still don't have nix package manager on your local host, install it following the package manager installation instructions from https://nixos.org/download.html. + +Then, clone this repository: +```bash +$ git clone https://github.com/tiiuae/ghaf-infra.git +$ cd ghaf-infra/ +``` + +All commands in this document are executed from nix-shell inside the `terraform/jenkins` directory. + +Bootstrap nix-shell with the required dependencies: +```bash +# Start a nix-shell with required dependencies: +$ nix-shell + +# Authenticate with az login: +$ az login + +# We use the configuration under terraform/jenkins as an example: +$ cd terraform/jenkins +``` + +## Activating Playground Environment +```bash +# Activate private development environment +$ ../playground/terraform-playground.sh activate +# ... 
+[+] Done, use terraform [validate|plan|apply] to work with your dev infra
+```
+The `activate` command sets up a terraform workspace for your private development environment:
+```bash
+# List the current terraform workspaces
+$ ../playground/terraform-playground.sh list
+Terraform workspaces:
+  default
+* henrirosten      # <-- indicates active workspace
+```
+
+## Testing Infrastructure Changes
+With the private development workspace now set up, we can test infrastructure changes in a private development environment:
+```bash
+# In directory terraform/jenkins
+$ pwd
+[..]/ghaf-infra/terraform/jenkins
+
+# Check terraform configuration files format:
+$ terraform fmt -recursive
+
+# Check that the terraform configuration is valid:
+$ terraform validate
+
+# Show configuration changes:
+$ terraform plan
+
+# Deploy the infrastructure:
+$ terraform apply -auto-approve
+```
+
+Once `terraform apply` completes, the private development infrastructure is deployed.
+You can now play around in your isolated copy of the infrastructure, testing and updating the changes, making sure they work as expected before proposing them to a shared (prod/dev) environment.
+
+## Destroying Playground Environment
+Once the configuration changes have been tested, the private development environment can be destroyed:
+```bash
+# Destroy the private terraform workspace
+$ ../playground/terraform-playground.sh destroy
+```
+The above command removes all the resources that were created for the private development environment.
+
+
+## References
+- Terraform workspaces: https://developer.hashicorp.com/terraform/cli/workspaces
+- How to manage multiple environments with Terraform using workspaces: https://blog.gruntwork.io/how-to-manage-multiple-environments-with-terraform-using-workspaces-98680d89a03e

From ce14f8c0dd51d2fd20e55079461951847df0e216 Mon Sep 17 00:00:00 2001
From: Karim Mdmirajul <143718491+karim20230@users.noreply.github.com>
Date: Thu, 25 Jan 2024 11:51:07 +0200
Subject: [PATCH 66/88] terraform/playground: lower case workspace name (#61)

---
 terraform/playground/terraform-playground.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/terraform/playground/terraform-playground.sh b/terraform/playground/terraform-playground.sh
index 765d8a77..5586c768 100755
--- a/terraform/playground/terraform-playground.sh
+++ b/terraform/playground/terraform-playground.sh
@@ -54,11 +54,12 @@ generate_azure_private_workspace_name () {
     # - cut removes everything up until the first '@'
     # - sed keeps only letter and number characters
     # - final cut keeps at most 20 characters
+    # - tr converts the string to lower case
     # Thus, given a signed-in user 'foo.bar@baz.com', the workspace name
     # becomes 'foobar'.
     # Below command errors out with the azure error message if the azure user
    # is not signed-in.
- WORKSPACE=$(az ad signed-in-user show | jq -cr .userPrincipalName | cut -d'@' -f1 | sed 's/[^a-zA-Z0-9]//g' | cut -c 1-20) + WORKSPACE=$(az ad signed-in-user show | jq -cr .userPrincipalName | cut -d'@' -f1 | sed 's/[^a-zA-Z0-9]//g' | cut -c 1-20 | tr '[:upper:]' '[:lower:]') # Check WORKSPACE is non-empty and not 'default' if [ -z "$WORKSPACE" ] || [ "$WORKSPACE" = "default" ]; then echo "Error: invalid workspace name: '$WORKSPACE'" @@ -148,6 +149,7 @@ main () { exit_unless_command_exists jq exit_unless_command_exists sed exit_unless_command_exists cut + exit_unless_command_exists tr # Assigns $WORKSPACE variable generate_azure_private_workspace_name From c38736189f092c5e9a34d9b79988cf6877b2a6ca Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Wed, 24 Jan 2024 09:21:31 +0200 Subject: [PATCH 67/88] terraform/jenkins: don't dump builder public key Signed-off-by: Henri Rosten --- terraform/jenkins/id_ed25519_remote_build.pub | 1 - terraform/jenkins/remote_build_ssh.tf | 10 ---------- 2 files changed, 11 deletions(-) delete mode 100644 terraform/jenkins/id_ed25519_remote_build.pub diff --git a/terraform/jenkins/id_ed25519_remote_build.pub b/terraform/jenkins/id_ed25519_remote_build.pub deleted file mode 100644 index 2e1b54e4..00000000 --- a/terraform/jenkins/id_ed25519_remote_build.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBlHMthtFYhBK6WsZCNXeJcOUx6xeVLNAHhWI6zFofiD diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf index d9106134..992b4416 100644 --- a/terraform/jenkins/remote_build_ssh.tf +++ b/terraform/jenkins/remote_build_ssh.tf @@ -8,16 +8,6 @@ resource "tls_private_key" "ed25519_remote_build" { algorithm = "ED25519" } -# Dump the ed25519 public key to disk -# TODO: why do we need to dump the builder public key to disk and store it -# in git? -resource "local_file" "ed25519_remote_build_pubkey" { - # For non-default workspaces, add extension .tmp to the filename - filename = terraform.workspace == "default" ? "${path.module}/id_ed25519_remote_build.pub" : "${path.module}/id_ed25519_remote_build.pub.tmp" - file_permission = "0644" - content = tls_private_key.ed25519_remote_build.public_key_openssh -} - # Create an Azure key vault. resource "azurerm_key_vault" "ssh_remote_build" { # this must be globally unique From 7076d9edc3908c07e70a3586ff026d5fa06ede2c Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Thu, 25 Jan 2024 12:50:40 +0200 Subject: [PATCH 68/88] hosts: stop using scratch store Signed-off-by: Henri Rosten --- hosts/azure-common-2.nix | 3 +++ hosts/binary-cache/configuration.nix | 1 - hosts/builder/configuration.nix | 1 - hosts/jenkins-controller/configuration.nix | 1 - tf-modules/azurerm-linux-vm/virtual_machine.tf | 1 + 5 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix index 73a4e520..bcf14288 100644 --- a/hosts/azure-common-2.nix +++ b/hosts/azure-common-2.nix @@ -20,6 +20,9 @@ }; }; + # Enable azure agent + virtualisation.azure.agent.enable = true; + # enable cloud-init, so instance metadata is set accordingly and we can use # cloud-config for ssh key management. 
   services.cloud-init.enable = true;
diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix
index 6d5f3c75..cdddd204 100644
--- a/hosts/binary-cache/configuration.nix
+++ b/hosts/binary-cache/configuration.nix
@@ -10,7 +10,6 @@
 }: {
   imports = [
     ../azure-common-2.nix
-    ../azure-scratch-store-common.nix
     self.nixosModules.service-openssh
   ];
 
diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix
index 76761243..580f3228 100644
--- a/hosts/builder/configuration.nix
+++ b/hosts/builder/configuration.nix
@@ -9,7 +9,6 @@
 }: {
   imports = [
     ../azure-common-2.nix
-    ../azure-scratch-store-common.nix
     self.nixosModules.service-openssh
     self.nixosModules.service-remote-build
   ];
diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix
index a1f70455..ada0d736 100644
--- a/hosts/jenkins-controller/configuration.nix
+++ b/hosts/jenkins-controller/configuration.nix
@@ -53,7 +53,6 @@ in {
   imports = [
     ../azure-common-2.nix
-    ../azure-scratch-store-common.nix
     self.nixosModules.service-openssh
   ];
 
diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/tf-modules/azurerm-linux-vm/virtual_machine.tf
index 90209ae7..de7f7c2e 100644
--- a/tf-modules/azurerm-linux-vm/virtual_machine.tf
+++ b/tf-modules/azurerm-linux-vm/virtual_machine.tf
@@ -56,6 +56,7 @@ resource "azurerm_virtual_machine" "main" {
     caching           = "ReadWrite"
     create_option     = "FromImage"
     managed_disk_type = "Standard_LRS"
+    disk_size_gb      = "100"
   }
 
   dynamic "storage_data_disk" {

From 994eb5c82576a0f7e8fbb4f129fcdd3bf555500f Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Thu, 25 Jan 2024 12:55:26 +0200
Subject: [PATCH 69/88] azure-common-2: fix cloud-config startup

Fix cloud-config startup by adding a dependency on mnt-resource.mount

Signed-off-by: Henri Rosten
---
 hosts/azure-common-2.nix | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hosts/azure-common-2.nix b/hosts/azure-common-2.nix
index bcf14288..4c87b716 100644
--- a/hosts/azure-common-2.nix
+++ b/hosts/azure-common-2.nix
@@ -26,6 +26,8 @@
   # enable cloud-init, so instance metadata is set accordingly and we can use
   # cloud-config for ssh key management.
   services.cloud-init.enable = true;
+  systemd.services.cloud-config.after = ["mnt-resource.mount"];
+  systemd.services.cloud-config.requires = ["mnt-resource.mount"];
 
   # Use systemd-networkd for network configuration.
   services.cloud-init.network.enable = true;

From 9298f7d7c160ca4ef27068f2a2e104a2ef66dee8 Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Thu, 25 Jan 2024 13:00:16 +0200
Subject: [PATCH 70/88] Revert "binary-cache: hardcode caddy domain"

This reverts commit c083e6b43b54027da615af0ced87786b1e985c16.

---
 hosts/binary-cache/configuration.nix | 4 +++-
 terraform/jenkins/binary_cache.tf    | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hosts/binary-cache/configuration.nix b/hosts/binary-cache/configuration.nix
index cdddd204..90664861 100644
--- a/hosts/binary-cache/configuration.nix
+++ b/hosts/binary-cache/configuration.nix
@@ -65,7 +65,7 @@
       }
 
       # Proxy a subset of requests to rclone.
-      https://ghaf-binary-cache.northeurope.cloudapp.azure.com {
+      https://{$SITE_ADDRESS} {
         handle /nix-cache-info {
           reverse_proxy unix///run/rclone-http/socket
         }
@@ -89,6 +89,8 @@
     ""
     "${pkgs.caddy}/bin/caddy run --environ --config ${config.services.caddy.configFile}/Caddyfile"
   ];
+  systemd.services.caddy.serviceConfig.EnvironmentFile = "/run/caddy.env";
+
   # Wait for cloud-init mounting before we start caddy.
   systemd.services.caddy.after = ["cloud-init.service"];
   systemd.services.caddy.requires = ["cloud-init.service"];
 
diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf
index f166b770..6a530177 100644
--- a/terraform/jenkins/binary_cache.tf
+++ b/terraform/jenkins/binary_cache.tf
@@ -43,6 +43,10 @@ module "binary_cache_vm" {
         content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}",
         "path"  = "/var/lib/rclone-http/env"
       },
+      {
+        content = "SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com",
+        "path"  = "/run/caddy.env"
+      },
     ],
   })])

From f0595beca84939339b59c74e3da8f4fda53e4c02 Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Thu, 25 Jan 2024 13:07:24 +0200
Subject: [PATCH 71/88] binary-cache: configure domain with terraform

Signed-off-by: Henri Rosten
---
 terraform/jenkins/binary_cache.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terraform/jenkins/binary_cache.tf b/terraform/jenkins/binary_cache.tf
index 6a530177..b25ea36f 100644
--- a/terraform/jenkins/binary_cache.tf
+++ b/terraform/jenkins/binary_cache.tf
@@ -44,7 +44,7 @@ module "binary_cache_vm" {
         "path"  = "/var/lib/rclone-http/env"
       },
       {
-        content = "SITE_ADDRESS=ghaf-binary-cache.northeurope.cloudapp.azure.com",
+        content = "SITE_ADDRESS=ghaf-binary-cache-${local.name_postfix}.northeurope.cloudapp.azure.com",
         "path"  = "/run/caddy.env"
       },
     ],

From 0938a4ba305f461575e66b021eb91894bd8e9d5b Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Thu, 25 Jan 2024 14:19:00 +0200
Subject: [PATCH 72/88] hosts/builder: use cache.vedenemo.dev substituter

Signed-off-by: Henri Rosten
---
 hosts/builder/configuration.nix | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix
index 580f3228..e16c2258 100644
--- a/hosts/builder/configuration.nix
+++ b/hosts/builder/configuration.nix
@@ -40,11 +40,14 @@
   };
 
   # Configure Nix to use this as a substitutor, and the public key used for signing.
+  # TODO: remove cache.vedenemo.dev substituter
   nix.settings.trusted-public-keys = [
     "ghaf-jenkins:5OXpzoevBwH4sBR0S0HaIQCik2adrOrGawIXO+WADCk="
+    "cache.vedenemo.dev:8NhplARANhClUSWJyLVk4WMyy1Wb4rhmWW2u8AejH9E="
   ];
   nix.settings.substituters = [
     "http://localhost:8080"
+    "https://cache.vedenemo.dev"
   ];
 
   system.stateVersion = "23.05";

From cd798a508324c194e9c27a070b83beb636c092e8 Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Fri, 26 Jan 2024 13:56:07 +0200
Subject: [PATCH 73/88] terraform/jenkins: move binary cache signing key

Move the binary cache signing key to its own resource group; this
makes it possible to share the signing key between the private
development environments.

Signed-off-by: Henri Rosten
---
 hosts/builder/configuration.nix               |  2 +-
 hosts/jenkins-controller/configuration.nix    |  2 +-
 .../azure-secrets/binary_cache_signing.tf     | 85 +++++++++++++++++++
 terraform/jenkins/binary_cache_signing.tf     | 56 +++---------
 terraform/jenkins/jenkins_controller.tf       |  4 +-
 5 files changed, 99 insertions(+), 50 deletions(-)
 create mode 100644 terraform/azure-secrets/binary_cache_signing.tf

diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix
index e16c2258..664bcf9b 100644
--- a/hosts/builder/configuration.nix
+++ b/hosts/builder/configuration.nix
@@ -42,7 +42,7 @@
 
   # Configure Nix to use this as a substitutor, and the public key used for signing.
# TODO: remove cache.vedenemo.dev substituter nix.settings.trusted-public-keys = [ - "ghaf-jenkins:5OXpzoevBwH4sBR0S0HaIQCik2adrOrGawIXO+WADCk=" + "ghaf-jenkins:5mkxSJ9AQd7qVhsbtNV8eBGiHOm7oeMxFXVAr8VHTlI=" "cache.vedenemo.dev:8NhplARANhClUSWJyLVk4WMyy1Wb4rhmWW2u8AejH9E=" ]; nix.settings.substituters = [ diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index ada0d736..86b24381 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -184,7 +184,7 @@ in { # Configure Nix to use this as a substitutor, and the public key used for signing. nix.settings.trusted-public-keys = [ - "ghaf-jenkins:5OXpzoevBwH4sBR0S0HaIQCik2adrOrGawIXO+WADCk=" + "ghaf-jenkins:5mkxSJ9AQd7qVhsbtNV8eBGiHOm7oeMxFXVAr8VHTlI=" ]; nix.settings.substituters = [ "http://localhost:8080" diff --git a/terraform/azure-secrets/binary_cache_signing.tf b/terraform/azure-secrets/binary_cache_signing.tf new file mode 100644 index 00000000..860106f2 --- /dev/null +++ b/terraform/azure-secrets/binary_cache_signing.tf @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + secret = { + source = "numtide/secret" + } + } + # Backend for storing tfstate (see ./azure-storage) + backend "azurerm" { + resource_group_name = "ghaf-infra-storage" + storage_account_name = "ghafinfrastatestorage" + container_name = "ghaf-infra-tfstate-container" + key = "ghaf-infra-sigkey.tfstate" + } +} +provider "azurerm" { + features {} +} +# Resource group +resource "azurerm_resource_group" "default" { + name = "ghaf-infra-sigkey" + location = "northeurope" +} + +################################################################################ + + +# nix-store --generate-binary-cache-key foo secret-key public-key +# terraform import secret_resource.binary_cache_signing_key "$(< ./secret-key)" +# terraform apply +resource "secret_resource" "binary_cache_signing_key" { + lifecycle { + prevent_destroy = true + } +} + +data "azurerm_client_config" "current" {} + +# Create an Azure key vault. +resource "azurerm_key_vault" "binary_cache_signing_key" { + # this must be globally unique + name = "ghaf-binarycache-signing" + location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.default.name + sku_name = "standard" + # The Azure Active Directory tenant ID that should be used for authenticating + # requests to the key vault. + tenant_id = data.azurerm_client_config.current.tenant_id +} + +# Upload the binary cache signing key as a vault secret +resource "azurerm_key_vault_secret" "binary_cache_signing_key" { + name = "binary-cache-signing-key" + value = secret_resource.binary_cache_signing_key.value + key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + + # Each of the secrets needs an explicit dependency on the access policy. + # Otherwise, Terraform may attempt to create the secret before creating the + # access policy. 
+ # https://stackoverflow.com/a/74747333 + depends_on = [ + azurerm_key_vault_access_policy.binary_cache_signing_key_terraform + ] +} + +resource "azurerm_key_vault_access_policy" "binary_cache_signing_key_terraform" { + key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + tenant_id = data.azurerm_client_config.current.tenant_id + # "TerraformAdminsGHAFInfra" group + object_id = "f80c2488-2301-4de8-89d6-4954b77f453e" + + secret_permissions = [ + "Get", + "List", + "Set" + ] +} + +################################################################################ diff --git a/terraform/jenkins/binary_cache_signing.tf b/terraform/jenkins/binary_cache_signing.tf index 0016dd49..24284d7f 100644 --- a/terraform/jenkins/binary_cache_signing.tf +++ b/terraform/jenkins/binary_cache_signing.tf @@ -1,54 +1,18 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) # # SPDX-License-Identifier: Apache-2.0 -# nix-store --generate-binary-cache-key foo secret-key public-key -# terraform import secret_resource.binary_cache_signing_key "$(< ./secret-key)" -resource "secret_resource" "binary_cache_signing_key" { - lifecycle { - # To support automatically generating and destroying temp signing keys - # with playground/terraform-az-dev.sh, `prevent_destroy` needs to be set - # to `false`: - prevent_destroy = false - } -} +# Reads the binary cache signing key from a key vault +# on resource group "ghaf-infra-sigkey" (see: ../azure-secrets) -# Create an Azure key vault. -resource "azurerm_key_vault" "binary_cache_signing_key" { - # this must be globally unique - name = "sig-${local.name_postfix}" - location = azurerm_resource_group.default.location - resource_group_name = azurerm_resource_group.default.name - sku_name = "standard" - # The Azure Active Directory tenant ID that should be used for authenticating - # requests to the key vault. - tenant_id = data.azurerm_client_config.current.tenant_id +data "azurerm_key_vault" "binary_cache_signing_key" { + name = "ghaf-binarycache-signing" + resource_group_name = "ghaf-infra-sigkey" + provider = azurerm } -# Upload the binary cache signing key as a vault secret -resource "azurerm_key_vault_secret" "binary_cache_signing_key" { +data "azurerm_key_vault_secret" "binary_cache_signing_key" { name = "binary-cache-signing-key" - value = secret_resource.binary_cache_signing_key.value - key_vault_id = azurerm_key_vault.binary_cache_signing_key.id - - # Each of the secrets needs an explicit dependency on the access policy. - # Otherwise, Terraform may attempt to create the secret before creating the - # access policy. 
- # https://stackoverflow.com/a/74747333 - depends_on = [ - azurerm_key_vault_access_policy.binary_cache_signing_key_terraform - ] -} - -resource "azurerm_key_vault_access_policy" "binary_cache_signing_key_terraform" { - key_vault_id = azurerm_key_vault.binary_cache_signing_key.id - tenant_id = data.azurerm_client_config.current.tenant_id - # "TerraformAdminsGHAFInfra" group - object_id = "f80c2488-2301-4de8-89d6-4954b77f453e" - - secret_permissions = [ - "Get", - "List", - "Set" - ] + key_vault_id = data.azurerm_key_vault.binary_cache_signing_key.id + provider = azurerm } diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 06054ee0..646d3ab2 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -44,7 +44,7 @@ module "jenkins_controller_vm" { "path" = "/var/lib/fetch-build-ssh-key/env" }, { - content = "KEY_VAULT_NAME=${azurerm_key_vault.binary_cache_signing_key.name}\nSECRET_NAME=${azurerm_key_vault_secret.binary_cache_signing_key.name}", + content = "KEY_VAULT_NAME=${data.azurerm_key_vault.binary_cache_signing_key.name}\nSECRET_NAME=${data.azurerm_key_vault_secret.binary_cache_signing_key.name}", "path" = "/var/lib/fetch-binary-cache-signing-key/env" }, { @@ -138,7 +138,7 @@ resource "azurerm_role_assignment" "jenkins_controller_access_storage" { # Grant the VM read-only access to the Azure Key Vault Secret containing the # binary cache signing key. resource "azurerm_key_vault_access_policy" "binary_cache_signing_key_jenkins_controller" { - key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + key_vault_id = data.azurerm_key_vault.binary_cache_signing_key.id tenant_id = data.azurerm_client_config.current.tenant_id object_id = module.jenkins_controller_vm.virtual_machine_identity_principal_id From a7a1d8a4fe8cd6985aee915aaf27dd26f02cbf8e Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Fri, 26 Jan 2024 16:25:25 +0200 Subject: [PATCH 74/88] terraform/jenkins: move ssh key to azure-secrets Move ssh private key to azure-secrets resource group, similarly to how binary cache signing key was moved in the previous commit. 
Signed-off-by: Henri Rosten --- .../azure-secrets/binary_cache_signing.tf | 4 +- terraform/azure-secrets/remote_build_ssh.tf | 60 +++++++++++++++++++ terraform/jenkins/binary_cache_signing.tf | 4 +- terraform/jenkins/builder.tf | 2 +- terraform/jenkins/jenkins_controller.tf | 6 +- terraform/jenkins/remote_build_ssh.tf | 53 ++++------------ 6 files changed, 82 insertions(+), 47 deletions(-) create mode 100644 terraform/azure-secrets/remote_build_ssh.tf diff --git a/terraform/azure-secrets/binary_cache_signing.tf b/terraform/azure-secrets/binary_cache_signing.tf index 860106f2..45ff49f8 100644 --- a/terraform/azure-secrets/binary_cache_signing.tf +++ b/terraform/azure-secrets/binary_cache_signing.tf @@ -16,7 +16,7 @@ terraform { resource_group_name = "ghaf-infra-storage" storage_account_name = "ghafinfrastatestorage" container_name = "ghaf-infra-tfstate-container" - key = "ghaf-infra-sigkey.tfstate" + key = "ghaf-infra-secrets.tfstate" } } provider "azurerm" { @@ -24,7 +24,7 @@ provider "azurerm" { } # Resource group resource "azurerm_resource_group" "default" { - name = "ghaf-infra-sigkey" + name = "ghaf-infra-secrets" location = "northeurope" } diff --git a/terraform/azure-secrets/remote_build_ssh.tf b/terraform/azure-secrets/remote_build_ssh.tf new file mode 100644 index 00000000..cc78db1c --- /dev/null +++ b/terraform/azure-secrets/remote_build_ssh.tf @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +# Create a ED25519 key, which the jenkins master will use to authenticate with +# builders. +resource "tls_private_key" "ed25519_remote_build" { + algorithm = "ED25519" +} + +# Create an Azure key vault. +resource "azurerm_key_vault" "ssh_remote_build" { + # this must be globally unique + name = "ghaf-ssh-remote-build" + location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.default.name + sku_name = "standard" + # The Azure Active Directory tenant ID that should be used for authenticating + # requests to the key vault. + tenant_id = data.azurerm_client_config.current.tenant_id +} + +# Put the ed25519 private key used for ssh as a secret. +resource "azurerm_key_vault_secret" "ssh_remote_build" { + name = "remote-build-ssh-private-key" + value = tls_private_key.ed25519_remote_build.private_key_openssh + key_vault_id = azurerm_key_vault.ssh_remote_build.id + + # Each of the secrets needs an explicit dependency on the access policy. + # Otherwise, Terraform may attempt to create the secret before creating the + # access policy. 
+ # https://stackoverflow.com/a/74747333 + depends_on = [ + azurerm_key_vault_access_policy.ssh_remote_build_terraform + ] +} + +# Put the ed25519 public key used for ssh as a secret to make it accessible +# in builder configuration +resource "azurerm_key_vault_secret" "ssh_remote_build_pub" { + name = "remote-build-ssh-public-key" + value = tls_private_key.ed25519_remote_build.public_key_openssh + key_vault_id = azurerm_key_vault.ssh_remote_build.id + depends_on = [ + azurerm_key_vault_access_policy.ssh_remote_build_terraform + ] +} + +resource "azurerm_key_vault_access_policy" "ssh_remote_build_terraform" { + key_vault_id = azurerm_key_vault.ssh_remote_build.id + tenant_id = data.azurerm_client_config.current.tenant_id + # "TerraformAdminsGHAFInfra" group + object_id = "f80c2488-2301-4de8-89d6-4954b77f453e" + + secret_permissions = [ + "Get", + "List", + "Set" + ] +} diff --git a/terraform/jenkins/binary_cache_signing.tf b/terraform/jenkins/binary_cache_signing.tf index 24284d7f..1554c7a3 100644 --- a/terraform/jenkins/binary_cache_signing.tf +++ b/terraform/jenkins/binary_cache_signing.tf @@ -3,11 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 # Reads the binary cache signing key from a key vault -# on resource group "ghaf-infra-sigkey" (see: ../azure-secrets) +# on resource group "ghaf-infra-secrets" (see: ../azure-secrets) data "azurerm_key_vault" "binary_cache_signing_key" { name = "ghaf-binarycache-signing" - resource_group_name = "ghaf-infra-sigkey" + resource_group_name = "ghaf-infra-secrets" provider = azurerm } diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf index f9a6d052..29de9b8d 100644 --- a/terraform/jenkins/builder.tf +++ b/terraform/jenkins/builder.tf @@ -39,7 +39,7 @@ module "builder_vm" { users = [{ name = "remote-build" ssh_authorized_keys = [ - tls_private_key.ed25519_remote_build.public_key_openssh + "${data.azurerm_key_vault_secret.ssh_remote_build_pub.value}" ] }] write_files = [ diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins/jenkins_controller.tf index 646d3ab2..732f4e84 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins/jenkins_controller.tf @@ -40,7 +40,7 @@ module "jenkins_controller_vm" { write_files = [ # See corresponding EnvironmentFile= directives in services { - content = "KEY_VAULT_NAME=${azurerm_key_vault.ssh_remote_build.name}\nSECRET_NAME=${azurerm_key_vault_secret.ssh_remote_build.name}", + content = "KEY_VAULT_NAME=${data.azurerm_key_vault.ssh_remote_build.name}\nSECRET_NAME=${data.azurerm_key_vault_secret.ssh_remote_build.name}", "path" = "/var/lib/fetch-build-ssh-key/env" }, { @@ -116,10 +116,12 @@ resource "azurerm_managed_disk" "jenkins_controller_jenkins_state" { disk_size_gb = 10 } +data "azurerm_client_config" "current" {} + # Grant the VM read-only access to the Azure Key Vault Secret containing the # ed25519 private key used to connect to remote builders. 
resource "azurerm_key_vault_access_policy" "ssh_remote_build_jenkins_controller" { - key_vault_id = azurerm_key_vault.ssh_remote_build.id + key_vault_id = data.azurerm_key_vault.ssh_remote_build.id tenant_id = data.azurerm_client_config.current.tenant_id object_id = module.jenkins_controller_vm.virtual_machine_identity_principal_id diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf index 992b4416..3474894f 100644 --- a/terraform/jenkins/remote_build_ssh.tf +++ b/terraform/jenkins/remote_build_ssh.tf @@ -2,50 +2,23 @@ # # SPDX-License-Identifier: Apache-2.0 -# Create a ED25519 key, which the jenkins master will use to authenticate with -# builders. -resource "tls_private_key" "ed25519_remote_build" { - algorithm = "ED25519" -} +# Reads the builder ssh key from a key vault +# on resource group "ghaf-infra-secrets" (see: ../azure-secrets) -# Create an Azure key vault. -resource "azurerm_key_vault" "ssh_remote_build" { - # this must be globally unique - name = "ssh-${local.name_postfix}" - location = azurerm_resource_group.default.location - resource_group_name = azurerm_resource_group.default.name - sku_name = "standard" - # The Azure Active Directory tenant ID that should be used for authenticating - # requests to the key vault. - tenant_id = data.azurerm_client_config.current.tenant_id +data "azurerm_key_vault" "ssh_remote_build" { + name = "ghaf-ssh-remote-build" + resource_group_name = "ghaf-infra-secrets" + provider = azurerm } -data "azurerm_client_config" "current" {} - -# Put the ed25519 private key used for ssh as a secret. -resource "azurerm_key_vault_secret" "ssh_remote_build" { +data "azurerm_key_vault_secret" "ssh_remote_build" { name = "remote-build-ssh-private-key" - value = tls_private_key.ed25519_remote_build.private_key_openssh - key_vault_id = azurerm_key_vault.ssh_remote_build.id - - # Each of the secrets needs an explicit dependency on the access policy. - # Otherwise, Terraform may attempt to create the secret before creating the - # access policy. - # https://stackoverflow.com/a/74747333 - depends_on = [ - azurerm_key_vault_access_policy.ssh_remote_build_terraform - ] + key_vault_id = data.azurerm_key_vault.ssh_remote_build.id + provider = azurerm } -resource "azurerm_key_vault_access_policy" "ssh_remote_build_terraform" { - key_vault_id = azurerm_key_vault.ssh_remote_build.id - tenant_id = data.azurerm_client_config.current.tenant_id - # "TerraformAdminsGHAFInfra" group - object_id = "f80c2488-2301-4de8-89d6-4954b77f453e" - - secret_permissions = [ - "Get", - "List", - "Set" - ] +data "azurerm_key_vault_secret" "ssh_remote_build_pub" { + name = "remote-build-ssh-public-key" + key_vault_id = data.azurerm_key_vault.ssh_remote_build.id + provider = azurerm } From 834d341f7918c08ee677b383ed940b86919a4942 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Mon, 29 Jan 2024 08:28:15 +0200 Subject: [PATCH 75/88] playground: remove secret handling hacks Remove functions `delete_keyvault` and `import_sigkey` which are no longer needed after the two previous commits, that move the builder ssh private key and the binary cache signing key to their own resource group. These secrets now persist even after workspace destruction, so there's no need to generate or delete them separately outside terraform. 
Signed-off-by: Henri Rosten --- terraform/playground/terraform-playground.sh | 36 -------------------- 1 file changed, 36 deletions(-) diff --git a/terraform/playground/terraform-playground.sh b/terraform/playground/terraform-playground.sh index 5586c768..2abcd36c 100755 --- a/terraform/playground/terraform-playground.sh +++ b/terraform/playground/terraform-playground.sh @@ -67,40 +67,6 @@ generate_azure_private_workspace_name () { fi } -import_sigkey () { - # This function is a hack to automatically generate the binary cache - # signing key for the (ghaf-infra) private dev environment. - - # No need to import anything if the below key isn't defined in the infra - if ! grep -q secret_resource.binary_cache_signing_key -- *.tf; then - return - fi - - # Skip import if signing key is imported already - if terraform state list | grep -q secret_resource.binary_cache_signing_key ; then - return - fi - - # Generate and import the key - nix-store --generate-binary-cache-key "$WORKSPACE" sigkey-secret.tmp sigkey-public.tmp - terraform import secret_resource.binary_cache_signing_key "$(< ./sigkey-secret.tmp)" -} - -delete_keyvault () { - # This function is a hack to automatically delete keyvaults - # from the (ghaf-infra) private dev environment. - set +e - if grep -qP "sig-.*name_postfix" -- *.tf; then - az keyvault delete --name "sig-$WORKSPACE" 2>/dev/null - az keyvault purge --name "sig-$WORKSPACE" 2>/dev/null - fi - if grep -qP "ssh-.*name_postfix" -- *.tf; then - az keyvault delete --name "ssh-$WORKSPACE" 2>/dev/null - az keyvault purge --name "ssh-$WORKSPACE" 2>/dev/null - fi - set -e -} - activate () { echo "[+] Activating workspace: '$WORKSPACE'" if terraform workspace list | grep -q "$WORKSPACE"; then @@ -109,7 +75,6 @@ activate () { terraform workspace new "$WORKSPACE" terraform workspace select "$WORKSPACE" fi - import_sigkey echo "[+] Done, use terraform [validate|plan|apply] to work with your dev infra" } @@ -120,7 +85,6 @@ destroy () { fi echo "[+] Destroying workspace: '$WORKSPACE'" terraform workspace select "$WORKSPACE" - delete_keyvault terraform apply -destroy -auto-approve terraform workspace select default } From d249f8b7052ab68cbff47d320b05e0a988ce0d39 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Mon, 29 Jan 2024 08:41:07 +0200 Subject: [PATCH 76/88] playground: don't switch to default after destroy Do not automatically switch to default workspace after `destroy` command. Improve workspace name matching by not allowing partial matches. Signed-off-by: Henri Rosten --- terraform/playground/terraform-playground.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/terraform/playground/terraform-playground.sh b/terraform/playground/terraform-playground.sh index 2abcd36c..9b518a0f 100755 --- a/terraform/playground/terraform-playground.sh +++ b/terraform/playground/terraform-playground.sh @@ -35,7 +35,7 @@ usage () { echo " " echo " Deactivate and destroy the private development infra that was previously" echo " created with the 'activate' command. This command deletes all the infra" - echo " resources and removes the terraform workspace." + echo " resources." echo "" } @@ -69,7 +69,7 @@ generate_azure_private_workspace_name () { activate () { echo "[+] Activating workspace: '$WORKSPACE'" - if terraform workspace list | grep -q "$WORKSPACE"; then + if terraform workspace list | grep -qP "\s$WORKSPACE\$"; then terraform workspace select "$WORKSPACE" else terraform workspace new "$WORKSPACE" @@ -79,14 +79,13 @@ activate () { } destroy () { - if ! 
terraform workspace list | grep -q "$WORKSPACE"; then
+    if ! terraform workspace list | grep -qP "\s$WORKSPACE\$"; then
         echo "[+] Devenv workspace '$WORKSPACE' does not exist, nothing to destroy"
         exit 0
     fi
     echo "[+] Destroying workspace: '$WORKSPACE'"
     terraform workspace select "$WORKSPACE"
     terraform apply -destroy -auto-approve
-    terraform workspace select default
 }
 
 list () {
@@ -114,7 +113,8 @@ main () {
     exit_unless_command_exists sed
     exit_unless_command_exists cut
     exit_unless_command_exists tr
-
+    exit_unless_command_exists grep
+
     # Assigns $WORKSPACE variable
     generate_azure_private_workspace_name

From e2d6f3a065db9d7001d28e58728606c06d29883e Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Mon, 29 Jan 2024 12:34:23 +0200
Subject: [PATCH 77/88] builder: bind to correct network security group

Associate each builder VM with the correct network security group.
Before this change, builders were bound to the binary cache's
security group.

Signed-off-by: Henri Rosten
---
 terraform/jenkins/builder.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terraform/jenkins/builder.tf b/terraform/jenkins/builder.tf
index 29de9b8d..0f5c18ac 100644
--- a/terraform/jenkins/builder.tf
+++ b/terraform/jenkins/builder.tf
@@ -58,7 +58,7 @@ resource "azurerm_network_interface_security_group_association" "builder_vm" {
   count = local.num_builders
 
   network_interface_id      = module.builder_vm[count.index].virtual_machine_network_interface_id
-  network_security_group_id = azurerm_network_security_group.binary_cache_vm.id
+  network_security_group_id = azurerm_network_security_group.builder_vm[count.index].id
 }
 
 resource "azurerm_network_security_group" "builder_vm" {

From fc4770a5df5e6f3fea345647eb31e6169fc5cc75 Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Mon, 29 Jan 2024 13:55:31 +0200
Subject: [PATCH 78/88] jenkins-controller: delay jenkins service startup

Run jenkins service only after cloud-config. This is an attempt to
fix the occasional jenkins service startup failures.

Signed-off-by: Henri Rosten
---
 hosts/jenkins-controller/configuration.nix | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix
index 86b24381..8e6c053b 100644
--- a/hosts/jenkins-controller/configuration.nix
+++ b/hosts/jenkins-controller/configuration.nix
@@ -74,6 +74,8 @@ in {
     port = 8081;
     withCLI = true;
   };
+  systemd.services.jenkins.after = ["cloud-config.service"];
+  systemd.services.jenkins.requires = ["cloud-config.service"];
 
   # set StateDirectory=jenkins, so state volume has the right permissions
   # and we wait on the mountpoint to appear.
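As a quick sanity check of the ordering added in the patch above, the unit dependencies can be inspected on a deployed jenkins-controller VM. This is a minimal sketch, assuming ssh access to the host; `jenkins-controller` is a hypothetical ssh host alias, while the unit names come from the diff above:

```bash
# Print the After= and Requires= lists of jenkins.service and keep
# only the occurrences of cloud-config.service, one per line.
ssh jenkins-controller \
  "systemctl show jenkins.service --property=After --property=Requires" \
  | grep -o 'cloud-config.service'
```

If the dependency took effect, `cloud-config.service` is printed twice: once from the `After=` list and once from the `Requires=` list.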
From fe3f4db7b8845281499c2b85101e8a333a594ac0 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Fri, 2 Feb 2024 06:16:54 +0200 Subject: [PATCH 79/88] terraform: restructure terraform directory Signed-off-by: Henri Rosten --- .sops.yaml | 2 +- hosts/builder/configuration.nix | 2 +- hosts/jenkins-controller/configuration.nix | 2 +- .../{jenkins/README.md => README-jenkins.md} | 0 terraform/azarm/azarm.tf | 141 ++++++++++ terraform/{ => azarm}/secrets.yaml | 0 .../{scripts => azarm}/ubuntu-builder.sh | 0 terraform/azure-ghaf-infra.tf | 252 ------------------ .../azure-secrets/binary_cache_signing.tf | 85 ------ .../binary_cache.tf => binary-cache.tf} | 54 ++-- terraform/{jenkins => }/builder.tf | 38 ++- terraform/init-ghaf-infra.sh | 95 +++++++ ...ns_controller.tf => jenkins-controller.tf} | 37 ++- terraform/jenkins/binary_cache_signing.tf | 18 -- terraform/jenkins/image_storage.tf | 21 -- terraform/jenkins/main.tf | 57 ---- terraform/jenkins/remote_build_ssh.tf | 24 -- terraform/jenkins/remote_state.tf | 13 - terraform/main.tf | 207 ++++++++++++++ .../modules}/azurerm-linux-vm/README.md | 0 .../modules}/azurerm-linux-vm/variables.tf | 0 .../azurerm-linux-vm/virtual_machine.tf | 0 .../modules}/azurerm-nix-vm-image/README.md | 0 .../modules}/azurerm-nix-vm-image/main.tf | 0 .../azurerm-nix-vm-image/nix-build.sh | 0 .../azurerm-nix-vm-image/variables.tf | 0 .../binary-cache-sigkey.tf | 72 +++++ .../binary-cache-storage.tf} | 24 +- .../builder-ssh-key/builder-ssh-key.tf} | 44 ++- terraform/persistent/main.tf | 138 ++++++++++ terraform/playground/test-infra.tf | 10 +- .../tfstate-storage.tf | 14 +- 32 files changed, 775 insertions(+), 575 deletions(-) rename terraform/{jenkins/README.md => README-jenkins.md} (100%) create mode 100644 terraform/azarm/azarm.tf rename terraform/{ => azarm}/secrets.yaml (100%) rename terraform/{scripts => azarm}/ubuntu-builder.sh (100%) delete mode 100644 terraform/azure-ghaf-infra.tf delete mode 100644 terraform/azure-secrets/binary_cache_signing.tf rename terraform/{jenkins/binary_cache.tf => binary-cache.tf} (59%) rename terraform/{jenkins => }/builder.tf (64%) create mode 100755 terraform/init-ghaf-infra.sh rename terraform/{jenkins/jenkins_controller.tf => jenkins-controller.tf} (83%) delete mode 100644 terraform/jenkins/binary_cache_signing.tf delete mode 100644 terraform/jenkins/image_storage.tf delete mode 100644 terraform/jenkins/main.tf delete mode 100644 terraform/jenkins/remote_build_ssh.tf delete mode 100644 terraform/jenkins/remote_state.tf create mode 100644 terraform/main.tf rename {tf-modules => terraform/modules}/azurerm-linux-vm/README.md (100%) rename {tf-modules => terraform/modules}/azurerm-linux-vm/variables.tf (100%) rename {tf-modules => terraform/modules}/azurerm-linux-vm/virtual_machine.tf (100%) rename {tf-modules => terraform/modules}/azurerm-nix-vm-image/README.md (100%) rename {tf-modules => terraform/modules}/azurerm-nix-vm-image/main.tf (100%) rename {tf-modules => terraform/modules}/azurerm-nix-vm-image/nix-build.sh (100%) rename {tf-modules => terraform/modules}/azurerm-nix-vm-image/variables.tf (100%) create mode 100644 terraform/persistent/binary-cache-sigkey/binary-cache-sigkey.tf rename terraform/{jenkins/binary_cache_storage.tf => persistent/binary-cache-storage/binary-cache-storage.tf} (61%) rename terraform/{azure-secrets/remote_build_ssh.tf => persistent/builder-ssh-key/builder-ssh-key.tf} (63%) create mode 100644 terraform/persistent/main.tf rename terraform/{azure-storage => state-storage}/tfstate-storage.tf (75%) diff 
--git a/.sops.yaml b/.sops.yaml index 193ee81d..2c7e5d48 100644 --- a/.sops.yaml +++ b/.sops.yaml @@ -12,7 +12,7 @@ keys: - &binarycache age1s47a3y44j695gemcl0kqgjlxxvaa50de9s69jy2l6vc8xtmk5pcskhpknl - &monitoring age17s9sc2cgt9t30cyl65zya8p4zmwnndrx2r896e7gzgl08sjn0qmq3t6shs creation_rules: - - path_regex: terraform/secrets.yaml$ + - path_regex: terraform/azarm/secrets.yaml$ key_groups: - age: - *flokli diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix index 664bcf9b..687b376d 100644 --- a/hosts/builder/configuration.nix +++ b/hosts/builder/configuration.nix @@ -42,7 +42,7 @@ # Configure Nix to use this as a substitutor, and the public key used for signing. # TODO: remove cache.vedenemo.dev substituter nix.settings.trusted-public-keys = [ - "ghaf-jenkins:5mkxSJ9AQd7qVhsbtNV8eBGiHOm7oeMxFXVAr8VHTlI=" + "ghaf-infra-dev:zPj3qUkGtUcnMehhQY89bayLOZBpMClIfFb5KkasLQE=" "cache.vedenemo.dev:8NhplARANhClUSWJyLVk4WMyy1Wb4rhmWW2u8AejH9E=" ]; nix.settings.substituters = [ diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 8e6c053b..57203269 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -186,7 +186,7 @@ in { # Configure Nix to use this as a substitutor, and the public key used for signing. nix.settings.trusted-public-keys = [ - "ghaf-jenkins:5mkxSJ9AQd7qVhsbtNV8eBGiHOm7oeMxFXVAr8VHTlI=" + "ghaf-infra-dev:zPj3qUkGtUcnMehhQY89bayLOZBpMClIfFb5KkasLQE=" ]; nix.settings.substituters = [ "http://localhost:8080" diff --git a/terraform/jenkins/README.md b/terraform/README-jenkins.md similarity index 100% rename from terraform/jenkins/README.md rename to terraform/README-jenkins.md diff --git a/terraform/azarm/azarm.tf b/terraform/azarm/azarm.tf new file mode 100644 index 00000000..8de7a83f --- /dev/null +++ b/terraform/azarm/azarm.tf @@ -0,0 +1,141 @@ +# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + sops = { + source = "carlpett/sops" + } + } + # Backend for storing terraform state (see ../state-storage) + backend "azurerm" { + resource_group_name = "ghaf-infra-state" + storage_account_name = "ghafinfratfstatestorage" + container_name = "ghaf-infra-tfstate-container" + key = "ghaf-azarm.tfstate" + } +} +provider "azurerm" { + features {} +} +# Sops secrets +data "sops_file" "secrets" { + source_file = "secrets.yaml" +} +# Resource group +resource "azurerm_resource_group" "rg" { + name = "ghaf-azarm-arm-builder" + location = "northeurope" +} +# Virtual Network +resource "azurerm_virtual_network" "ghaf_infra_tf_vnet" { + name = "ghaf-infra-tf-vnet" + address_space = ["10.0.0.0/16"] + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name +} +# Subnet +resource "azurerm_subnet" "ghaf_infra_tf_subnet" { + name = "ghaf-infra-tf-subnet" + resource_group_name = azurerm_resource_group.rg.name + virtual_network_name = azurerm_virtual_network.ghaf_infra_tf_vnet.name + address_prefixes = ["10.0.2.0/24"] +} + +################################################################################ + +# azarm: +# aarch64-linux builder - Ubuntu host with nix package manager. +# Why not NixOS? The reason is: we have not managed to get nixos-anywhere +# working with azure arm VMs. 
+# Since the host is not NixOS, all the host configuration is done on +# terraform apply using the configuration script at ./ubuntu-builder.sh + +# Public IP +resource "azurerm_public_ip" "azarm_public_ip" { + name = "azarm-public-ip" + domain_name_label = "azarm" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + allocation_method = "Static" +} +# Network interface +resource "azurerm_network_interface" "azarm_ni" { + name = "azarm-nic" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + ip_configuration { + name = "azarm_nic_configuration" + subnet_id = azurerm_subnet.ghaf_infra_tf_subnet.id + private_ip_address_allocation = "Static" + private_ip_address = "10.0.2.10" + public_ip_address_id = azurerm_public_ip.azarm_public_ip.id + } +} +# Network Security Group +resource "azurerm_network_security_group" "azarm_nsg" { + name = "azarm-nsg" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + security_rule { + name = "AllowSSHInbound" + priority = 300 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = "22" + source_address_prefix = "*" + destination_address_prefix = "*" + } +} +resource "azurerm_network_interface_security_group_association" "nsg_azarm_apply" { + network_interface_id = azurerm_network_interface.azarm_ni.id + network_security_group_id = azurerm_network_security_group.azarm_nsg.id +} +# Azure arm builder (azarm) +resource "azurerm_linux_virtual_machine" "azarm_vm" { + name = "azarm" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + network_interface_ids = [ + azurerm_network_interface.azarm_ni.id + ] + size = "Standard_D8ps_v5" + os_disk { + name = "azarm-disk" + caching = "ReadWrite" + storage_account_type = "Premium_LRS" + disk_size_gb = 512 + } + source_image_reference { + publisher = "canonical" + offer = "0001-com-ubuntu-server-jammy" + sku = "22_04-lts-arm64" + version = "latest" + } + admin_username = data.sops_file.secrets.data["vm_admin_name"] + disable_password_authentication = true + admin_ssh_key { + username = data.sops_file.secrets.data["vm_admin_name"] + public_key = data.sops_file.secrets.data["vm_admin_rsa_pub"] + } +} +resource "azurerm_virtual_machine_extension" "deploy_ubuntu_builder" { + name = "azarm-vmext" + virtual_machine_id = azurerm_linux_virtual_machine.azarm_vm.id + publisher = "Microsoft.Azure.Extensions" + type = "CustomScript" + type_handler_version = "2.1" + settings = < /dev/null; then + echo "Error: command '$1' is not installed" >&2 + exit 1 + fi +} + +init_state_storage () { + echo "[+] Initializing state storage" + # See: ./state-storage + pushd "$MYDIR/state-storage" >/dev/null + terraform init >/dev/null + if ! 
terraform apply -auto-approve &>/dev/null; then + echo "[+] State storage is already initialized" + fi + popd >/dev/null +} + +import_bincache_sigkey () { + env="$1" + echo "[+] Importing binary cache signing key '$env'" + # Skip import if signing key is imported already + if terraform state list | grep -q secret_resource.binary_cache_signing_key_"$env"; then + echo "[+] Binary cache signing key is already imported" + return + fi + # Generate and import the key + nix-store --generate-binary-cache-key "ghaf-infra-$env" sigkey-secret.tmp "sigkey-public-$env.tmp" + terraform import secret_resource.binary_cache_signing_key_"$env" "$(< ./sigkey-secret.tmp)" + rm -f sigkey-secret.tmp +} + +init_persistent () { + echo "[+] Initializing persistent data" + # See: ./persistent + pushd "$MYDIR/persistent" >/dev/null + terraform init > /dev/null + import_bincache_sigkey "prod" + import_bincache_sigkey "dev" + echo "[+] Applying possible changes" + terraform apply -auto-approve >/dev/null + popd >/dev/null +} + +init_terraform () { + echo "[+] Running terraform init" + # It's safe to run terraform init multiple times + terraform -chdir="$MYDIR" init >/dev/null +} + +################################################################################ + +main () { + + exit_unless_command_exists az + exit_unless_command_exists terraform + exit_unless_command_exists nix-store + exit_unless_command_exists grep + + init_state_storage + init_persistent + init_terraform + +} + +main "$@" + +################################################################################ diff --git a/terraform/jenkins/jenkins_controller.tf b/terraform/jenkins-controller.tf similarity index 83% rename from terraform/jenkins/jenkins_controller.tf rename to terraform/jenkins-controller.tf index 732f4e84..de997d07 100644 --- a/terraform/jenkins/jenkins_controller.tf +++ b/terraform/jenkins-controller.tf @@ -4,29 +4,26 @@ # Build the Jenkins controller image module "jenkins_controller_image" { - source = "../../tf-modules/azurerm-nix-vm-image" + source = "./modules/azurerm-nix-vm-image" nix_attrpath = "outputs.nixosConfigurations.jenkins-controller.config.system.build.azureImage" - nix_entrypoint = "${path.module}/../.." - - - name = "jenkins-controller" - resource_group_name = azurerm_resource_group.default.name - location = azurerm_resource_group.default.location + nix_entrypoint = "${path.module}/.." 
+ name = "jenkins-controller" + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location storage_account_name = azurerm_storage_account.vm_images.name storage_container_name = azurerm_storage_container.vm_images.name } # Create a machine using this image module "jenkins_controller_vm" { - source = "../../tf-modules/azurerm-linux-vm" + source = "./modules/azurerm-linux-vm" - resource_group_name = azurerm_resource_group.default.name - location = azurerm_resource_group.default.location - - virtual_machine_name = "ghaf-jenkins-controller-${local.name_postfix}" - virtual_machine_size = "Standard_D2_v2" + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location + virtual_machine_name = "ghaf-jenkins-controller-${local.env}" + virtual_machine_size = local.opts[local.conf].vm_size_controller virtual_machine_source_image = module.jenkins_controller_image.image_id virtual_machine_custom_data = join("\n", ["#cloud-config", yamlencode({ @@ -48,7 +45,7 @@ module "jenkins_controller_vm" { "path" = "/var/lib/fetch-binary-cache-signing-key/env" }, { - content = "AZURE_STORAGE_ACCOUNT_NAME=${azurerm_storage_account.binary_cache.name}", + content = "AZURE_STORAGE_ACCOUNT_NAME=${data.azurerm_storage_account.binary_cache.name}", "path" = "/var/lib/rclone-http/env" }, # Render /etc/nix/machines with terraform. In the future, we might want to @@ -90,8 +87,8 @@ resource "azurerm_network_interface_security_group_association" "jenkins_control resource "azurerm_network_security_group" "jenkins_controller_vm" { name = "jenkins-controller-vm" - resource_group_name = azurerm_resource_group.default.name - location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location security_rule { name = "AllowSSHInbound" @@ -109,15 +106,13 @@ resource "azurerm_network_security_group" "jenkins_controller_vm" { # Create a data disk resource "azurerm_managed_disk" "jenkins_controller_jenkins_state" { name = "jenkins-controller-vm-jenkins-state" - resource_group_name = azurerm_resource_group.default.name - location = azurerm_resource_group.default.location + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location storage_account_type = "Standard_LRS" create_option = "Empty" disk_size_gb = 10 } -data "azurerm_client_config" "current" {} - # Grant the VM read-only access to the Azure Key Vault Secret containing the # ed25519 private key used to connect to remote builders. 
resource "azurerm_key_vault_access_policy" "ssh_remote_build_jenkins_controller" { @@ -132,7 +127,7 @@ resource "azurerm_key_vault_access_policy" "ssh_remote_build_jenkins_controller" # Allow the VM to *write* to (and read from) the binary cache bucket resource "azurerm_role_assignment" "jenkins_controller_access_storage" { - scope = azurerm_storage_container.binary_cache_1.resource_manager_id + scope = data.azurerm_storage_container.binary_cache_1.resource_manager_id role_definition_name = "Storage Blob Data Contributor" principal_id = module.jenkins_controller_vm.virtual_machine_identity_principal_id } diff --git a/terraform/jenkins/binary_cache_signing.tf b/terraform/jenkins/binary_cache_signing.tf deleted file mode 100644 index 1554c7a3..00000000 --- a/terraform/jenkins/binary_cache_signing.tf +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 - -# Reads the binary cache signing key from a key vault -# on resource group "ghaf-infra-secrets" (see: ../azure-secrets) - -data "azurerm_key_vault" "binary_cache_signing_key" { - name = "ghaf-binarycache-signing" - resource_group_name = "ghaf-infra-secrets" - provider = azurerm -} - -data "azurerm_key_vault_secret" "binary_cache_signing_key" { - name = "binary-cache-signing-key" - key_vault_id = data.azurerm_key_vault.binary_cache_signing_key.id - provider = azurerm -} diff --git a/terraform/jenkins/image_storage.tf b/terraform/jenkins/image_storage.tf deleted file mode 100644 index ffad74b1..00000000 --- a/terraform/jenkins/image_storage.tf +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 - -# Storage account and storage container used to store VM images - -resource "azurerm_storage_account" "vm_images" { - name = "vimg${local.name_postfix}" - resource_group_name = azurerm_resource_group.default.name - location = azurerm_resource_group.default.location - account_tier = "Standard" - account_replication_type = "LRS" - allow_nested_items_to_be_public = false -} - -resource "azurerm_storage_container" "vm_images" { - name = "ghaf-infra-vm-images" - storage_account_name = azurerm_storage_account.vm_images.name - container_access_type = "private" -} - diff --git a/terraform/jenkins/main.tf b/terraform/jenkins/main.tf deleted file mode 100644 index e11134f7..00000000 --- a/terraform/jenkins/main.tf +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 - -provider "azurerm" { - features {} -} - -terraform { - required_providers { - azurerm = { - source = "hashicorp/azurerm" - } - secret = { - source = "numtide/secret" - } - } -} - -locals { - # read ssh-keys.yaml into local.ssh_keys - ssh_keys = yamldecode(file("../../ssh-keys.yaml")) - # postfix used in the resource group name - rg_postfix = terraform.workspace == "default" ? "prod" : terraform.workspace - # postfix used in various resource names - name_postfix = terraform.workspace == "default" ? 
"ghafprod" : terraform.workspace -} - -# The resource group everything in this terraform module lives in -resource "azurerm_resource_group" "default" { - name = "ghaf-infra-jenkins-${local.rg_postfix}" - location = "northeurope" -} - -# Create a virtual network -resource "azurerm_virtual_network" "vnet" { - name = "ghaf-infra-vnet" - address_space = ["10.0.0.0/16"] - location = azurerm_resource_group.default.location - resource_group_name = azurerm_resource_group.default.name -} - -# Slice out a subnet for jenkins. -resource "azurerm_subnet" "jenkins" { - name = "ghaf-infra-jenkins" - resource_group_name = azurerm_resource_group.default.name - virtual_network_name = azurerm_virtual_network.vnet.name - address_prefixes = ["10.0.2.0/24"] -} - -# Slice out a subnet for the buidlers. -resource "azurerm_subnet" "builders" { - name = "ghaf-infra-builders" - resource_group_name = azurerm_resource_group.default.name - virtual_network_name = azurerm_virtual_network.vnet.name - address_prefixes = ["10.0.4.0/28"] -} diff --git a/terraform/jenkins/remote_build_ssh.tf b/terraform/jenkins/remote_build_ssh.tf deleted file mode 100644 index 3474894f..00000000 --- a/terraform/jenkins/remote_build_ssh.tf +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 - -# Reads the builder ssh key from a key vault -# on resource group "ghaf-infra-secrets" (see: ../azure-secrets) - -data "azurerm_key_vault" "ssh_remote_build" { - name = "ghaf-ssh-remote-build" - resource_group_name = "ghaf-infra-secrets" - provider = azurerm -} - -data "azurerm_key_vault_secret" "ssh_remote_build" { - name = "remote-build-ssh-private-key" - key_vault_id = data.azurerm_key_vault.ssh_remote_build.id - provider = azurerm -} - -data "azurerm_key_vault_secret" "ssh_remote_build_pub" { - name = "remote-build-ssh-public-key" - key_vault_id = data.azurerm_key_vault.ssh_remote_build.id - provider = azurerm -} diff --git a/terraform/jenkins/remote_state.tf b/terraform/jenkins/remote_state.tf deleted file mode 100644 index 3fdf2ce8..00000000 --- a/terraform/jenkins/remote_state.tf +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 - -terraform { - # Backend for storing tfstate (see ../azure-storage) - backend "azurerm" { - resource_group_name = "ghaf-infra-storage" - storage_account_name = "ghafinfrastatestorage" - container_name = "ghaf-infra-tfstate-container" - key = "jenkins.tfstate" - } -} diff --git a/terraform/main.tf b/terraform/main.tf new file mode 100644 index 00000000..4d8a1d53 --- /dev/null +++ b/terraform/main.tf @@ -0,0 +1,207 @@ +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +provider "azurerm" { + features {} +} + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + secret = { + source = "numtide/secret" + } + } +} + +################################################################################ + +terraform { + # Backend for storing terraform state (see ../state-storage) + backend "azurerm" { + resource_group_name = "ghaf-infra-state" + storage_account_name = "ghafinfratfstatestorage" + container_name = "ghaf-infra-tfstate-container" + key = "ghaf-infra.tfstate" + } +} + +################################################################################ + +# Current signed-in user +data "azurerm_client_config" "current" {} + +# Variables +variable 
"location" { + type = string + default = "northeurope" + description = "Azure region into which the resources will be deployed" +} + +# Use azure_region module to get the short name of the Azure region, +# see: https://registry.terraform.io/modules/claranet/regions/azurerm/latest +module "azure_region" { + source = "claranet/regions/azurerm" + azure_region = var.location +} + +locals { + # Raise an error if workspace is 'default', + # this is a workaround to missing asserts in terraform: + assert_workspace_not_default = regex( + (terraform.workspace == "default") ? + "((Force invalid regex pattern)\n\nERROR: default workspace not allowed" : "", "") + + # Short name of the Azure region, see: + # https://github.com/claranet/terraform-azurerm-regions/blob/master/REGIONS.md + shortloc = module.azure_region.location_short + + # Sanitize workspace name + ws = substr(replace(lower(terraform.workspace), "/[^a-z0-9]/", ""), 0, 16) + + # Environment-specific configuration options. + # See Azure vm sizes and specs at: + # https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux + # E.g. 'Standard_D1_v2' means: 1 vCPU, 3.5 GiB RAM + opts = { + priv = { + vm_size_binarycache = "Standard_D1_v2" + vm_size_builder = "Standard_D2_v3" + vm_size_controller = "Standard_D2_v3" + num_builders = 1 + } + dev = { + vm_size_binarycache = "Standard_D1_v2" + vm_size_builder = "Standard_D4_v3" + vm_size_controller = "Standard_D4_v3" + num_builders = 1 + } + prod = { + vm_size_binarycache = "Standard_D2_v2" + vm_size_builder = "Standard_D8_v3" + vm_size_controller = "Standard_D8_v3" + num_builders = 2 + } + } + + # Read ssh-keys.yaml into local.ssh_keys + ssh_keys = yamldecode(file("../ssh-keys.yaml")) + + # Map workspace name to configuration name: + # !"dev" && !"prod" ==> "priv" + # "dev" ==> "dev" + # "prod" ==> "prod" + # This determines the configuration options used in the + # ghaf-infra instance (defines e.g. vm_sizes and number of builders) + # TODO: allow overwriting this with an input variable + conf = local.ws != "dev" && local.ws != "prod" ? "priv" : local.ws + + # env is used to identify workspace-specific resources: + env = local.ws + + # Selects the persistent data used in the ghaf-infra instance, currently + # either "dev" or "prod" + # (see ./persistent) + persistent_data = local.conf == "priv" ? 
"dev" : local.conf +} + +################################################################################ + +# Resource group for this ghaf-infra instance +resource "azurerm_resource_group" "infra" { + name = "ghaf-infra-${local.env}" + location = var.location +} + +################################################################################ + +# Virtual network +resource "azurerm_virtual_network" "vnet" { + name = "ghaf-infra-vnet" + address_space = ["10.0.0.0/16"] + location = azurerm_resource_group.infra.location + resource_group_name = azurerm_resource_group.infra.name +} + +# Slice out a subnet for jenkins +resource "azurerm_subnet" "jenkins" { + name = "ghaf-infra-jenkins" + resource_group_name = azurerm_resource_group.infra.name + virtual_network_name = azurerm_virtual_network.vnet.name + address_prefixes = ["10.0.2.0/24"] +} + +# Slice out a subnet for the buidlers +resource "azurerm_subnet" "builders" { + name = "ghaf-infra-builders" + resource_group_name = azurerm_resource_group.infra.name + virtual_network_name = azurerm_virtual_network.vnet.name + address_prefixes = ["10.0.4.0/28"] +} + +################################################################################ + +# Storage account and storage container used to store VM images + +resource "azurerm_storage_account" "vm_images" { + name = "img${local.env}${local.shortloc}" + resource_group_name = azurerm_resource_group.infra.name + location = azurerm_resource_group.infra.location + account_tier = "Standard" + account_replication_type = "LRS" + allow_nested_items_to_be_public = false +} + +resource "azurerm_storage_container" "vm_images" { + name = "ghaf-infra-vm-images" + storage_account_name = azurerm_storage_account.vm_images.name + container_access_type = "private" +} + +################################################################################ + +# Data sources to access 'persistent' data, see ./persistent + +data "azurerm_storage_account" "binary_cache" { + name = "ghafbincache${local.persistent_data}${local.shortloc}" + resource_group_name = "ghaf-infra-persistent" +} +data "azurerm_storage_container" "binary_cache_1" { + name = "binary-cache-v1" + storage_account_name = data.azurerm_storage_account.binary_cache.name +} + +data "azurerm_key_vault" "ssh_remote_build" { + name = "ssh-builder-${local.persistent_data}-${local.shortloc}" + resource_group_name = "ghaf-infra-persistent" + provider = azurerm +} + +data "azurerm_key_vault_secret" "ssh_remote_build" { + name = "remote-build-ssh-private-key" + key_vault_id = data.azurerm_key_vault.ssh_remote_build.id + provider = azurerm +} + +data "azurerm_key_vault_secret" "ssh_remote_build_pub" { + name = "remote-build-ssh-public-key" + key_vault_id = data.azurerm_key_vault.ssh_remote_build.id + provider = azurerm +} + +data "azurerm_key_vault" "binary_cache_signing_key" { + name = "bche-sigkey-${local.persistent_data}-${local.shortloc}" + resource_group_name = "ghaf-infra-persistent" + provider = azurerm +} + +data "azurerm_key_vault_secret" "binary_cache_signing_key" { + name = "binary-cache-signing-key-priv" + key_vault_id = data.azurerm_key_vault.binary_cache_signing_key.id + provider = azurerm +} + +################################################################################ diff --git a/tf-modules/azurerm-linux-vm/README.md b/terraform/modules/azurerm-linux-vm/README.md similarity index 100% rename from tf-modules/azurerm-linux-vm/README.md rename to terraform/modules/azurerm-linux-vm/README.md diff --git a/tf-modules/azurerm-linux-vm/variables.tf 
b/terraform/modules/azurerm-linux-vm/variables.tf similarity index 100% rename from tf-modules/azurerm-linux-vm/variables.tf rename to terraform/modules/azurerm-linux-vm/variables.tf diff --git a/tf-modules/azurerm-linux-vm/virtual_machine.tf b/terraform/modules/azurerm-linux-vm/virtual_machine.tf similarity index 100% rename from tf-modules/azurerm-linux-vm/virtual_machine.tf rename to terraform/modules/azurerm-linux-vm/virtual_machine.tf diff --git a/tf-modules/azurerm-nix-vm-image/README.md b/terraform/modules/azurerm-nix-vm-image/README.md similarity index 100% rename from tf-modules/azurerm-nix-vm-image/README.md rename to terraform/modules/azurerm-nix-vm-image/README.md diff --git a/tf-modules/azurerm-nix-vm-image/main.tf b/terraform/modules/azurerm-nix-vm-image/main.tf similarity index 100% rename from tf-modules/azurerm-nix-vm-image/main.tf rename to terraform/modules/azurerm-nix-vm-image/main.tf diff --git a/tf-modules/azurerm-nix-vm-image/nix-build.sh b/terraform/modules/azurerm-nix-vm-image/nix-build.sh similarity index 100% rename from tf-modules/azurerm-nix-vm-image/nix-build.sh rename to terraform/modules/azurerm-nix-vm-image/nix-build.sh diff --git a/tf-modules/azurerm-nix-vm-image/variables.tf b/terraform/modules/azurerm-nix-vm-image/variables.tf similarity index 100% rename from tf-modules/azurerm-nix-vm-image/variables.tf rename to terraform/modules/azurerm-nix-vm-image/variables.tf diff --git a/terraform/persistent/binary-cache-sigkey/binary-cache-sigkey.tf b/terraform/persistent/binary-cache-sigkey/binary-cache-sigkey.tf new file mode 100644 index 00000000..8288843f --- /dev/null +++ b/terraform/persistent/binary-cache-sigkey/binary-cache-sigkey.tf @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +################################################################################ + +# May only contain alphanumeric characters and dashes and must be between 3-24 +# chars, must be globally unique +variable "bincache_keyvault_name" { + type = string +} + +variable "resource_group_name" { + type = string +} + +variable "location" { + type = string +} + +variable "secret_resource" { + type = object({ + value = string + }) +} + +variable "tenant_id" { + type = string +} + + +################################################################################ + +# Create an Azure key vault. +resource "azurerm_key_vault" "binary_cache_signing_key" { + name = var.bincache_keyvault_name + location = var.location + resource_group_name = var.resource_group_name + sku_name = "standard" + # The Azure Active Directory tenant ID that should be used for authenticating + # requests to the key vault. + tenant_id = var.tenant_id +} + +# Upload the binary cache signing key as a vault secret +resource "azurerm_key_vault_secret" "binary_cache_signing_key" { + name = "binary-cache-signing-key-priv" + value = var.secret_resource.value + key_vault_id = azurerm_key_vault.binary_cache_signing_key.id + + # Each of the secrets needs an explicit dependency on the access policy. + # Otherwise, Terraform may attempt to create the secret before creating the + # access policy. 
+  # https://stackoverflow.com/a/74747333
+  depends_on = [
+    azurerm_key_vault_access_policy.binary_cache_signing_key_terraform
+  ]
+}
+
+resource "azurerm_key_vault_access_policy" "binary_cache_signing_key_terraform" {
+  key_vault_id = azurerm_key_vault.binary_cache_signing_key.id
+  tenant_id    = var.tenant_id
+  # "TerraformAdminsGHAFInfra" group
+  object_id = "f80c2488-2301-4de8-89d6-4954b77f453e"
+
+  secret_permissions = [
+    "Get",
+    "List",
+    "Set"
+  ]
+}
+
diff --git a/terraform/jenkins/binary_cache_storage.tf b/terraform/persistent/binary-cache-storage/binary-cache-storage.tf
similarity index 61%
rename from terraform/jenkins/binary_cache_storage.tf
rename to terraform/persistent/binary-cache-storage/binary-cache-storage.tf
index 52797c9a..224c167c 100644
--- a/terraform/jenkins/binary_cache_storage.tf
+++ b/terraform/persistent/binary-cache-storage/binary-cache-storage.tf
@@ -2,11 +2,29 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+################################################################################
+
+# Can only consist of lowercase letters and numbers, and must be between 3
+# and 24 characters long, must be globally unique
+variable "bincache_storage_account_name" {
+  type = string
+}
+
+variable "resource_group_name" {
+  type = string
+}
+
+variable "location" {
+  type = string
+}
+
+################################################################################
+
 # Create the storage account and storage container
 resource "azurerm_storage_account" "binary_cache" {
-  name                     = "bche${local.name_postfix}"
-  resource_group_name      = azurerm_resource_group.default.name # TODO: separate resource group?
-  location                 = azurerm_resource_group.default.location
+  name                            = var.bincache_storage_account_name
+  resource_group_name             = var.resource_group_name
+  location                        = var.location
   account_tier                    = "Standard"
   account_replication_type        = "LRS"
   allow_nested_items_to_be_public = false
diff --git a/terraform/azure-secrets/remote_build_ssh.tf b/terraform/persistent/builder-ssh-key/builder-ssh-key.tf
similarity index 63%
rename from terraform/azure-secrets/remote_build_ssh.tf
rename to terraform/persistent/builder-ssh-key/builder-ssh-key.tf
index cc78db1c..b2904c18 100644
--- a/terraform/azure-secrets/remote_build_ssh.tf
+++ b/terraform/persistent/builder-ssh-key/builder-ssh-key.tf
@@ -2,25 +2,47 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+################################################################################
+
+# May only contain alphanumeric characters and dashes and must be between 3-24
+# chars, must be globally unique
+variable "builder_ssh_keyvault_name" {
+  type = string
+}
+
+variable "resource_group_name" {
+  type = string
+}
+
+variable "location" {
+  type = string
+}
+
+variable "tenant_id" {
+  type = string
+}
+
+################################################################################
+
 # Create an ED25519 key, which the jenkins master will use to authenticate with
 # builders.
 resource "tls_private_key" "ed25519_remote_build" {
   algorithm = "ED25519"
 }
-# Create an Azure key vault.
+# Create an Azure key vault
 resource "azurerm_key_vault" "ssh_remote_build" {
-  # this must be globally unique
-  name                = "ghaf-ssh-remote-build"
-  location            = azurerm_resource_group.default.location
-  resource_group_name = azurerm_resource_group.default.name
+  name                = var.builder_ssh_keyvault_name
+  location            = var.location
+  resource_group_name = var.resource_group_name
   sku_name            = "standard"
-  # The Azure Active Directory tenant ID that should be used for authenticating
-  # requests to the key vault.
-  tenant_id = data.azurerm_client_config.current.tenant_id
+  # The Azure Active Directory tenant ID that should be used for authenticating
+  # requests to the key vault
+  tenant_id = var.tenant_id
 }
 
-# Put the ed25519 private key used for ssh as a secret.
+# Put the ed25519 private key used for ssh as a secret
 resource "azurerm_key_vault_secret" "ssh_remote_build" {
   name         = "remote-build-ssh-private-key"
   value        = tls_private_key.ed25519_remote_build.private_key_openssh
@@ -48,7 +70,7 @@ resource "azurerm_key_vault_secret" "ssh_remote_build_pub" {
 
 resource "azurerm_key_vault_access_policy" "ssh_remote_build_terraform" {
   key_vault_id = azurerm_key_vault.ssh_remote_build.id
-  tenant_id    = data.azurerm_client_config.current.tenant_id
+  tenant_id    = var.tenant_id
   # "TerraformAdminsGHAFInfra" group
   object_id = "f80c2488-2301-4de8-89d6-4954b77f453e"
 
diff --git a/terraform/persistent/main.tf b/terraform/persistent/main.tf
new file mode 100644
index 00000000..32f1938d
--- /dev/null
+++ b/terraform/persistent/main.tf
@@ -0,0 +1,138 @@
+# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII)
+#
+# SPDX-License-Identifier: Apache-2.0
+
+provider "azurerm" {
+  features {}
+}
+
+terraform {
+  required_providers {
+    azurerm = {
+      source = "hashicorp/azurerm"
+    }
+    secret = {
+      source = "numtide/secret"
+    }
+  }
+}
+
+################################################################################
+
+terraform {
+  # Backend for storing terraform state (see ../state-storage)
+  backend "azurerm" {
+    resource_group_name  = "ghaf-infra-state"
+    storage_account_name = "ghafinfratfstatestorage"
+    container_name       = "ghaf-infra-tfstate-container"
+    key                  = "ghaf-infra-persistent.tfstate"
+  }
+}
+
+################################################################################
+
+# Variables
+variable "location" {
+  type        = string
+  default     = "northeurope"
+  description = "Azure region into which the resources will be deployed"
+}
+
+# Use azure_region module to get the short name of the Azure region,
+# see: https://registry.terraform.io/modules/claranet/regions/azurerm/latest
+# and: https://github.com/claranet/terraform-azurerm-regions/blob/master/REGIONS.md
+module "azure_region" {
+  source       = "claranet/regions/azurerm"
+  azure_region = var.location
+}
+
+locals {
+  shortloc = module.azure_region.location_short
+}
+
+# Resource group
+resource "azurerm_resource_group" "persistent" {
+  name     = "ghaf-infra-persistent"
+  location = var.location
+}
+
+# Current signed-in user
+data "azurerm_client_config" "current" {}
+
+################################################################################
+
+# Resources
+
+# secret_resource must be created on import, e.g.:
+#
+# nix-store --generate-binary-cache-key foo secret-key public-key
+# terraform import secret_resource.binary_cache_signing_key_dev "$(< ./secret-key)"
+# terraform apply
+#
+# Ghaf-infra automates the creation in 'init-ghaf-infra.sh'
+resource "secret_resource" "binary_cache_signing_key_dev" {
+  lifecycle {
+    prevent_destroy = true
+  }
+}
+resource 
"secret_resource" "binary_cache_signing_key_prod" { + lifecycle { + prevent_destroy = true + } +} + +module "builder_ssh_key_prod" { + source = "./builder-ssh-key" + # Must be globally unique + builder_ssh_keyvault_name = "ssh-builder-prod-${local.shortloc}" + resource_group_name = azurerm_resource_group.persistent.name + location = azurerm_resource_group.persistent.location + tenant_id = data.azurerm_client_config.current.tenant_id +} + +module "builder_ssh_key_dev" { + source = "./builder-ssh-key" + # Must be globally unique + builder_ssh_keyvault_name = "ssh-builder-dev-${local.shortloc}" + resource_group_name = azurerm_resource_group.persistent.name + location = azurerm_resource_group.persistent.location + tenant_id = data.azurerm_client_config.current.tenant_id +} + +module "binary_cache_sigkey_prod" { + source = "./binary-cache-sigkey" + # Must be globally unique + bincache_keyvault_name = "bche-sigkey-prod-${local.shortloc}" + secret_resource = secret_resource.binary_cache_signing_key_prod + resource_group_name = azurerm_resource_group.persistent.name + location = azurerm_resource_group.persistent.location + tenant_id = data.azurerm_client_config.current.tenant_id +} + +module "binary_cache_sigkey_dev" { + source = "./binary-cache-sigkey" + # Must be globally unique + bincache_keyvault_name = "bche-sigkey-dev-${local.shortloc}" + secret_resource = secret_resource.binary_cache_signing_key_dev + resource_group_name = azurerm_resource_group.persistent.name + location = azurerm_resource_group.persistent.location + tenant_id = data.azurerm_client_config.current.tenant_id +} + +module "binary_cache_storage_prod" { + source = "./binary-cache-storage" + # Must be globally unique + bincache_storage_account_name = "ghafbincacheprod${local.shortloc}" + resource_group_name = azurerm_resource_group.persistent.name + location = azurerm_resource_group.persistent.location +} + +module "binary_cache_storage_dev" { + source = "./binary-cache-storage" + # Must be globally unique + bincache_storage_account_name = "ghafbincachedev${local.shortloc}" + resource_group_name = azurerm_resource_group.persistent.name + location = azurerm_resource_group.persistent.location +} + +################################################################################ diff --git a/terraform/playground/test-infra.tf b/terraform/playground/test-infra.tf index e5f67a94..edcabe11 100644 --- a/terraform/playground/test-infra.tf +++ b/terraform/playground/test-infra.tf @@ -8,10 +8,10 @@ terraform { source = "hashicorp/azurerm" } } - # Backend for storing tfstate (see ./azure-storage) + # Backend for storing tfstate (see ../state-storage) backend "azurerm" { - resource_group_name = "ghaf-infra-storage" - storage_account_name = "ghafinfrastatestorage" + resource_group_name = "ghaf-infra-state" + storage_account_name = "ghafinfratfstatestorage" container_name = "ghaf-infra-tfstate-container" key = "ghaf-infra-playground.tfstate" } @@ -74,7 +74,7 @@ resource "azurerm_storage_container" "vm_images" { # VM module "test_image" { - source = "../../tf-modules/azurerm-nix-vm-image" + source = "../modules/azurerm-nix-vm-image" nix_attrpath = "outputs.nixosConfigurations.builder.config.system.build.azureImage" nix_entrypoint = "${path.module}/../.." 
@@ -92,7 +92,7 @@ locals { } module "test_vm" { - source = "../../tf-modules/azurerm-linux-vm" + source = "../modules/azurerm-linux-vm" count = local.num_vms diff --git a/terraform/azure-storage/tfstate-storage.tf b/terraform/state-storage/tfstate-storage.tf similarity index 75% rename from terraform/azure-storage/tfstate-storage.tf rename to terraform/state-storage/tfstate-storage.tf index 4f50c012..6771c2d6 100644 --- a/terraform/azure-storage/tfstate-storage.tf +++ b/terraform/state-storage/tfstate-storage.tf @@ -16,22 +16,16 @@ provider "azurerm" { # Resource group -variable "resource_group_location" { - type = string - default = "northeurope" - description = "Location of the resource group." -} - resource "azurerm_resource_group" "rg" { - name = "ghaf-infra-storage" - location = var.resource_group_location + name = "ghaf-infra-state" + location = "northeurope" } - # Storage container resource "azurerm_storage_account" "tfstate" { - name = "ghafinfrastatestorage" + # This must be globally unique, max 24 characters + name = "ghafinfratfstatestorage" resource_group_name = azurerm_resource_group.rg.name location = azurerm_resource_group.rg.location account_tier = "Standard" From f35291c42d5be394ca53cddf91e7e0b26fd84c65 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Mon, 5 Feb 2024 10:12:10 +0200 Subject: [PATCH 80/88] terraform: allow many persistent instances Signed-off-by: Henri Rosten --- hosts/builder/configuration.nix | 2 +- hosts/jenkins-controller/configuration.nix | 2 +- terraform/main.tf | 6 ++-- terraform/persistent/main.tf | 32 +++++++++---------- terraform/playground/README.md | 4 +-- terraform/playground/terraform-playground.sh | 4 +-- .../{init-ghaf-infra.sh => terraform-init.sh} | 5 +-- 7 files changed, 28 insertions(+), 27 deletions(-) rename terraform/{init-ghaf-infra.sh => terraform-init.sh} (93%) diff --git a/hosts/builder/configuration.nix b/hosts/builder/configuration.nix index 687b376d..91aa04de 100644 --- a/hosts/builder/configuration.nix +++ b/hosts/builder/configuration.nix @@ -42,7 +42,7 @@ # Configure Nix to use this as a substitutor, and the public key used for signing. # TODO: remove cache.vedenemo.dev substituter nix.settings.trusted-public-keys = [ - "ghaf-infra-dev:zPj3qUkGtUcnMehhQY89bayLOZBpMClIfFb5KkasLQE=" + "ghaf-infra-dev:EdgcUJsErufZitluMOYmoJDMQE+HFyveI/D270Cr84I=" "cache.vedenemo.dev:8NhplARANhClUSWJyLVk4WMyy1Wb4rhmWW2u8AejH9E=" ]; nix.settings.substituters = [ diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/jenkins-controller/configuration.nix index 57203269..bbece7db 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/jenkins-controller/configuration.nix @@ -186,7 +186,7 @@ in { # Configure Nix to use this as a substitutor, and the public key used for signing. 
nix.settings.trusted-public-keys = [ - "ghaf-infra-dev:zPj3qUkGtUcnMehhQY89bayLOZBpMClIfFb5KkasLQE=" + "ghaf-infra-dev:EdgcUJsErufZitluMOYmoJDMQE+HFyveI/D270Cr84I=" ]; nix.settings.substituters = [ "http://localhost:8080" diff --git a/terraform/main.tf b/terraform/main.tf index 4d8a1d53..584eb983 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -167,7 +167,7 @@ resource "azurerm_storage_container" "vm_images" { data "azurerm_storage_account" "binary_cache" { name = "ghafbincache${local.persistent_data}${local.shortloc}" - resource_group_name = "ghaf-infra-persistent" + resource_group_name = "ghaf-infra-persistent-${local.shortloc}" } data "azurerm_storage_container" "binary_cache_1" { name = "binary-cache-v1" @@ -176,7 +176,7 @@ data "azurerm_storage_container" "binary_cache_1" { data "azurerm_key_vault" "ssh_remote_build" { name = "ssh-builder-${local.persistent_data}-${local.shortloc}" - resource_group_name = "ghaf-infra-persistent" + resource_group_name = "ghaf-infra-persistent-${local.shortloc}" provider = azurerm } @@ -194,7 +194,7 @@ data "azurerm_key_vault_secret" "ssh_remote_build_pub" { data "azurerm_key_vault" "binary_cache_signing_key" { name = "bche-sigkey-${local.persistent_data}-${local.shortloc}" - resource_group_name = "ghaf-infra-persistent" + resource_group_name = "ghaf-infra-persistent-${local.shortloc}" provider = azurerm } diff --git a/terraform/persistent/main.tf b/terraform/persistent/main.tf index 32f1938d..534870d8 100644 --- a/terraform/persistent/main.tf +++ b/terraform/persistent/main.tf @@ -38,21 +38,21 @@ variable "location" { description = "Azure region into which the resources will be deployed" } -# Use azure_region module to get the short name of the Azure region, -# see: https://registry.terraform.io/modules/claranet/regions/azurerm/latest -# and: https://github.com/claranet/terraform-azurerm-regions/blob/master/REGIONS.md -module "azure_region" { - source = "claranet/regions/azurerm" - azure_region = var.location -} - locals { - shortloc = module.azure_region.location_short + # Raise an error if workspace is 'default', + # this is a workaround to missing asserts in terraform: + assert_workspace_not_default = regex( + (terraform.workspace == "default") ? 
+ "((Force invalid regex pattern)\n\nERROR: workspace 'default' is not allowed" : "", "") + + # Sanitize workspace name: + # Workspace name defines the persistent instance + ws = substr(replace(lower(terraform.workspace), "/[^a-z0-9]/", ""), 0, 16) } # Resource group resource "azurerm_resource_group" "persistent" { - name = "ghaf-infra-persistent" + name = "ghaf-infra-persistent-${local.ws}" location = var.location } @@ -84,7 +84,7 @@ resource "secret_resource" "binary_cache_signing_key_prod" { module "builder_ssh_key_prod" { source = "./builder-ssh-key" # Must be globally unique - builder_ssh_keyvault_name = "ssh-builder-prod-${local.shortloc}" + builder_ssh_keyvault_name = "ssh-builder-prod-${local.ws}" resource_group_name = azurerm_resource_group.persistent.name location = azurerm_resource_group.persistent.location tenant_id = data.azurerm_client_config.current.tenant_id @@ -93,7 +93,7 @@ module "builder_ssh_key_prod" { module "builder_ssh_key_dev" { source = "./builder-ssh-key" # Must be globally unique - builder_ssh_keyvault_name = "ssh-builder-dev-${local.shortloc}" + builder_ssh_keyvault_name = "ssh-builder-dev-${local.ws}" resource_group_name = azurerm_resource_group.persistent.name location = azurerm_resource_group.persistent.location tenant_id = data.azurerm_client_config.current.tenant_id @@ -102,7 +102,7 @@ module "builder_ssh_key_dev" { module "binary_cache_sigkey_prod" { source = "./binary-cache-sigkey" # Must be globally unique - bincache_keyvault_name = "bche-sigkey-prod-${local.shortloc}" + bincache_keyvault_name = "bche-sigkey-prod-${local.ws}" secret_resource = secret_resource.binary_cache_signing_key_prod resource_group_name = azurerm_resource_group.persistent.name location = azurerm_resource_group.persistent.location @@ -112,7 +112,7 @@ module "binary_cache_sigkey_prod" { module "binary_cache_sigkey_dev" { source = "./binary-cache-sigkey" # Must be globally unique - bincache_keyvault_name = "bche-sigkey-dev-${local.shortloc}" + bincache_keyvault_name = "bche-sigkey-dev-${local.ws}" secret_resource = secret_resource.binary_cache_signing_key_dev resource_group_name = azurerm_resource_group.persistent.name location = azurerm_resource_group.persistent.location @@ -122,7 +122,7 @@ module "binary_cache_sigkey_dev" { module "binary_cache_storage_prod" { source = "./binary-cache-storage" # Must be globally unique - bincache_storage_account_name = "ghafbincacheprod${local.shortloc}" + bincache_storage_account_name = "ghafbincacheprod${local.ws}" resource_group_name = azurerm_resource_group.persistent.name location = azurerm_resource_group.persistent.location } @@ -130,7 +130,7 @@ module "binary_cache_storage_prod" { module "binary_cache_storage_dev" { source = "./binary-cache-storage" # Must be globally unique - bincache_storage_account_name = "ghafbincachedev${local.shortloc}" + bincache_storage_account_name = "ghafbincachedev${local.ws}" resource_group_name = azurerm_resource_group.persistent.name location = azurerm_resource_group.persistent.location } diff --git a/terraform/playground/README.md b/terraform/playground/README.md index c70b98a2..716c04ab 100644 --- a/terraform/playground/README.md +++ b/terraform/playground/README.md @@ -7,9 +7,9 @@ SPDX-License-Identifier: CC-BY-SA-4.0 # Terraform Playground This project uses terraform to automate the creation of infrastructure resources. -To support infrastructure development in isolated development environments, we use [terraform workspaces](https://developer.hashicorp.com/terraform/cli/workspaces). 
+To support infrastructure development in isolated development environments, this project uses [terraform workspaces](https://developer.hashicorp.com/terraform/cli/workspaces). -The tooling under this `playground` directory is provided to facilitate the usage of terraform workspaces in setting-up a distinct copy of the target infrastructure to test a set of changes before modifying shared (dev/prod) infrastructure. +The tooling under the `playground` directory is provided to facilitate the usage of terraform workspaces in setting-up a distinct copy of the target infrastructure to test a set of changes before modifying shared (dev/prod) infrastructure. This page documents the usage of `terraform-playground.sh` to help facilitate the usage of private development environments for testing infra changes. diff --git a/terraform/playground/terraform-playground.sh b/terraform/playground/terraform-playground.sh index 9b518a0f..004e1b5b 100755 --- a/terraform/playground/terraform-playground.sh +++ b/terraform/playground/terraform-playground.sh @@ -53,13 +53,13 @@ generate_azure_private_workspace_name () { # - .userPrincipalName returns the signed-in azure username # - cut removes everything up until the first '@' # - sed keeps only letter and number characters - # - final cut keeps at most 20 characters + # - final cut keeps at most 16 characters # - tr converts the string to lower case # Thus, given a signed-in user 'foo.bar@baz.com', the workspace name # becomes 'foobar'. # Below command errors out with the azure error message if the azure user # is not signed-in. - WORKSPACE=$(az ad signed-in-user show | jq -cr .userPrincipalName | cut -d'@' -f1 | sed 's/[^a-zA-Z0-9]//g' | cut -c 1-20 | tr '[:upper:]' '[:lower:]') + WORKSPACE=$(az ad signed-in-user show | jq -cr .userPrincipalName | cut -d'@' -f1 | sed 's/[^a-zA-Z0-9]//g' | cut -c 1-16 | tr '[:upper:]' '[:lower:]') # Check WORKSPACE is non-empty and not 'default' if [ -z "$WORKSPACE" ] || [ "$WORKSPACE" = "default" ]; then echo "Error: invalid workspace name: '$WORKSPACE'" diff --git a/terraform/init-ghaf-infra.sh b/terraform/terraform-init.sh similarity index 93% rename from terraform/init-ghaf-infra.sh rename to terraform/terraform-init.sh index 705170c7..56fc337e 100755 --- a/terraform/init-ghaf-infra.sh +++ b/terraform/terraform-init.sh @@ -10,7 +10,7 @@ set -o pipefail # exit if any pipeline command fails ################################################################################ -# This script is a helper to initialize the ghaf-infra: +# This script is a helper to initialize the ghaf terraform infra: # - init terraform state storage # - init persistent secrets such as binary cache signing key (per environment) # - init persistent binary cache storage (per environment) @@ -62,6 +62,8 @@ init_persistent () { # See: ./persistent pushd "$MYDIR/persistent" >/dev/null terraform init > /dev/null + # Default persistent instance: 'eun' (northeurope) + terraform workspace select eun 2>/dev/null || terraform workspace new eun import_bincache_sigkey "prod" import_bincache_sigkey "dev" echo "[+] Applying possible changes" @@ -71,7 +73,6 @@ init_persistent () { init_terraform () { echo "[+] Running terraform init" - # It's safe to run terraform init multiple times terraform -chdir="$MYDIR" init >/dev/null } From a9ea3fd7121da8e29d32705f1fb75816196e5d83 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 6 Feb 2024 09:21:59 +0200 Subject: [PATCH 81/88] terraform: restructure documentation Signed-off-by: Henri Rosten --- .../{README-jenkins.md => 
README-azure.md} |   2 +-
 terraform/README.md | 156 +++++++++++++-----
 terraform/playground/README.md | 29 +---
 3 files changed, 129 insertions(+), 58 deletions(-)
 rename terraform/{README-jenkins.md => README-azure.md} (99%)

diff --git a/terraform/README-jenkins.md b/terraform/README-azure.md
similarity index 99%
rename from terraform/README-jenkins.md
rename to terraform/README-azure.md
index b2794baf..e115b7da 100644
--- a/terraform/README-jenkins.md
+++ b/terraform/README-azure.md
@@ -4,7 +4,7 @@ SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII)
 SPDX-License-Identifier: CC-BY-SA-4.0
 -->
 
-# terraform/jenkins
+# terraform
 
 This directory contains the root terraform module describing the image-based CI setup in Azure.
 
diff --git a/terraform/README.md b/terraform/README.md
index 45fa5042..b0f720e8 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -6,25 +6,32 @@ SPDX-License-Identifier: CC-BY-SA-4.0
 
 # Ghaf-infra: Terraform
 
-This project uses terraform to automate the creation of infrastructure resources. The intended usage together with NixOS configurations in the main [flake.nix](../flake.nix) is as follows:
-- We use the terraform configuration in this directory for the inital setup of the infrastructure resources (VMs, networks, etc.)
-- We use the NixOS configurations in [flake.nix](../flake.nix) to [install](../README.md#install) NixOS on the VMs
-- We maintain the infrastructure by [deploying](../README.md#deploy) changes to the NixOS configurations via [flake.nix](../flake.nix)
+This directory contains the root terraform module describing the [ghaf](https://github.com/tiiuae/ghaf) CI setup in Azure.
 
-Notice: the typical ghaf-infra maintenance only requires deploying changes to the existing infra. Indeed, the infrastructure setup with terraform and installation of NixOS are tasks only required when moving to a new infrastructure or introducing new resources to the existing infra.
+For an architectural description, see [README-azure.md](./README-azure.md), originally from [PR#35](https://github.com/tiiuae/ghaf-infra/pull/35):
 
-## Usage
+> The setup uses Nix to build disk images, uploads them to Azure, and then boots
+> virtual machines off of them.
+>
+> Images are considered "appliance images", meaning the Nix code describing their
+> configuration defines the exact purpose of the machine (no two-staged
+> deployment process, the machine does the thing it's supposed to do after
+> bootup), which allows removing the need for e.g. ssh access as much as possible.
+>
+> Machines are considered ephemeral, every change in the appliance image / nixos
+> configuration causes a new image to be built, and a new VM to be booted with
+> that new image.
 
-If you still don't have nix package manager on your local host, install it following the package manager installation instructions from https://nixos.org/download.html.
+## Getting Started
 
-Then, clone this repository:
+This document assumes you have the [`nix`](https://nixos.org/download.html) package manager installed on your local host.
+
+Clone this repository:
 ```bash
 $ git clone https://github.com/tiiuae/ghaf-infra.git
-$ cd ghaf-infra/
+$ cd ghaf-infra
 ```
-All commands in this document are executed from nix-shell inside the `terraform` directory.
-
-Bootstrap nix-shell with the required dependencies:
 ```bash
 # Start a nix-shell with required dependencies:
@@ -37,42 +44,122 @@ $ az login
 $ cd terraform/
 ```
-## Initializing Azure Storage
-This project stores the terraform state in a remote storage in an azure storage blob as configured in [tfstate-storage.tf](./azure-storage/tfstate-storage.tf). The benefits of using such remote storage setup are well outlined in [storing state in azure storage](https://learn.microsoft.com/en-us/azure/developer/terraform/store-state-in-azure-storage) and [terraform backend configuration](https://developer.hashicorp.com/terraform/language/settings/backends/configuration). The main benefit is that it allows multiple people to access the state data and collaborate on the resources configuration.
+All commands in this document are executed from nix-shell inside the `terraform` directory.
+
+## Directory Structure
+```
+terraform
+├── azarm
+├── persistent
+│   ├── binary-cache-sigkey
+│   ├── binary-cache-storage
+│   ├── builder-ssh-key
+├── playground
+│   ├── terraform-playground.sh
+├── state-storage
+│   └── tfstate-storage.tf
+├── modules
+│   ├── azurerm-linux-vm
+│   └── azurerm-nix-vm-image
+├── binary-cache.tf
+├── builder.tf
+├── jenkins-controller.tf
+└── main.tf
+```
+- The `terraform` directory contains the root terraform deployment files with the VM configurations `binary-cache.tf`, `builder.tf`, and `jenkins-controller.tf` matching the components described in [README-azure.md](./README-azure.md) in its [components section](./README-azure.md#components).
+- The `terraform/azarm` directory contains the terraform configuration for the Azure `aarch64` builder, which is used by the ghaf github-actions [build.yml workflow](https://github.com/tiiuae/ghaf/blob/e81ccfb41d75eda0488b6b4325aeccb8385ce960/.github/workflows/build.yml#L151) to build `aarch64` targets for authorized PRs pre-merge. `azarm` is disconnected from the root terraform module: it's a separate configuration with its own state.
+- The `terraform/persistent` directory contains the terraform configuration for the parts of the infrastructure that are shared between ghaf-infra development instances. Examples of such persistent ghaf-infra resources are the binary cache storage and the binary cache signing key. There may be many 'persistent' infrastructure instances - currently the `dev` and `prod` deployments have their own instances of the persistent resources. Section [Multiple Environments with Terraform Workspaces](./README.md#multiple-environments-with-terraform-workspaces) discusses this topic in more detail.
+- The `terraform/playground` directory contains tooling to facilitate the usage of terraform workspaces in setting up distinct copies of the ghaf-infra infrastructure, i.e. 'playground' `dev` environments. It also includes an [example test infrastructure](./playground/test-infra.tf) that deploys a minimal example infrastructure with just one nix VM, highlighting the use of `terraform/modules` to build and upload the nix image on Azure.
+- The `terraform/state-storage` directory contains the terraform configuration for the ghaf-infra remote backend state storage using an Azure storage blob. See section [Initializing Azure State and Persistent Data](./README.md#initializing-azure-state-and-persistent-data) for more details.
+- The `terraform/modules` directory contains terraform modules used from the ghaf-infra VM configurations to build, upload, and spin up Azure nix images; a short sketch of how the modules compose follows this list.
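+
+The modules are designed to compose: `azurerm-nix-vm-image` nix-builds a disk image and uploads it to Azure, and `azurerm-linux-vm` boots VMs off the uploaded image. Below is a rough, hypothetical sketch of the wiring (the module arguments shown are illustrative; see [test-infra.tf](./playground/test-infra.tf) for a working example):
+
+```terraform
+# Illustrative only: resource names and the attribute path are examples
+module "example_image" {
+  source = "./modules/azurerm-nix-vm-image"
+
+  # Nix attribute path that builds a directory containing a disk.vhd file
+  nix_attrpath   = "outputs.nixosConfigurations.builder.config.system.build.azureImage"
+  nix_entrypoint = "${path.module}/.."
+
+  name                   = "example"
+  resource_group_name    = azurerm_resource_group.infra.name
+  location               = azurerm_resource_group.infra.location
+  storage_account_name   = azurerm_storage_account.vm_images.name
+  storage_container_name = azurerm_storage_container.vm_images.name
+}
+```
+
+The image module's output can then be passed to `azurerm-linux-vm` as the image to boot the VM from.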
+
+## Initializing Azure State and Persistent Data
+This project stores the terraform state in remote storage, in an azure storage blob, as configured in [tfstate-storage.tf](./state-storage/tfstate-storage.tf). The benefits of such a remote storage setup are well outlined in [storing state in azure storage](https://learn.microsoft.com/en-us/azure/developer/terraform/store-state-in-azure-storage) and [terraform backend configuration](https://developer.hashicorp.com/terraform/language/settings/backends/configuration).
 
-**Note**: if you work with existing infrastructure, there should be no need to initialize the state storage. Initializing state storage is only needed when you start-off or move to a new infrastructure.
+To initialize the backend storage, use the `terraform-init.sh` script:
 
-When starting a new infrastructure you need to initialize the terraform state storage:
 ```bash
-$ cd azure-storage/
-$ terraform init
-$ terraform apply
+# Inside the terraform directory
+$ ./terraform-init.sh
+[+] Initializing state storage
+[+] Initializing persistent data
+...
+[+] Running terraform init
 ```
+`terraform-init.sh` is safe to run many times: if the initialization has already been done, it will not destroy or re-initialize anything.
 
-## Terraform workflow
+In addition to the shared terraform state, some of the infrastructure resources are also shared between the ghaf-infra instances. `terraform-init.sh` initializes the persistent configuration defined under `terraform/persistent`. There may be many 'persistent' infrastructure instances: currently the `dev` and `prod` deployments have their own instances of the persistent resources. Section [Multiple Environments with Terraform Workspaces](./README.md#multiple-environments-with-terraform-workspaces) discusses this topic in more detail.
 
-Following describes the intended workflow, with commands executed from the nix-shell.
+## Multiple Environments with Terraform Workspaces
 
-First, change the terraform code by modifying the relevant files in this directory. Then:
+To support infrastructure development in isolated environments, this project uses [terraform workspaces](https://developer.hashicorp.com/terraform/cli/workspaces).
+The main reasons for using terraform workspaces include:
+- Different workspaces allow deploying different instances of ghaf-infra. Each instance has completely separate state data, making it possible to deploy `dev`, `prod`, or even private development instances of ghaf-infra. This makes it possible to first develop and test infrastructure changes in a private development environment, before proposing changes to shared (e.g. `dev` or `prod`) environments. The configuration codebase is the same between all the environments, with the differentiation options defined in [`main.tf`](./main.tf#L69).
+- Parts of the ghaf-infra infrastructure are persistent and shared between different environments. As an example, private `dev` environments share the binary cache storage. This arrangement makes it possible to treat, for instance, `dev` and private ghaf-infra instances as dispensable: ghaf-infra instances can be temporary and short-lived, as it's easy to spin up new environments without losing any valuable data. The persistent data is configured outside the root ghaf-infra terraform deployment, in the `terraform/persistent` directory. There may be many 'persistent' infrastructure instances - currently the `dev` and `prod` deployments have their own instances of the persistent resources. This means that `dev` and `prod` instances of ghaf-infra do **not** share any persistent data. As an example, the `dev` and `prod` deployments of ghaf-infra have separate binary cache storage. The binding to persistent resources from ghaf-infra is done in [`main.tf`](./main.tf#L166) based on the terraform workspace name and resource location.
+- Currently, the following resources are defined as 'persistent', meaning the `dev` and `prod` instances do not share them:
+  - Binary cache storage: [`binary-cache-storage.tf`](./persistent/binary-cache-storage/binary-cache-storage.tf)
+  - Binary cache signing key: [`binary-cache-sigkey.tf`](./persistent/binary-cache-sigkey/binary-cache-sigkey.tf)
+  - Builder ssh key: [`builder-ssh-key.tf`](./persistent/builder-ssh-key/builder-ssh-key.tf)
+
+To help facilitate the usage of terraform workspaces in setting up distinct copies of ghaf-infra, one can [use terraform workspaces from the command line](https://developer.hashicorp.com/terraform/cli/workspaces#managing-cli-workspaces) or consider using the helper script provided at [`playground/terraform-playground.sh`](./playground/terraform-playground.sh). Below, for the sake of example, we use [`playground/terraform-playground.sh`](./playground/terraform-playground.sh) to set up a private development instance of ghaf-infra:
+```bash
+# Activate private development environment
+$ ./playground/terraform-playground.sh activate
+# ...
+[+] Done, use terraform [validate|plan|apply] to work with your dev infra
+```
+This sets up a terraform workspace for your private development environment:
+```bash
+# List the current terraform workspace
+$ terraform workspace list
+Terraform workspaces:
+  default
+  dev
+* henrirosten    # <-- indicates active workspace
+  prod
+```
+
+## Terraform Workflow
 
-Following describes the intended workflow, with commands executed from the nix-shell.
+The following describes the intended workflow, with commands executed from the nix-shell.
+
+Once you are ready to deploy your terraform or nix configuration changes, the following sequence of commands typically takes place:
 ```bash
-# Terraform comands are executed under the terraform directory:
-$ cd terraform/
+# Inside the terraform directory
 
-# Initialize terraform working directory
-$ terraform init
 # Format the terraform code files:
-$ terraform fmt
+$ terraform fmt -recursive
 
-# Test the changes:
+# Validate the terraform changes:
 $ terraform validate
 
-# Test applying your configuration changes:
+# Make sure you deploy to the correct ghaf-infra instance:
+$ terraform workspace list
+  default
+  dev
+* henrirosten    # <== This example deploys to private dev environment
+  prod
+
+# Show what actions terraform would take on apply:
+$ terraform plan
+
+# Apply your configuration changes:
 $ terraform apply
 ```
+
+Once `terraform apply` completes, the private development infrastructure is deployed.
+You can now play around in your isolated copy of the infrastructure, testing and updating the changes, making sure the changes work as expected before merging the changes.
+
+## Destroying Playground Environment
+Once the configuration changes have been tested, the private development environment can be destroyed:
+```bash
+# Destroy the private terraform workspace
+$ ./playground/terraform-playground.sh destroy
+```
+The above command removes all the resources that were created for the private development environment.
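+
+If you prefer plain terraform commands over the helper script, a minimal sketch of the equivalent manual teardown looks roughly like this (the workspace name is illustrative):
+```bash
+# Destroy all resources in the currently selected workspace
+$ terraform destroy
+
+# A workspace cannot be deleted while selected: switch away, then delete it
+$ terraform workspace select default
+$ terraform workspace delete <your-workspace>
+```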
+ +## Common Terraform Errors Below are some common Terraform errors with tips on how to resolve each. @@ -82,17 +169,12 @@ $ terraform apply ... azurerm_virtual_machine_extension.deploy_ubuntu_builder: Creating... ╷ -│ Error: A resource with the ID "/subscriptions//resourceGroups/ghaf-infra-tf-dev/providers/Microsoft.Compute/virtualMachines/azarm/extensions/azarm-vmext" already exists - to be managed via Terraform this resource needs to be imported into the State. Please see the resource documentation for "azurerm_virtual_machine_extension" for more information. +│ Error: A resource with the ID "/subscriptions//resourceGroups/rg-name-here/providers/Microsoft.Compute/virtualMachines/azarm/extensions/azarm-vmext" already exists - to be managed via Terraform this resource needs to be imported into the State. Please see the resource documentation for "azurerm_virtual_machine_extension" for more information. ``` Example fix: ```bash -$ terraform import azurerm_virtual_machine_extension.deploy_ubuntu_builder /subscriptions//resourceGroups/ghaf-infra-tf-dev/providers/Microsoft.Compute/virtualMachines/azarm/extensions/azarm-vmext +$ terraform import azurerm_virtual_machine_extension.deploy_ubuntu_builder /subscriptions//resourceGroups/rg-name-here/providers/Microsoft.Compute/virtualMachines/azarm/extensions/azarm-vmext # Ref: https://stackoverflow.com/questions/61418168/terraform-resource-with-the-id-already-exists -``` - - -## References -- Azure secrets: https://registry.terraform.io/providers/hashicorp/azuread/0.9.0/docs/guides/service_principal_client_secret -- Use Terraform to create Linux VM in azure: https://learn.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-terraform?tabs=azure-cli +``` \ No newline at end of file diff --git a/terraform/playground/README.md b/terraform/playground/README.md index 716c04ab..cc248a49 100644 --- a/terraform/playground/README.md +++ b/terraform/playground/README.md @@ -17,15 +17,7 @@ This page documents the usage of `terraform-playground.sh` to help facilitate th ## Usage -If you still don't have nix package manager on your local host, install it following the package manager installation instructions from https://nixos.org/download.html. - -Then, clone this repository: -```bash -$ git clone https://github.com/tiiuae/ghaf-infra.git -$ cd ghaf-infra/ -``` - -All commands in this document are executed from nix-shell inside the `terraform/jenkins` directory. +All commands in this document are executed from nix-shell inside the `terraform/` directory. Bootstrap nix-shell with the required dependencies: ```bash @@ -35,21 +27,21 @@ $ nix-shell # Authenticate with az login: $ az login -# We use the configuration under terraform/jenkins as an example: -$ cd terraform/jenkins +# We use the infrastructure configuration under terraform/playground as an example: +$ cd terraform/playground ``` ## Activating Playground Environment ```bash # Activate private development environment -$ ../playground/terraform-playground.sh activate +$ ./terraform-playground.sh activate # ... 
[+] Done, use terraform [validate|plan|apply] to work with your dev infra
 ```
 The `activate` command sets up a terraform workspace for your private development environment:
 ```bash
 # List the current terraform workspace
-$ ../playground/terraform-playground.sh list
+$ ./terraform-playground.sh list
 Terraform workspaces:
   default
 * henrirosten # <-- indicates active workspace
@@ -58,9 +50,7 @@ Terraform workspaces:
 ## Testing Infrastructure Changes
 With the private development workspace now set up, we can test infrastructure changes in a private development environment:
 ```bash
-# In directory terraform/jenkins
-$ pwd
-[..]/ghaf-infra/terraform/jenkins
+# In directory terraform/playground
 
 # Check terraform configuration files format:
 $ terraform fmt -recursive
 
 # Validate the terraform changes:
 $ terraform validate
 
 $ terraform plan
 
 # Deploy the infrastructure:
-$ terraform apply -auto-approve
+$ terraform apply
 ```
 
 Once `terraform apply` completes, the private development infrastructure is deployed.
-You can now play around in your isolated copy of the infrastructure, testing and updating the changes, making sure the changes work as expected before proposing the changes to a shared (prod/dev) environment.
+You can now play around in your isolated copy of the infrastructure, testing and updating the changes, making sure the changes work as expected before merging the changes.
 
 ## Destroying Playground Environment
 Once the configuration changes have been tested, the private development environment can be destroyed:
 ```bash
 # Destroy the private terraform workspace
-$ ../playground/terraform-playground.sh destroy
+$ ./terraform-playground.sh destroy
 ```
 The above command removes all the resources that were created for the private development environment.
@@ -90,4 +80,3 @@ The above command removes all the resources that were created for the private de
 ## References
 - Terraform workspaces: https://developer.hashicorp.com/terraform/cli/workspaces
 - How to manage multiple environments with Terraform using workspaces: https://blog.gruntwork.io/how-to-manage-multiple-environments-with-terraform-using-workspaces-98680d89a03e
-

From b17ceec0cdee5f887ff7783272bbfee270a82a3b Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Mon, 5 Feb 2024 12:46:09 +0200
Subject: [PATCH 82/88] terraform: fixes from testing documentation

Signed-off-by: Henri Rosten
---
 terraform/main.tf           | 4 ++--
 terraform/terraform-init.sh | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/terraform/main.tf b/terraform/main.tf
index 584eb983..6696f02e 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -53,7 +53,7 @@ locals {
   # this is a workaround to missing asserts in terraform:
   assert_workspace_not_default = regex(
     (terraform.workspace == "default") ? 
- "((Force invalid regex pattern)\n\nERROR: default workspace not allowed" : "", "") + "((Force invalid regex pattern)\n\nERROR: workspace 'default' is not allowed" : "", "") # Short name of the Azure region, see: # https://github.com/claranet/terraform-azurerm-regions/blob/master/REGIONS.md @@ -80,7 +80,7 @@ locals { num_builders = 1 } prod = { - vm_size_binarycache = "Standard_D2_v2" + vm_size_binarycache = "Standard_D2_v3" vm_size_builder = "Standard_D8_v3" vm_size_controller = "Standard_D8_v3" num_builders = 2 diff --git a/terraform/terraform-init.sh b/terraform/terraform-init.sh index 56fc337e..80df1a77 100755 --- a/terraform/terraform-init.sh +++ b/terraform/terraform-init.sh @@ -63,7 +63,7 @@ init_persistent () { pushd "$MYDIR/persistent" >/dev/null terraform init > /dev/null # Default persistent instance: 'eun' (northeurope) - terraform workspace select eun 2>/dev/null || terraform workspace new eun + terraform workspace select eun &>/dev/null || terraform workspace new eun import_bincache_sigkey "prod" import_bincache_sigkey "dev" echo "[+] Applying possible changes" From 831bd4e555854e469edc3ec977a0f1c5dc0c4207 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 6 Feb 2024 13:07:46 +0200 Subject: [PATCH 83/88] flake: move back to nixos-23.11 This reverts 3ba044e45ad0a0e7667de89d8cab87fb5dc8215b moving the nixpkgs revision back to b0b2c5445c64191fd8d0b31f2b1a34e45a64547d from 23.11 which is the same nixpkgs version as what was used in main-branch already. Signed-off-by: Henri Rosten --- flake.lock | 50 +++++++++++++++++++++++++------------------------- flake.nix | 2 +- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/flake.lock b/flake.lock index ad4ffbba..beae49b3 100644 --- a/flake.lock +++ b/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1701775991, - "narHash": "sha256-/51DaSTzoW+wQfj5P9EnTbSxixDFjjhfnGdMKcSp+is=", + "lastModified": 1704318910, + "narHash": "sha256-wOIJwAsnZhM0NlFRwYJRgO4Lldh8j9viyzwQXtrbNtM=", "owner": "nix-community", "repo": "disko", - "rev": "f84c3684900d11cf19f530070d32d55f0ed51374", + "rev": "aef9a509db64a081186af2dc185654d78dc8e344", "type": "github" }, "original": { @@ -57,11 +57,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1701473968, - "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=", + "lastModified": 1704152458, + "narHash": "sha256-DS+dGw7SKygIWf9w4eNBUZsK+4Ug27NwEWmn2tnbycg=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5", + "rev": "88a2cd8166694ba0b6cb374700799cec53aef527", "type": "github" }, "original": { @@ -92,11 +92,11 @@ "utils": "utils" }, "locked": { - "lastModified": 1688488021, - "narHash": "sha256-vn6xkx4g2q/qykU+jdQYyGSPKFmGePuhGujAdmlHx1Y=", + "lastModified": 1702912615, + "narHash": "sha256-qseX+/8drgwxOb1I3LKqBYMkmyeI5d5gmHqbZccR660=", "owner": "aristanetworks", "repo": "nix-serve-ng", - "rev": "f3931b8120b1ca663da280e11659c745e2e9ad1b", + "rev": "21e65cb4c62b5c9e3acc11c3c5e8197248fa46a4", "type": "github" }, "original": { @@ -107,11 +107,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1688403656, - "narHash": "sha256-zmNai3dKWUCKpKubPWsEJ1Q7od96KebWVDJNCnk+fr0=", + "lastModified": 1707171055, + "narHash": "sha256-7ZiKRdhrScsDfhDkGy8yJWAT6BfHqa8PYMX04roU03k=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "453da3c28f7a95374b73d1f3fd665dd40e6049e9", + "rev": "4b1aab22192b787355733c9495d47f4c66af084c", "type": "github" }, "original": { @@ -124,11 +124,11 @@ "nixpkgs-lib": { "locked": { "dir": "lib", - 
"lastModified": 1701253981, - "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=", + "lastModified": 1703961334, + "narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58", + "rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9", "type": "github" }, "original": { @@ -141,16 +141,16 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1703092220, - "narHash": "sha256-O1W4RXGNCXVOOfFr6AyOZKS+2gAviatUBZwBzZEPeFc=", + "lastModified": 1704295289, + "narHash": "sha256-9WZDRfpMqCYL6g/HNWVvXF0hxdaAgwgIGeLYiOhmes8=", "owner": "nixos", "repo": "nixpkgs", - "rev": "e3f251c662bc525e4bae3edfa3fc67e52d690d4f", + "rev": "b0b2c5445c64191fd8d0b31f2b1a34e45a64547d", "type": "github" }, "original": { "owner": "nixos", - "ref": "master", + "ref": "nixos-23.11", "repo": "nixpkgs", "type": "github" } @@ -177,11 +177,11 @@ ] }, "locked": { - "lastModified": 1701728052, - "narHash": "sha256-7lOMc3PtW5a55vFReBJLLLOnopsoi1W7MkjJ93jPV4E=", + "lastModified": 1703991717, + "narHash": "sha256-XfBg2dmDJXPQEB8EdNBnzybvnhswaiAkUeeDj7fa/hQ=", "owner": "mic92", "repo": "sops-nix", - "rev": "e91ece6d2cf5a0ae729796b8f0dedceab5107c3d", + "rev": "cfdbaf68d00bc2f9e071f17ae77be4b27ff72fa6", "type": "github" }, "original": { @@ -212,11 +212,11 @@ ] }, "locked": { - "lastModified": 1701682826, - "narHash": "sha256-2lxeTUGs8Jzz/wjLgWYmZoXn60BYNRMzwHFtxNFUDLU=", + "lastModified": 1704233915, + "narHash": "sha256-GYDC4HjyVizxnyKRbkrh1GugGp8PP3+fJuh40RPCN7k=", "owner": "numtide", "repo": "treefmt-nix", - "rev": "affe7fc3f5790e1d0b5ba51bcff0f7ebe465e92d", + "rev": "e434da615ef74187ba003b529cc72f425f5d941e", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index eabe93ee..75e918a0 100644 --- a/flake.nix +++ b/flake.nix @@ -6,7 +6,7 @@ inputs = { # Nixpkgs - nixpkgs.url = "github:nixos/nixpkgs/master"; + nixpkgs.url = "github:nixos/nixpkgs/nixos-23.11"; # Allows us to structure the flake with the NixOS module system flake-parts.url = "github:hercules-ci/flake-parts"; flake-root.url = "github:srid/flake-root"; From b2c13e6d736897201432e9b5b891bb4d456864d9 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 6 Feb 2024 15:24:58 +0200 Subject: [PATCH 84/88] Cleanup unnecessary configurations Signed-off-by: Henri Rosten --- .sops.yaml | 15 --- hosts/azure-common.nix | 41 --------- hosts/default.nix | 5 - hosts/ghafhydra/configuration.nix | 67 -------------- hosts/ghafhydra/secrets.yaml | 43 --------- services/hydra/create-jobsets.sh | 84 ----------------- services/hydra/default.nix | 148 ------------------------------ tasks.py | 14 +-- terraform/azarm/secrets.yaml | 29 ++---- 9 files changed, 15 insertions(+), 431 deletions(-) delete mode 100644 hosts/azure-common.nix delete mode 100644 hosts/ghafhydra/configuration.nix delete mode 100644 hosts/ghafhydra/secrets.yaml delete mode 100644 services/hydra/create-jobsets.sh delete mode 100644 services/hydra/default.nix diff --git a/.sops.yaml b/.sops.yaml index 2c7e5d48..8f4e52d7 100644 --- a/.sops.yaml +++ b/.sops.yaml @@ -3,9 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 keys: - - &build01 age1tcp86swx4c8y8ej666k27lwca60j0x5tf4mcnw459ccec4am9vqqg2ht9d - &flokli age1lvpj49ewyx9a4uxevl05wfevmqld2d25juc65acjqpmerzdpc9kq2kxdgs - - &ghafhydra age1qnufx7gvz5kmm48nvdma4chxd4p0lca88f5fsyce8lrae6gp2a8sul692y - &hrosten age1hc6hszepd5xezxkgd3yx74pn3scxjm5w6px48m4rq9yj7w6rke7q72zhgn - &karim age122lvqyrdqz30fkfututykl0yle9u63u2em6e4aut7e5draws83ns3npt3a - &jrautiola 
age15jq5gjjd7ypsdlqfjtqy4red57v8ggqq9na6u3xffznu678nydpsuuwjg0 @@ -15,21 +13,8 @@ creation_rules: - path_regex: terraform/azarm/secrets.yaml$ key_groups: - age: - - *flokli - *hrosten - *karim - - path_regex: hosts/ghafhydra/secrets.yaml$ - key_groups: - - age: - - *flokli - - *hrosten - - *ghafhydra - - path_regex: hosts/build01/secrets.yaml$ - key_groups: - - age: - - *flokli - - *hrosten - - *build01 - path_regex: hosts/binarycache/secrets.yaml$ key_groups: - age: diff --git a/hosts/azure-common.nix b/hosts/azure-common.nix deleted file mode 100644 index f04381ec..00000000 --- a/hosts/azure-common.nix +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 -{inputs, ...}: { - require = [ - "${inputs.nixpkgs}/nixos/modules/virtualisation/azure-agent.nix" - ]; - virtualisation.azure.agent.enable = true; - boot.kernelParams = ["console=ttyS0" "earlyprintk=ttyS0" "rootdelay=300" "panic=1" "boot.panic_on_fail"]; - boot.initrd.kernelModules = ["hv_vmbus" "hv_netvsc" "hv_utils" "hv_storvsc"]; - boot.loader.systemd-boot.enable = true; - boot.loader.efi.canTouchEfiVariables = true; - boot.loader.timeout = 0; - boot.loader.grub.configurationLimit = 0; - boot.growPartition = true; - - # Ref: - # - https://github.com/NixOS/nixpkgs/blob/8efd5d1e283604f75a808a20e6cde0ef313d07d4/nixos/modules/virtualisation/azure-common.nix#L44 - # - https://learn.microsoft.com/en-us/troubleshoot/azure/virtual-machines/troubleshoot-device-names-problems - services.udev.extraRules = '' - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:0", ATTR{removable}=="0", SYMLINK+="disk/by-lun/0", - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:1", ATTR{removable}=="0", SYMLINK+="disk/by-lun/1", - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:2", ATTR{removable}=="0", SYMLINK+="disk/by-lun/2" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:3", ATTR{removable}=="0", SYMLINK+="disk/by-lun/3" - - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:4", ATTR{removable}=="0", SYMLINK+="disk/by-lun/4" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:5", ATTR{removable}=="0", SYMLINK+="disk/by-lun/5" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:6", ATTR{removable}=="0", SYMLINK+="disk/by-lun/6" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:7", ATTR{removable}=="0", SYMLINK+="disk/by-lun/7" - - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:8", ATTR{removable}=="0", SYMLINK+="disk/by-lun/8" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:9", ATTR{removable}=="0", SYMLINK+="disk/by-lun/9" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:10", ATTR{removable}=="0", SYMLINK+="disk/by-lun/10" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:11", ATTR{removable}=="0", SYMLINK+="disk/by-lun/11" - - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:12", ATTR{removable}=="0", SYMLINK+="disk/by-lun/12" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", 
KERNELS=="?:0:0:13", ATTR{removable}=="0", SYMLINK+="disk/by-lun/13" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:14", ATTR{removable}=="0", SYMLINK+="disk/by-lun/14" - ENV{DEVTYPE}=="disk", KERNEL!="sda" SUBSYSTEM=="block", SUBSYSTEMS=="scsi", KERNELS=="?:0:0:15", ATTR{removable}=="0", SYMLINK+="disk/by-lun/15" - ''; -} diff --git a/hosts/default.nix b/hosts/default.nix index 19f5d4b2..4ca43c52 100644 --- a/hosts/default.nix +++ b/hosts/default.nix @@ -9,7 +9,6 @@ }: { flake.nixosModules = { # shared modules - azure-common = import ./azure-common.nix; qemu-common = import ./qemu-common.nix; ficolo-common = import ./ficolo-common.nix; common = import ./common.nix; @@ -20,10 +19,6 @@ # make self and inputs available in nixos modules specialArgs = {inherit self inputs;}; in { - ghafhydra = lib.nixosSystem { - inherit specialArgs; - modules = [./ghafhydra/configuration.nix]; - }; binarycache = lib.nixosSystem { inherit specialArgs; modules = [./binarycache/configuration.nix]; diff --git a/hosts/ghafhydra/configuration.nix b/hosts/ghafhydra/configuration.nix deleted file mode 100644 index f4b0afdb..00000000 --- a/hosts/ghafhydra/configuration.nix +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 -{ - self, - inputs, - lib, - config, - ... -}: { - sops.defaultSopsFile = ./secrets.yaml; - sops.secrets.hydra-admin-password.owner = "hydra"; - sops.secrets.id_buildfarm = {}; - sops.secrets.id_buildfarm.owner = "hydra-queue-runner"; - sops.secrets.cache-sig-key.owner = "root"; - - imports = lib.flatten [ - (with inputs; [ - sops-nix.nixosModules.sops - disko.nixosModules.disko - ]) - (with self.nixosModules; [ - common - azure-common - generic-disk-config - service-hydra - service-openssh - service-binary-cache - service-nginx - user-hrosten - user-tervis - ]) - ]; - - networking.hostName = "ghafhydra"; - nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; - boot.loader.grub = { - devices = ["/dev/sda"]; - efiSupport = true; - efiInstallAsRemovable = true; - }; - - # TODO: have a separate configuration for ghafhydra-dev? 
- # Ref: https://nixos.org/manual/nixos/stable/#module-security-acme - security.acme.defaults.email = "trash@unikie.com"; - security.acme.acceptTerms = true; - services.nginx = { - virtualHosts = { - "ghafhydra.northeurope.cloudapp.azure.com" = { - forceSSL = true; - enableACME = true; - locations."/".proxyPass = "http://localhost:${toString config.services.hydra.port}"; - }; - }; - }; - - # TODO: demo with static IP: - networking.useDHCP = false; - networking.nameservers = ["1.1.1.1" "8.8.8.8"]; - networking.defaultGateway = "10.0.2.1"; - networking.interfaces.eth0.ipv4.addresses = [ - { - address = "10.0.2.4"; - prefixLength = 24; - } - ]; -} diff --git a/hosts/ghafhydra/secrets.yaml b/hosts/ghafhydra/secrets.yaml deleted file mode 100644 index cf325354..00000000 --- a/hosts/ghafhydra/secrets.yaml +++ /dev/null @@ -1,43 +0,0 @@ -hydra-admin-password: ENC[AES256_GCM,data:sF0GBx+VVJpL6aAd5W683n+sbLqUVTJ7AlJnyGg=,iv:qhau81DNMPiouDZRLi8EnLjgd7a0CeQsPofrNEAB+JM=,tag:KwI7W1cEqX5kGDpG4iQQTA==,type:str] -cache-sig-key: ENC[AES256_GCM,data:ykBsRJ7VApeUXrBNEYPFlTltNRK6s/ysAfBToUmkhrsHdA4Wiu9qJiydDX37UNmI+VDIQy80ut+fPeGwOJzun4x+qE8ZYLYbVuEFFWW/9Ra+kioElJ0D4y0CHmlcmdbdQwrnBZleLDI=,iv:by52+aLVOnD6tQNudDU2zqfFhnc6JBYs7M9RTJmmmTs=,tag:uIqnDPQ85kRXmLrkvnLTLw==,type:str] -id_buildfarm: ENC[AES256_GCM,data:q+7S0ilByAaLBQHRXKPqQlzr+Jq5HsHiU4iUweCNa/YsCLOau0gQaj0Nbg1CBa/8KmCcJ+ET6Tj6kPEaNMMneWy/zaBuXXIni0L1f+iLnoqqpnaM/0kryI9BpVoIIZu5pW55NI3A4BUpo1GCbZgMHjr1rmoHFq0JdYxY0AXnvcERR/+3ZIS1GC56GUMqSiRI4G47a1fzpgZbc4b4VeyZc3kfEocrlVuDuA6cVAnwgzUwEfx46SVLfR6tGkSpiYSSU5jc5lYuO97QxJn/gVCoWqoSGlzKNLsJuoViVT2c3nzTLofC4zrH3/EpZUXfv3ijcgm+GH2rCGqK79zud6PwsRLmgFjLpgdzRyJkPfOHsAWyUURRp5qbeZg+H/IY7UVrTXSkyFSCW4TH9DM8nGRJcwe6NniPovahG6/cQ8llA/y4Ylr7w0Eg/J6zfODput/GF3CEYeruPScAqOCK0bst4BPB8d6dwAuQGm8P1lwgvXx4lUQCyTL1IttKE7Ex5IqV6cA/f4rtlZpkAF2APHm/n9W4Bk5tq6y8fKvG,iv:GpvyzXi3FE6WIS38NaK6wpjiGJ2yA9aaxRU0QCHL124=,tag:+9rGTiThCxVCSCYKSR00LQ==,type:str] -cache-public: ENC[AES256_GCM,data:ZwkGONxmqRcXlxLcTVQ38ZucxHjB3WmLUaow5N/Q7k0B/OjdQtgWj8hBXp/Ugl+fAJMd3uJSXKV8UM+K,iv:DshL6HetYtr7es+4GERAcrn58uFF8+/+NK1wP3AxGEY=,tag:Uf/mGbBYRtMHKHvx5lzxxw==,type:str] -ssh_host_ed25519_key: ENC[AES256_GCM,data:NdyGM4655TSu1wnKaEsUWZ3c2HjklDHyvvl1RX/5apUaOa8RE5IGn/+wGRYSSXcesZ7teMJYf0CjchTQaDtjswzESrO0kRwpYJXydI2vg6dlwEir6IlDnllsUS3gsUITVxNVElR+f3h5wpzX3NrACi3YTRYdg9nvv4BrCUI6ffP1ndZfgUQMp4OrcJ8DmI1onlvfRr32ukZkiCtbayTdpwgeSYwTvyT5lkwYd5f0Ns3JVl2w7fXuaFyBnZ/2dj/fdUG9bve8XqujBR3uRHQMxr+/xnHMIn6Lht3ZDULdF4u1eO24YhFsj3mMN6hdBwWUSlwjg9ksbG9OuXTEgZVP5IZGqL8p+1F8nqvCGD7oGj0WNv+jKjuSr44Y/RY7B0qZqnohVZ9iVz76ekLDgcW80CSg5RyHCTbdeZZ8F1ct8nSXFmap04Hpto6xRtd7WPP5eB25F3mQdSIYkJRKwQ+TlWSS9oaVOZsjqwuvowk+BNi3eBxZGA+IBf5mVK9iU4lAWKtA0sH1T/8fBf8fIiKo,iv:2DIpljm/2qbdb+5f3Ox2G0UrL17fiEzyIK7vAkdSREc=,tag:VdCeN2WbNqaxgJuvLtH5WA==,type:str] -sops: - kms: [] - gcp_kms: [] - azure_kv: [] - hc_vault: [] - age: - - recipient: age1lvpj49ewyx9a4uxevl05wfevmqld2d25juc65acjqpmerzdpc9kq2kxdgs - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB4eW45bGhKQjRMOG5WcDNK - MzNPc1pIR2U3c2YzekgwMi95REFrSmxqN0hnCjJZNkhtS3d0T1hUUGtXd09wQUMz - OUU5d1dtdlQ3T0VHelFHLzg2ZFFhajgKLS0tIENyL3lUdU9weTcvOENCYWtUN3RJ - NjJqQnBsWUVld213TUk0alEzRElTZG8KZcG5DTWwpH8VyPQ6udZ7abTdj4Qtgmtu - dFcCWw6j5ztv4+sVyxN9CXvLW2Dz8RLujjspUJOV5+rtJP1vgaLOEg== - -----END AGE ENCRYPTED FILE----- - - recipient: age1hc6hszepd5xezxkgd3yx74pn3scxjm5w6px48m4rq9yj7w6rke7q72zhgn - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - 
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBkdVJHRkVUNm9DM1dmQ1Ba - MHFFb0hvbkNwd0wrblJWODNjUERLd2tXY0FFClU2UzNZZk9KV1VTT0hKNVA3eloz - eGtDblpvU3JtYWY3QVB3RFZMd0pBb1EKLS0tIHdXTjFNekNZNHJUUWQ2RkhzQ0pP - QWZrbkFOZitBTS85NmxYUy96M1IrUzQKl4xyxbyBFbJgzBQl65A7wp+SWD81a8zO - 7eVHSMWH69zBNgEFbK7KH7t+9EX/cmvhnsN2EtWRqTsXwSwdKsCUgw== - -----END AGE ENCRYPTED FILE----- - - recipient: age1qnufx7gvz5kmm48nvdma4chxd4p0lca88f5fsyce8lrae6gp2a8sul692y - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBaeE9uQ2thRE12MjRiVnMy - NGk3WDNOTk1WTXd0OTg5T09IeTBhdHg2RWdJClBwUWlTMUhhS3B0Vm8walFCRnNV - dDRUckx3R3RQVmhNcTNzTjB6cnRxdTAKLS0tIDcyQmxzbEhHVEZBNkNUeEc2YmpQ - UStWL2MzQWpNZys4U0pTTkhwMkVPVG8KKHKsStQZ35KVPdxTjlZXUFmOuAOVEEJl - +arlhW7G2GXE+gtqu2L4vIMjyXzjEhuUkw8ejQcQJH+dQTEsJvDyWA== - -----END AGE ENCRYPTED FILE----- - lastmodified: "2023-10-27T09:35:39Z" - mac: ENC[AES256_GCM,data:mwgpj9kuJ5v1BiE7jkA7BwVcTynf36UFZX6YfZAMZyTy05d3UMosdLOYPvfUATsJ3Y2+e83LC9xYL7t74GWdC8z7513HtGsmm6IFVZz3AfsW5uWLllSAo3UBqpBdTW+6vpedE9W5Y2M9TK/IYOoZa0PcTe+VGUdhcO8vTEJVaQo=,iv:f/OOxbppPUsElte0Hwttv/AHOAGdgxP91FW6eYHHU3w=,tag:UDCc1sb/SUvR1lvvwFf06g==,type:str] - pgp: [] - unencrypted_suffix: _unencrypted - version: 3.7.3 diff --git a/services/hydra/create-jobsets.sh b/services/hydra/create-jobsets.sh deleted file mode 100644 index 5192bab8..00000000 --- a/services/hydra/create-jobsets.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) -# -# SPDX-License-Identifier: Apache-2.0 - -# Usage example -# URL=http://localhost:3000 ./create-jobsets.sh - -set -euo pipefail - -HYDRA_ADMIN_USERNAME=${HYDRA_ADMIN_USERNAME:-admin} -HYDRA_ADMIN_PASSWORD=${HYDRA_ADMIN_PASSWORD:-admin_pass} -URL=${URL:-http://localhost:3000} -PROJECT_NAME=${PROJECT_NAME:-"ghaf"} - -mycurl() { - curl --fail --referer "${URL}" -H "Accept: application/json" -H "Content-Type: application/json" "$@" -} - - -############################################################ -# Login -############################################################ - -echo "Logging to $URL with user $HYDRA_ADMIN_USERNAME" -cat >data.json <data.json <data.json < None: Decrypt host private key, print ssh and age public keys for `alias` config. Example usage: - inv print-keys --target ghafhydra-dev + inv print-keys --target binarycache-ficolo """ with TemporaryDirectory() as tmpdir: nixosconfig = _get_target(alias).nixosconfig @@ -260,7 +256,7 @@ def deploy(_c: Any, alias: str) -> None: Deploy the configuration for `alias`. Example usage: - inv deploy --alias ghafhydra-dev + inv deploy --alias binarycache-ficolo """ h = get_deploy_host(alias) res = h.run_local( @@ -414,7 +410,7 @@ def build_local(_c: Any, alias: str = "") -> None: If `alias` is not specificied, builds all TARGETS. Example usage: - inv build-local --alias ghafhydra-dev + inv build-local --alias binarycache-ficolo """ if alias: target_configs = [_get_target(alias).nixosconfig] @@ -451,7 +447,7 @@ def reboot(_c: Any, alias: str) -> None: Reboot host identified as `alias`. 
Example usage: - inv reboot --alias ghafhydra-dev + inv reboot --alias binarycache-ficolo """ h = get_deploy_host(alias) h.run("sudo reboot &") diff --git a/terraform/azarm/secrets.yaml b/terraform/azarm/secrets.yaml index 26e67aa9..8646344d 100644 --- a/terraform/azarm/secrets.yaml +++ b/terraform/azarm/secrets.yaml @@ -7,32 +7,23 @@ sops: azure_kv: [] hc_vault: [] age: - - recipient: age1lvpj49ewyx9a4uxevl05wfevmqld2d25juc65acjqpmerzdpc9kq2kxdgs - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSAxQ21BVkd0c2NVeWxNTmlq - ZkJtOTd6TTdGbFhWUCsvZGtudmdsdFBZOGg4CmEyR2dLTEtsa1ZHY3FHdm96a2dl - WDIxVFpUb0RJU2llNUtKNnFNQk9VWmcKLS0tIFNGdzNOZERtUTgzdEI1cU56TTI2 - cnhObWtZRCtrVnV1K3pMbW43dzJNYjAKatS8i91n3X2fn7TNAEWkOx66yAerHRrx - o0o9vm6925HcUVgb1raCoDpvG45IQPsJdnDcXjGIZq0AnE/NScuyng== - -----END AGE ENCRYPTED FILE----- - recipient: age1hc6hszepd5xezxkgd3yx74pn3scxjm5w6px48m4rq9yj7w6rke7q72zhgn enc: | -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB4cVRzSnFqeVZTTEIzR0ht - WGphTkZJTlJYcGhWMEc4QWo2dFo3QlU1MHg4CmxJWXNwWkRPbkx0L2laK0xRNFpY - ZlUrYXo0MjRJcWhYS2dpaDIwZHpQTGsKLS0tIExSckVQUzNIa0JQNDVPZ3hKV0J1 - MW40UjNZVndMcEtlK3BFcWZwRHI5RTgKcuJhKNOIrCwC0y7jz8OwGsaiK1TqZjcb - 7wT3kpxrHvG8lDw/VF1Ym8lkLfnkH3UwFEjnhHJAjy/PjOpeEUReyg== + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBWRk9iaUduWjdiNWM2eWJJ + K1E2VVFtZlB6QXY2S0ZMNnpBaGpteUtGU3owCmt2Ty9kTjdqUmxVMVhwK283TEdP + ei9BQUkvN2FiRm9lYXJrZWM5VEdyVEEKLS0tIDJxQTh3OVFHMmJKTWY1TDd2Vmwx + RVA2OUp5eDRMWTgzSXRsTTFCcWF3QlUKNePT1opQB8oBQcSyD/YVuPMOWyF+P54+ + l3ksJMr/ZX1G46FpuELMfQj7P1E3CG68AQPdiZiAlyK46VMflJYhdQ== -----END AGE ENCRYPTED FILE----- - recipient: age122lvqyrdqz30fkfututykl0yle9u63u2em6e4aut7e5draws83ns3npt3a enc: | -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBuRUpDL3RUNzhOUDdGa1RO - L1BWKy9Lb1JacnBXTndkRnNTVml4cE5jRHc4CkxIZk1DeXBUSTh1OWhtbWRZTzBI - NXpTMk5NekVNOHhBVll3aTVoMVZxMFUKLS0tIEUySlZiNkgxR2xFazhaU2toZ2E0 - cXd2aHFzdlNONC8vMmRqaUVCamFxL3cKCqBr0wqn5KER6OpOPggIbNa53923eLDb - Iqmq6MrfrEtp5VnoJHB1NtnF8zUeWZMRzk1A/3Ytr3sCPrhUOAxPQg== + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB0VGNMc1FleTkzQ2ZISGxP + ZmdHUGJqYktseDRLSytLNlgxdkpxNzRiV1RnCnV2MnNqd3kzRTVwUFUvaHR1ZlhH + SVh6RUxHQmk4V2FLcU1TU3JNSHUzMTAKLS0tIGVpT1EybW54dnZUaW5KR2FNWng1 + VG5qOEtlQXJDMjhjaFo2WTNCaGgyVkkKbM8QF0CWJHg+XfASY2G47BcmRtL6E/Sp + t25pP7CWrmg7WYnAIbizEIwR0ZVmwLqa6NVH+kI0Q/nKOthURAPSyA== -----END AGE ENCRYPTED FILE----- lastmodified: "2023-11-07T10:30:23Z" mac: ENC[AES256_GCM,data:FljnbBZO4U3OlwxdHbL6WICo4p8LfkVXGtoFSeOfsqGfXGSrRZC3hetKMtM0z26K1ubJRKmFnEzosQqccHn+e2xgMfIp28t4edZo+engbOiiMgU2DO9eHkG3FwZJt4ue3WxDG40rzQ4OvcNpOrUqDpFBbo8/wbKy3a9S7BibqO0=,iv:pWTE5tpnORHigHvCoeQ1hJrzQW5a9TzW+Ah1wZNn67s=,tag:3kihD1YCfcEZMx1xw/pFWQ==,type:str] From d0061ef45de4952dfcdfb73b1a998515e8fa3a0c Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Wed, 7 Feb 2024 06:57:18 +0200 Subject: [PATCH 85/88] scratch-disk: comment current status Signed-off-by: Henri Rosten --- hosts/azure-scratch-store-common.nix | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/hosts/azure-scratch-store-common.nix b/hosts/azure-scratch-store-common.nix index 5dcfea78..8cd3a79b 100644 --- a/hosts/azure-scratch-store-common.nix +++ b/hosts/azure-scratch-store-common.nix @@ -1,6 +1,30 @@ # SPDX-FileCopyrightText: 2023 Technology Innovation Institute (TII) # # SPDX-License-Identifier: Apache-2.0 +# +# This configuration is currently not used, but kept here for reference. 
+# The reason this isn't currently used is that the 'setup-resource-disk'
+# service that's set up in this file systematically fails on the first
+# boot-up, which then cascades into other service failures.
+# It fails to mount the resource disk in initrd on the first boot.
+#
+# The changes from this file were originally introduced in the following PR
+# https://github.com/tiiuae/ghaf-infra/pull/35 in commit:
+# https://github.com/tiiuae/ghaf-infra/commit/7a7a1e40b24b6776c70f7e030c7608ed90b40e45
+# Later, the scratch disk was disabled due to the reason explained above
+# and worked around by mounting /nix/store on the OS disk with the following change:
+# https://github.com/tiiuae/ghaf-infra/commit/f143ac92517a3588d038e88eda09f19471e42de3
+#
+# Note: if we decide to re-enable this config at some later time, it's worth
+# mentioning that originally this configuration did not work on nixos-23.11
+# as described here:
+# https://github.com/tiiuae/ghaf-infra/commit/e9b7db1c02c459c0b8d54a4d65aac1d400f4035d
+#
+# At the time of writing, the ghaf-infra main branch follows 23.11:
+# https://github.com/tiiuae/ghaf-infra/pull/74/commits/dd42bf9191f8133aaedb65aebb5756d8b4d567af
+# which means these changes would not work without also changing the ghaf-infra
+# nixpkgs reference.
+#
 {
   pkgs,
   utils,

From 58af59fd0b6a2809a97b67e7f87959dd3cedb748 Mon Sep 17 00:00:00 2001
From: Henri Rosten
Date: Wed, 7 Feb 2024 07:13:38 +0200
Subject: [PATCH 86/88] hosts: move azure host configs to subdir

Move azure nix host configurations to their own subdirectory to avoid
confusion between the ficolo (e.g. 'binarycache') and azure
('binary-cache') nix configurations.

Signed-off-by: Henri Rosten
---
 .../binary-cache/configuration.nix               |  2 +-
 hosts/{ => azure}/builder/configuration.nix      |  2 +-
 .../jenkins-controller/configuration.nix         |  2 +-
 hosts/default.nix                                | 24 +++++++++----------
 terraform/binary-cache.tf                        |  2 +-
 terraform/builder.tf                             |  2 +-
 terraform/jenkins-controller.tf                  |  2 +-
 7 files changed, 18 insertions(+), 18 deletions(-)
 rename hosts/{ => azure}/binary-cache/configuration.nix (99%)
 rename hosts/{ => azure}/builder/configuration.nix (98%)
 rename hosts/{ => azure}/jenkins-controller/configuration.nix (99%)

diff --git a/hosts/binary-cache/configuration.nix b/hosts/azure/binary-cache/configuration.nix
similarity index 99%
rename from hosts/binary-cache/configuration.nix
rename to hosts/azure/binary-cache/configuration.nix
index 90664861..8d347a28 100644
--- a/hosts/binary-cache/configuration.nix
+++ b/hosts/azure/binary-cache/configuration.nix
@@ -9,7 +9,7 @@
   ...
 }: {
   imports = [
-    ../azure-common-2.nix
+    ../../azure-common-2.nix
     self.nixosModules.service-openssh
   ];
 
diff --git a/hosts/builder/configuration.nix b/hosts/azure/builder/configuration.nix
similarity index 98%
rename from hosts/builder/configuration.nix
rename to hosts/azure/builder/configuration.nix
index 91aa04de..bc2e7731 100644
--- a/hosts/builder/configuration.nix
+++ b/hosts/azure/builder/configuration.nix
@@ -8,7 +8,7 @@
   ...
}: { imports = [ - ../azure-common-2.nix + ../../azure-common-2.nix self.nixosModules.service-openssh self.nixosModules.service-remote-build ]; diff --git a/hosts/jenkins-controller/configuration.nix b/hosts/azure/jenkins-controller/configuration.nix similarity index 99% rename from hosts/jenkins-controller/configuration.nix rename to hosts/azure/jenkins-controller/configuration.nix index bbece7db..715478be 100644 --- a/hosts/jenkins-controller/configuration.nix +++ b/hosts/azure/jenkins-controller/configuration.nix @@ -52,7 +52,7 @@ ''; in { imports = [ - ../azure-common-2.nix + ../../azure-common-2.nix self.nixosModules.service-openssh ]; diff --git a/hosts/default.nix b/hosts/default.nix index 4ca43c52..c1dd4146 100644 --- a/hosts/default.nix +++ b/hosts/default.nix @@ -19,17 +19,21 @@ # make self and inputs available in nixos modules specialArgs = {inherit self inputs;}; in { - binarycache = lib.nixosSystem { + az-binary-cache = lib.nixosSystem { inherit specialArgs; - modules = [./binarycache/configuration.nix]; + modules = [./azure/binary-cache/configuration.nix]; + }; + az-builder = lib.nixosSystem { + inherit specialArgs; + modules = [./azure/builder/configuration.nix]; }; - binary-cache = lib.nixosSystem { + az-jenkins-controller = lib.nixosSystem { inherit specialArgs; - modules = [./binary-cache/configuration.nix]; + modules = [./azure/jenkins-controller/configuration.nix]; }; - builder = lib.nixosSystem { + binarycache = lib.nixosSystem { inherit specialArgs; - modules = [./builder/configuration.nix]; + modules = [./binarycache/configuration.nix]; }; ficolobuild3 = lib.nixosSystem { inherit specialArgs; @@ -39,17 +43,13 @@ inherit specialArgs; modules = [./ficolobuild/build4.nix]; }; - jenkins-controller = lib.nixosSystem { + monitoring = lib.nixosSystem { inherit specialArgs; - modules = [./jenkins-controller/configuration.nix]; + modules = [./monitoring/configuration.nix]; }; prbuilder = lib.nixosSystem { inherit specialArgs; modules = [./prbuilder/configuration.nix]; }; - monitoring = lib.nixosSystem { - inherit specialArgs; - modules = [./monitoring/configuration.nix]; - }; }; } diff --git a/terraform/binary-cache.tf b/terraform/binary-cache.tf index 5719fb4a..3eafc72a 100644 --- a/terraform/binary-cache.tf +++ b/terraform/binary-cache.tf @@ -5,7 +5,7 @@ module "binary_cache_image" { source = "./modules/azurerm-nix-vm-image" - nix_attrpath = "outputs.nixosConfigurations.binary-cache.config.system.build.azureImage" + nix_attrpath = "outputs.nixosConfigurations.az-binary-cache.config.system.build.azureImage" nix_entrypoint = "${path.module}/.." name = "binary-cache" diff --git a/terraform/builder.tf b/terraform/builder.tf index 8cfb5831..99deddd0 100644 --- a/terraform/builder.tf +++ b/terraform/builder.tf @@ -5,7 +5,7 @@ module "builder_image" { source = "./modules/azurerm-nix-vm-image" - nix_attrpath = "outputs.nixosConfigurations.builder.config.system.build.azureImage" + nix_attrpath = "outputs.nixosConfigurations.az-builder.config.system.build.azureImage" nix_entrypoint = "${path.module}/.." 
name = "builder" diff --git a/terraform/jenkins-controller.tf b/terraform/jenkins-controller.tf index de997d07..540fce27 100644 --- a/terraform/jenkins-controller.tf +++ b/terraform/jenkins-controller.tf @@ -6,7 +6,7 @@ module "jenkins_controller_image" { source = "./modules/azurerm-nix-vm-image" - nix_attrpath = "outputs.nixosConfigurations.jenkins-controller.config.system.build.azureImage" + nix_attrpath = "outputs.nixosConfigurations.az-jenkins-controller.config.system.build.azureImage" nix_entrypoint = "${path.module}/.." name = "jenkins-controller" From b799c59d25639e0c23117bfc325e02171284a3a5 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Wed, 7 Feb 2024 07:18:18 +0200 Subject: [PATCH 87/88] hosts: rename azure-common-2.nix Signed-off-by: Henri Rosten --- hosts/{azure-common-2.nix => azure-common.nix} | 0 hosts/azure/binary-cache/configuration.nix | 2 +- hosts/azure/builder/configuration.nix | 2 +- hosts/azure/jenkins-controller/configuration.nix | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename hosts/{azure-common-2.nix => azure-common.nix} (100%) diff --git a/hosts/azure-common-2.nix b/hosts/azure-common.nix similarity index 100% rename from hosts/azure-common-2.nix rename to hosts/azure-common.nix diff --git a/hosts/azure/binary-cache/configuration.nix b/hosts/azure/binary-cache/configuration.nix index 8d347a28..b21d2744 100644 --- a/hosts/azure/binary-cache/configuration.nix +++ b/hosts/azure/binary-cache/configuration.nix @@ -9,7 +9,7 @@ ... }: { imports = [ - ../../azure-common-2.nix + ../../azure-common.nix self.nixosModules.service-openssh ]; diff --git a/hosts/azure/builder/configuration.nix b/hosts/azure/builder/configuration.nix index bc2e7731..4020db90 100644 --- a/hosts/azure/builder/configuration.nix +++ b/hosts/azure/builder/configuration.nix @@ -8,7 +8,7 @@ ... }: { imports = [ - ../../azure-common-2.nix + ../../azure-common.nix self.nixosModules.service-openssh self.nixosModules.service-remote-build ]; diff --git a/hosts/azure/jenkins-controller/configuration.nix b/hosts/azure/jenkins-controller/configuration.nix index 715478be..6821697d 100644 --- a/hosts/azure/jenkins-controller/configuration.nix +++ b/hosts/azure/jenkins-controller/configuration.nix @@ -52,7 +52,7 @@ ''; in { imports = [ - ../../azure-common-2.nix + ../../azure-common.nix self.nixosModules.service-openssh ]; From 263572757e243ae95f9af4117babd16a1b33f35f Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Fri, 9 Feb 2024 10:07:14 +0200 Subject: [PATCH 88/88] terraform: move caddy state to persistent Move caddy state disk to persistent. Binary-cache vm stores let's encrypt certificates and data on the caddy state disk. This state disk needs to be stored in 'persistent' data, otherwise there will be issues with certificate authority rate limits when development environments are deployed and consequently destroyed. 
Signed-off-by: Henri Rosten
---
 terraform/README.md                           | 39 +++++++++-
 terraform/binary-cache.tf                     | 16 +---
 terraform/main.tf                             |  6 ++
 .../persistent/workspace-specific/main.tf     | 76 +++++++++++++++++++
 terraform/playground/terraform-playground.sh  |  5 +-
 terraform/terraform-init.sh                   | 18 ++++-
 6 files changed, 141 insertions(+), 19 deletions(-)
 create mode 100644 terraform/persistent/workspace-specific/main.tf

diff --git a/terraform/README.md b/terraform/README.md
index b0f720e8..7af6a6c1 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -54,8 +54,9 @@ terraform
 │   ├── binary-cache-sigkey
 │   ├── binary-cache-storage
 │   ├── builder-ssh-key
+│   └── workspace-specific
 ├── playground
-│   ├── terraform-playground.sh
+│   └── terraform-playground.sh
 ├── state-storage
 │   └── tfstate-storage.tf
 ├── modules
@@ -68,7 +69,7 @@
 ```
 - The `terraform` directory contains the root terraform deployment files with the VM configurations `binary-cache.tf`, `builder.tf`, and `jenkins-controller.tf` matching the components described in [README-azure.md](./README-azure.md) in its [components section](./README-azure.md#components).
 - The `terraform/azarm` directory contains the terraform configuration for Azure `aarch64` builder which is used from ghaf github-actions [build.yml workflow](https://github.com/tiiuae/ghaf/blob/e81ccfb41d75eda0488b6b4325aeccb8385ce960/.github/workflows/build.yml#L151) to build `aarch64` targets for authorized PRs pre-merge. `azarm` is disconnected from the root terraform module: it's a separate configuration with its own state.
-- The `terraform/persistent` directory contains the terraform configuration for parts of the infrastructure that are shared between the ghaf-infra development instances. An example of such persistent ghaf-infra resource is the binary cache storage as well as the binary cache signing key. There may be many 'persistent' infrastructure instances - currently `dev` and `prod` deployments have their own instances of the persistent resources. Section [Multiple Environments with Terraform Workspaces](./README.md#multiple-environments-with-terraform-workspaces) discusses this topic with more details.
+- The `terraform/persistent` directory contains the terraform configuration for parts of the infrastructure that are considered persistent - resources defined under `terraform/persistent` will not be removed even if the ghaf-infra instance is otherwise removed. An example of such a persistent ghaf-infra resource is the binary cache storage as well as the binary cache signing key. There may be many 'persistent' infrastructure instances - currently `dev` and `prod` deployments have their own instances of the persistent resources. Section [Multiple Environments with Terraform Workspaces](./README.md#multiple-environments-with-terraform-workspaces) discusses this topic in more detail.
 - The `terraform/playground` directory contains tooling to facilitate the usage of terraform workspaces in setting-up distinct copies of the ghaf-infra infrastructure, i.e. 'playground' `dev` environments. It also includes an [example test infrastructure](./playground/test-infra.tf) that allows deploying example infrastructure including just one nix VM, highlighting the use of `terraform/modules` to build and upload the nix image on Azure.
 - The `terraform/state-storage` directory contains the terraform configuration for the ghaf-infra remote backend state storage using Azure storage blob.
See section [Initializing Azure State and Persistent Data](./README.md#initializing-azure-state-and-persistent-data) for more details. - The `terraform/modules` directory contains terraform modules used from the ghaf-infra VM configurations to build, upload, and spin up Azure nix images. @@ -95,7 +96,7 @@ In addition to the shared terraform state, some of the infrastructure resources To support infrastructure development in isolated environments, this project uses [terraform workspaces](https://developer.hashicorp.com/terraform/cli/workspaces). The main reasons for using terraform workspaces include: - Different workspaces allow deploying different instances of ghaf-infra. Each instance has a completely separate state data, making it possible to deploy `dev`, `prod`, or even private development instances of ghaf-infra. This makes it possible to first develop and test infrastructure changes in a private development environment, before proposing changes to shared (e.g. `dev` or `prod`) environments. The configuration codebase is the same between all the environments, with the differentiation options defined in the [`main.tf`](./main.tf#L69). -- Parts of the ghaf-infra infrastructure are persistent and shared between different environments. As an example, private `dev` environments share the binary cache storage. This arrangement makes it possible to treat, for instance, `dev` and private ghaf-infra instances dispensable: ghaf-infra instances can be temporary and short-lived as it's easy to spin-up new environments without losing any valuable data. The persistent data is configured outside the root ghaf-infra terraform deployment in the `terraform/persistent` directory. There may be many 'persistent' infrastructure instances - currently `dev` and `prod` deployments have their own instances of the persistent resources. This means that `dev` and `prod` instances of ghaf-infra do **not** share any persistent data. As an example, `dev` and `prod` deployments of ghaf-infra have a separate binary cache storage. The binding to persistent resources from ghaf-infra is done in the [`main.tf`](./main.tf#L166) based on the terraform workspace name and resource location. +- Parts of the ghaf-infra infrastructure are persistent and shared between different environments. As an example, private `dev` environments share the binary cache storage. This arrangement makes it possible to treat, for instance, `dev` and private ghaf-infra instances dispensable: ghaf-infra instances can be temporary and short-lived as it's easy to spin-up new environments without losing any valuable data. The persistent data is configured outside the root ghaf-infra terraform deployment in the `terraform/persistent` directory. There may be many 'persistent' infrastructure instances - currently `dev` and `prod` deployments have their own instances of the persistent resources. This means that `dev` and `prod` instances of ghaf-infra do **not** share any persistent data. As an example, `dev` and `prod` deployments of ghaf-infra have a separate binary cache storage. The binding to persistent resources from ghaf-infra is done in the [`main.tf`](./main.tf#L166) based on the terraform workspace name and resource location. Persistent data initialization is automatically done with `terraform-init.sh` script. 
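+
+As a rough sketch (the workspace name `mydev` is hypothetical), the per-workspace initialization that `terraform-init.sh` automates under `terraform/persistent/workspace-specific` boils down to:
+
+```bash
+cd terraform/persistent/workspace-specific
+terraform init
+for ws in dev prod mydev; do
+  # Select the workspace, creating it on first use
+  terraform workspace select "$ws" || terraform workspace new "$ws"
+  terraform apply -auto-approve
+done
+```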
 - Currently, the following resources are defined 'persistent', meaning `dev` and `prod` instances do not share the following resources:
   - Binary cache storage: [`binary-cache-storage.tf`](./persistent/binary-cache-storage/binary-cache-storage.tf)
   - Binary cache signing key: [`binary-cache-sigkey.tf`](./persistent/binary-cache-sigkey/binary-cache-sigkey.tf)
@@ -177,4 +178,34 @@ Example fix:
 $ terraform import azurerm_virtual_machine_extension.deploy_ubuntu_builder /subscriptions//resourceGroups/rg-name-here/providers/Microsoft.Compute/virtualMachines/azarm/extensions/azarm-vmext
 # Ref: https://stackoverflow.com/questions/61418168/terraform-resource-with-the-id-already-exists
-```
\ No newline at end of file
+```
+
+#### Error: creating/updating Image
+```bash
+$ terraform apply
+...
+│ Error: creating/updating Image (Subscription: ""
+│ Resource Group Name: "ghaf-infra-dev"
+│ Image Name: ""): performing CreateOrUpdate: unexpected status 400 with error: InvalidParameter: The source blob https://.blob.core.windows.net/ghaf-infra-vm-images/.vhd is not accessible.
+│
+│   with module.builder_image.azurerm_image.default,
+│   on modules/azurerm-nix-vm-image/main.tf line 22, in resource "azurerm_image" "default":
+│   22: resource "azurerm_image" "default" {
+```
+Try running `terraform apply` again if you get an error similar to the one shown above.
+It's unclear why this error occasionally occurs; this issue should be analyzed in more detail.
+
+#### Error: Disk
+```bash
+$ terraform apply
+...
+│ Error: Disk (Subscription: ""
+│ Resource Group Name: "ghaf-infra-persistent-eun"
+│ Disk Name: "binary-cache-vm-caddy-state-dev") was not found
+│
+│   with data.azurerm_managed_disk.binary_cache_caddy_state,
+│   on main.tf line 207, in data "azurerm_managed_disk" "binary_cache_caddy_state":
+│   207: data "azurerm_managed_disk" "binary_cache_caddy_state" {
+```
+The above error (or similar) is likely caused by missing initialization of some `persistent` resources.
+Fix the persistent initialization by running `terraform-init.sh`, then run `terraform apply` again.
\ No newline at end of file diff --git a/terraform/binary-cache.tf b/terraform/binary-cache.tf index 3eafc72a..485c443b 100644 --- a/terraform/binary-cache.tf +++ b/terraform/binary-cache.tf @@ -50,12 +50,12 @@ module "binary_cache_vm" { # Attach disk to the VM data_disks = [{ - name = azurerm_managed_disk.binary_cache_caddy_state.name - managed_disk_id = azurerm_managed_disk.binary_cache_caddy_state.id + name = data.azurerm_managed_disk.binary_cache_caddy_state.name + managed_disk_id = data.azurerm_managed_disk.binary_cache_caddy_state.id lun = "10" create_option = "Attach" caching = "None" - disk_size_gb = azurerm_managed_disk.binary_cache_caddy_state.disk_size_gb + disk_size_gb = data.azurerm_managed_disk.binary_cache_caddy_state.disk_size_gb }] } @@ -96,13 +96,3 @@ resource "azurerm_role_assignment" "binary_cache_access_storage" { role_definition_name = "Storage Blob Data Reader" principal_id = module.binary_cache_vm.virtual_machine_identity_principal_id } - -# Create a data disk -resource "azurerm_managed_disk" "binary_cache_caddy_state" { - name = "binary-cache-vm-caddy-state" - resource_group_name = azurerm_resource_group.infra.name - location = azurerm_resource_group.infra.location - storage_account_type = "Standard_LRS" - create_option = "Empty" - disk_size_gb = 1 -} diff --git a/terraform/main.tf b/terraform/main.tf index 6696f02e..1a10b51f 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -204,4 +204,10 @@ data "azurerm_key_vault_secret" "binary_cache_signing_key" { provider = azurerm } +data "azurerm_managed_disk" "binary_cache_caddy_state" { + name = "binary-cache-vm-caddy-state-${local.ws}" + resource_group_name = "ghaf-infra-persistent-${local.shortloc}" +} + + ################################################################################ diff --git a/terraform/persistent/workspace-specific/main.tf b/terraform/persistent/workspace-specific/main.tf new file mode 100644 index 00000000..7b00b6f8 --- /dev/null +++ b/terraform/persistent/workspace-specific/main.tf @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: 2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +provider "azurerm" { + features {} +} + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + } +} + +################################################################################ + +terraform { + # Backend for storing terraform state (see ../../state-storage) + backend "azurerm" { + resource_group_name = "ghaf-infra-state" + storage_account_name = "ghafinfratfstatestorage" + container_name = "ghaf-infra-tfstate-container" + key = "ghaf-infra-persistent.tfstate" + } +} + +################################################################################ + +# Variables +variable "location" { + type = string + default = "northeurope" + description = "Azure region into which the resources will be deployed" +} +variable "persistent_resource_group" { + type = string + default = "ghaf-infra-persistent-eun" + description = "Parent resource group name" +} + +locals { + # Raise an error if workspace is 'default', + # this is a workaround to missing asserts in terraform: + assert_workspace_not_default = regex( + (terraform.workspace == "default") ? 
+ "((Force invalid regex pattern)\n\nERROR: workspace 'default' is not allowed" : "", "") + + # Sanitize workspace name: + ws = substr(replace(lower(terraform.workspace), "/[^a-z0-9]/", ""), 0, 16) +} + +# Data source to access persistent resource group (see ../main.tf) +data "azurerm_resource_group" "persistent" { + name = var.persistent_resource_group +} + +# Current signed-in user +data "azurerm_client_config" "current" {} + + +################################################################################ + +# Resources + +resource "azurerm_managed_disk" "binary_cache_caddy_state" { + name = "binary-cache-vm-caddy-state-${local.ws}" + resource_group_name = data.azurerm_resource_group.persistent.name + location = data.azurerm_resource_group.persistent.location + storage_account_type = "Standard_LRS" + create_option = "Empty" + disk_size_gb = 1 +} + +################################################################################ diff --git a/terraform/playground/terraform-playground.sh b/terraform/playground/terraform-playground.sh index 004e1b5b..12881521 100755 --- a/terraform/playground/terraform-playground.sh +++ b/terraform/playground/terraform-playground.sh @@ -133,6 +133,9 @@ main () { fi } -main "$@" +# Do not execute main() if this script is being sourced +if [ "${0}" = "${BASH_SOURCE[0]}" ]; then + main "$@" +fi ################################################################################ diff --git a/terraform/terraform-init.sh b/terraform/terraform-init.sh index 80df1a77..2ac9ad2f 100755 --- a/terraform/terraform-init.sh +++ b/terraform/terraform-init.sh @@ -58,7 +58,7 @@ import_bincache_sigkey () { } init_persistent () { - echo "[+] Initializing persistent data" + echo "[+] Initializing persistent" # See: ./persistent pushd "$MYDIR/persistent" >/dev/null terraform init > /dev/null @@ -69,6 +69,22 @@ init_persistent () { echo "[+] Applying possible changes" terraform apply -auto-approve >/dev/null popd >/dev/null + + # Assigns $WORKSPACE variable + # shellcheck source=/dev/null + source "$MYDIR/playground/terraform-playground.sh" &>/dev/null + generate_azure_private_workspace_name + + echo "[+] Initializing workspace-specific persistent" + # See: ./persistent/workspace-specific + pushd "$MYDIR/persistent/workspace-specific" >/dev/null + terraform init > /dev/null + echo "[+] Applying possible changes" + for ws in "dev" "prod" "$WORKSPACE"; do + terraform workspace select "$ws" &>/dev/null || terraform workspace new "$ws" + terraform apply -auto-approve >/dev/null + done + popd >/dev/null } init_terraform () {