From 3f33b72a270624e8edc5815440c132e345eff03d Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 15 Mar 2024 20:44:06 +0000 Subject: [PATCH 01/21] Add opensearch logs to the openstack dashboard --- .../dashboards/openstack/openstack.json | 473 ++++++++++++++++++ 1 file changed, 473 insertions(+) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json index c75cae2b2..557231fad 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json @@ -2011,6 +2011,19 @@ "alignLevel": null } }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 73 + }, + "id": 11, + "panels": [], + "title": "Logs", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -2532,6 +2545,360 @@ } ], "type": "table" + }, + { + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "symlog" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [ + { + "targetBlank": true, + "title": "Show in Opensearch", + "url": "http{% endraw %}{{ 's' if kolla_enable_tls_internal | bool else '' }}://{{ kolla_internal_vip_address }}{% raw 
%}:5601/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:'${__from:date}',to:'${__to:date}'))&_a=(columns:!(_source),filters:!(),interval:auto,query:(language:lucene,query:'log_level:${loglevel:lucene} AND programname:(\"${__data.fields[\"programname.keyword\"]}\") AND Hostname:${host:lucene}'),sort:!())" + } + ], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "programname.keyword" + }, + "properties": [ + { + "id": "displayName", + "value": "Program Name" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 74 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "programname.keyword", + "id": "3", + "settings": { + "min_doc_count": "1", + "order": "desc", + "orderBy": "_count", + "size": "20" + }, + "type": "terms" + }, + { + "field": "@timestamp", + "id": "2", + "settings": { + "interval": "1h", + "min_doc_count": "0", + "trimEdges": "0" + }, + "type": "date_histogram" + } + ], + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "format": "table", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "log_level:$loglevel AND programname:$program_name AND Hostname:$host", + "queryType": "lucene", + "refId": "A", + "timeField": "@timestamp" + } + ], + "title": "Number of $loglevel per service", + "type": "timeseries" + }, + { + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": 
false + }, + "links": [ + { + "targetBlank": true, + "title": "Show in Opensearch", + "url": "http{% endraw %}{{ 's' if kolla_enable_tls_internal | bool else '' }}://{{ kolla_internal_vip_address }}{% raw %}:5601/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:'${__from:date}',to:'${__to:date}'))&_a=(columns:!(_source),filters:!(),interval:auto,query:(language:lucene,query:'log_level:${loglevel:lucene} AND programname:(\"${__data.fields[\"programname.keyword\"]}\") AND Hostname:${host:lucene}'),sort:!())" + } + ], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "gauge" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "programname.keyword" + }, + "properties": [ + { + "id": "displayName", + "value": "Program Name" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 84 + }, + "id": 8, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "programname.keyword", + "id": "2", + "settings": { + "min_doc_count": "1", + "order": "desc", + "orderBy": "_count", + "size": "20" + }, + "type": "terms" + } + ], + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "format": "table", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "log_level:$loglevel AND programname:$program_name AND Hostname:$host", + "queryType": "lucene", + "refId": "A", + "timeField": "@timestamp" + } + ], + "title": "Number of $loglevel per service", + "type": "table" + }, + { + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": 
"${os_datasource}" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 94 + }, + "id": 100, + "links": [ + { + "targetBlank": true, + "title": "View in Opensearch", + "url": "http{% endraw %}{{ 's' if kolla_enable_tls_internal | bool else '' }}://{{ kolla_internal_vip_address }}{% raw %}:5601/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:'${__from:date}',to:'${__to:date}'))&_a=(columns:!(_source),filters:!(),interval:auto,query:(language:lucene,query:'log_level:${loglevel:lucene} AND programname:\"${program_name:lucene}\" AND Hostname:${host:lucene}'),sort:!())" + } + ], + "maxPerRow": 2, + "options": { + "dedupStrategy": "exact", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "10.1.2", + "repeat": "program_name", + "repeatDirection": "v", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "id": "2", + "settings": { + "min_doc_count": "0", + "order": "desc", + "orderBy": "_term", + "size": "10" + }, + "type": "terms" + } + ], + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "format": "logs", + "luceneQueryType": "Metric", + "metrics": [ + { + "id": "1", + "type": "logs" + } + ], + "query": "log_level:$loglevel AND programname:$program_name AND Hostname:$host", + "queryType": "lucene", + "refId": "A", + "timeField": "@timestamp" + } + ], + "title": "Logs - $loglevel - $program_name", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "@timestamp": 0, + "Hostname": 2, + "Payload": 1, + "_id": 3, + "_index": 4, + "_source": 5, + "_type": 6, + "level": 10, + "log_level": 7, + "payload": 9, + "programname": 8 + }, + "renameByName": {} + } + } + ], + "type": "logs" } ], "refresh": false, @@ -2638,6 +3005,112 @@ "refresh": 2, "skipUrlSync": false, 
"type": "interval" + }, + { + "current": { + "selected": true, + "text": [ + "ERROR" + ], + "value": [ + "ERROR" + ] + }, + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "definition": "{\"find\": \"terms\", \"query\": \"Hostname: ${host:lucene}\", \"field\": \"log_level.keyword\", \"size\": 1000}", + "hide": 0, + "includeAll": true, + "label": "Log Level", + "multi": true, + "name": "loglevel", + "options": [], + "query": "{\"find\": \"terms\", \"query\": \"Hostname: ${host:lucene}\", \"field\": \"log_level.keyword\", \"size\": 1000}", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "grafana-opensearch-datasource", + "uid": "${os_datasource}" + }, + "definition": "{\"find\": \"terms\", \"query\": \"log_level: $loglevel AND Hostname: $host\", \"field\": \"programname.keyword\", \"size\": 1000}", + "hide": 0, + "includeAll": true, + "label": "Program Name", + "multi": true, + "name": "program_name", + "options": [], + "query": "{\"find\": \"terms\", \"query\": \"log_level: $loglevel AND Hostname: $host\", \"field\": \"programname.keyword\", \"size\": 1000}", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_cpu_seconds_total,instance)", + "hide": 0, + "includeAll": true, + "label": "Host", + "multi": true, + "name": "host", + "options": [], + "query": { + "query": "label_values(node_cpu_seconds_total,instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": 
"opensearch", + "value": "fdfos0ns7hce8f" + }, + "description": "Opensearch", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "os_datasource", + "options": [], + "query": "grafana-opensearch-datasource", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" } ] }, From 26f6c7ecbe66451d48541452bb0cac02df8f2360 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Tue, 23 Apr 2024 14:27:53 +0100 Subject: [PATCH 02/21] Change metric to retrieve hostname --- .../kolla/config/grafana/dashboards/openstack/openstack.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json index 557231fad..b93aeda32 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/openstack.json @@ -3076,7 +3076,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(node_cpu_seconds_total,instance)", + "definition": "label_values(node_uname_info,nodename)", "hide": 0, "includeAll": true, "label": "Host", @@ -3084,7 +3084,7 @@ "name": "host", "options": [], "query": { - "query": "label_values(node_cpu_seconds_total,instance)", + "query": "label_values(node_uname_info,nodename)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, From 2d6f17550c4da63605b83fbfde3165757e8e0ea0 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Tue, 23 Apr 2024 14:29:31 +0100 Subject: [PATCH 03/21] Add release note --- .../notes/logs-in-openstack-dashboard-6e345ff7f16c0658.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/logs-in-openstack-dashboard-6e345ff7f16c0658.yaml diff --git a/releasenotes/notes/logs-in-openstack-dashboard-6e345ff7f16c0658.yaml 
b/releasenotes/notes/logs-in-openstack-dashboard-6e345ff7f16c0658.yaml new file mode 100644 index 000000000..0176ac636 --- /dev/null +++ b/releasenotes/notes/logs-in-openstack-dashboard-6e345ff7f16c0658.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + The Openstack Dashboard in Grafana now includes logs from Openstack + services. From e036472d6a2da2cc63db63ee94aae3ff7d9ac980 Mon Sep 17 00:00:00 2001 From: Seunghun Lee <45145778+seunghun1ee@users.noreply.github.com> Date: Fri, 11 Oct 2024 13:32:05 +0100 Subject: [PATCH 04/21] Update package update testing instructions (#830) Co-authored-by: Alex-Welsh --- doc/source/contributor/package-updates.rst | 43 ++++++++++++++-------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/doc/source/contributor/package-updates.rst b/doc/source/contributor/package-updates.rst index 9a6c74589..49e2b0852 100644 --- a/doc/source/contributor/package-updates.rst +++ b/doc/source/contributor/package-updates.rst @@ -63,18 +63,20 @@ The following steps describe the process to test the new package and container r Creating the multinode environments ----------------------------------- -There is a comprehensive guide to setting up a multinode environment with Terraform, found here: https://github.com/stackhpc/terraform-kayobe-multinode. There are some things to note: +The `Multinode deployment workflow `_ can be used to automatically test changes. + +To manually test the changes, there is a comprehensive guide to set up a Multinode environment with Terraform, found here: https://github.com/stackhpc/terraform-kayobe-multinode. There are some things to note: * OVN is enabled by default, you should override it under ``etc/kayobe/environments/ci-multinode/kolla.yml kolla_enable_ovn: false`` for the OVS multinode environment. -* Remember to set different vxlan_vnis for each. +* Remember to set a different ``vxlan_vni`` for each. 
-* Before starting any tests, run ``dnf distro-sync`` on each host to ensure you are using the same snapshots as in the release train. You can do this using the following commands: +* Before starting any tests, run ``dnf distro-sync -y`` on each host to ensure you are using the same snapshots as in the release train. Option ``-y`` is used to prevent hosts hang waiting for the confirmation input. You can do this using the following commands: .. code-block:: console - kayobe seed host command run -b --command "dnf distro-sync" - kayobe overcloud host command run -b --command "dnf distro-sync" + kayobe seed host command run -b --command "dnf distro-sync -y" + kayobe overcloud host command run -b --command "dnf distro-sync -y" * This may have installed a new kernel version. If so, you will need to reboot the overcloud hosts. You can check the installed kernels and the currently running kernel with the following commands. If the latest listed version is not running, you will need to reboot. @@ -85,7 +87,7 @@ There is a comprehensive guide to setting up a multinode environment with Terraf kayobe playbook run --limit seed,overcloud $KAYOBE_CONFIG_PATH/ansible/reboot.yml -* The tempest tests run automatically at the end of deploy-openstack.sh. If you have the time, it is worth fixing any failing tests you can so that there is greater coverage for the package updates. (Also remember to propose these fixes in the relevant repos where applicable.) +* The tempest tests run automatically at the end of the multinode deployment script. If you have the time, it is worth fixing any failing tests you can so that there is greater coverage for the package updates. (Also remember to propose these fixes in the relevant repos where applicable.) Upgrading host packages ----------------------- @@ -102,6 +104,7 @@ Bump the snapshot versions in /etc/yum/repos.d with: .. 
code-block:: console + kayobe seed host configure -t dnf -kt none kayobe overcloud host configure -t dnf -kt none Install new packages: @@ -112,22 +115,32 @@ Install new packages: Perform a rolling reboot of hosts: +.. note:: + In the Multinode environment, the seed-hypervisor cannot access control + plane instances with the Openstack client. To use Openstack client, connect + to the Seed instance via SSH first. For authentication, use scp to copy + ``public-openrc.sh`` to the Seed + .. code-block:: console - export ANSIBLE_SERIAL=1 - kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit controllers - kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit compute[0] + # Check your hypervisor hostname + (seed) openstack hypervisor list + + # Reboot controller instances and zeroth compute instance + (seed-hypervisor) export ANSIBLE_SERIAL=1 + (seed-hypervisor) kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit controllers + (seed-hypervisor) kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit compute[0] # Test live migration - openstack server create --image cirros --flavor m1.tiny --network external --hypervisor-hostname wallaby-pkg-refresh-ovs-compute-02.novalocal --os-compute-api-version 2.74 server1 - openstack server migrate --live-migration server1 - watch openstack server show server1 + (seed) openstack server create --image cirros --flavor m1.tiny --network external --hypervisor-hostname --os-compute-api-version 2.74 server1 + (seed) openstack server migrate --live-migration server1 + (seed) watch openstack server show server1 - kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit compute[1] + (seed-hypervisor) kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit compute[1] # Try and migrate back - openstack server migrate --live-migration server1 - watch openstack server show server1 + (seed) openstack server migrate --live-migration server1 + (seed) watch openstack server 
show server1 Upgrading containers within a release ------------------------------------- From 19207e1bb23e4185bb1be1ff70731a2d68a5d6f6 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 17 Sep 2024 11:04:00 +0200 Subject: [PATCH 05/21] Configure IPA with useful inspection settings Add configuration to include extra-hardware and mellanox elements, with other useful collectors and kernel settings. --- etc/kayobe/ipa.yml | 15 ++++++++++++--- .../ipa-inspection-settings-133fe91b1d855fa0.yaml | 5 +++++ 2 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/ipa-inspection-settings-133fe91b1d855fa0.yaml diff --git a/etc/kayobe/ipa.yml b/etc/kayobe/ipa.yml index 5877d039a..6d350a6c0 100644 --- a/etc/kayobe/ipa.yml +++ b/etc/kayobe/ipa.yml @@ -29,7 +29,9 @@ # List of additional Diskimage Builder (DIB) elements to use when building IPA # images. Default is none. -#ipa_build_dib_elements_extra: +ipa_build_dib_elements_extra: + - extra-hardware + - mellanox # List of Diskimage Builder (DIB) elements to use when building IPA images. # Default is combination of ipa_build_dib_elements_default and @@ -115,7 +117,10 @@ #ipa_collectors_default: # List of additional inspection collectors to run. -#ipa_collectors_extra: +ipa_collectors_extra: + - "dmi-decode" + - "extra-hardware" + - "numa-topology" # List of inspection collectors to run. #ipa_collectors: @@ -133,7 +138,11 @@ #ipa_kernel_options_default: # List of additional kernel parameters for Ironic python agent. -#ipa_kernel_options_extra: +ipa_kernel_options_extra: + # Useful until NTP is configured by default + - ipa-insecure=1 + # Avoid disk benchmark failures on some NVMe drives + - nvme_core.multipath=N # List of kernel parameters for Ironic python agent. 
#ipa_kernel_options: diff --git a/releasenotes/notes/ipa-inspection-settings-133fe91b1d855fa0.yaml b/releasenotes/notes/ipa-inspection-settings-133fe91b1d855fa0.yaml new file mode 100644 index 000000000..cfb761290 --- /dev/null +++ b/releasenotes/notes/ipa-inspection-settings-133fe91b1d855fa0.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Configures the Ironic Python Agent with useful settings for inspection, + such as the ``extra-hardware`` and ``mellanox`` elements. From 67f0a691e49ff775c63549802950ab23c6c38f71 Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Wed, 16 Oct 2024 15:59:03 +0100 Subject: [PATCH 06/21] Fix upgrade-prerequisites --- tools/upgrade-prerequisites.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/upgrade-prerequisites.sh b/tools/upgrade-prerequisites.sh index aa66708c3..fa4357ec4 100755 --- a/tools/upgrade-prerequisites.sh +++ b/tools/upgrade-prerequisites.sh @@ -35,7 +35,7 @@ function rabbit_migration() { -e 's/om_enable_rabbitmq_quorum_queues: false/om_enable_rabbitmq_quorum_queues: true/' \ $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/globals.yml - $KAYOBE_CONFIG_ROOT/tools/rabbitmq-quorum-migration.sh + $KAYOBE_CONFIG_PATH/../../tools/rabbitmq-quorum-migration.sh sed -i -e 's/om_enable_rabbitmq_high_availability: false/om_enable_rabbitmq_high_availability: true/' \ -e 's/om_enable_rabbitmq_quorum_queues: true/om_enable_rabbitmq_quorum_queues: false/' \ From 4cd62776cad134b79ed7ffe8d1019d5bb23bc4b6 Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Fri, 1 Nov 2024 15:37:19 +0000 Subject: [PATCH 07/21] Add conditional choice of kolla/globals.yml path --- tools/upgrade-prerequisites.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/upgrade-prerequisites.sh b/tools/upgrade-prerequisites.sh index fa4357ec4..24cd9bbee 100755 --- a/tools/upgrade-prerequisites.sh +++ b/tools/upgrade-prerequisites.sh @@ -31,15 +31,19 @@ function rabbit_upgrade() { function rabbit_migration() { 
if ! kayobe overcloud host command run -l controllers -b --command "docker exec rabbitmq rabbitmqctl list_queues type | grep quorum"; then # Set quorum flag, execute RabbitMQ queue migration script, unset quorum flag (to avoid git conflicts) + KOLLA_GLOBALS_PATH=$KAYOBE_CONFIG_PATH/kolla/globals.yml + if [[ $KAYOBE_ENVIRONMENT ]]; then + KOLLA_GLOBALS_PATH=$KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/globals.yml + fi sed -i -e 's/om_enable_rabbitmq_high_availability: true/om_enable_rabbitmq_high_availability: false/' \ -e 's/om_enable_rabbitmq_quorum_queues: false/om_enable_rabbitmq_quorum_queues: true/' \ - $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/globals.yml + $KOLLA_GLOBALS_PATH $KAYOBE_CONFIG_PATH/../../tools/rabbitmq-quorum-migration.sh sed -i -e 's/om_enable_rabbitmq_high_availability: false/om_enable_rabbitmq_high_availability: true/' \ -e 's/om_enable_rabbitmq_quorum_queues: true/om_enable_rabbitmq_quorum_queues: false/' \ - $KAYOBE_CONFIG_PATH/environments/$KAYOBE_ENVIRONMENT/kolla/globals.yml + $KOLLA_GLOBALS_PATH fi } From de697bd50df09d75352d76e7c32b05c529b1d5d0 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 1 Nov 2024 16:16:47 +0000 Subject: [PATCH 08/21] Redfish exporter: Fixes scrape group We were only including the scrape group `overcloud` in the prometheus config. This change allows the scrape group to work as intended. 
--- .../kolla/config/prometheus/prometheus.yml.d/60-redfish.yml | 2 -- ...with-redfish-exporter-scrape-group-b10eaac6ee1e6af3.yaml | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/fix-issue-with-redfish-exporter-scrape-group-b10eaac6ee1e6af3.yaml diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml index 84e85e04f..6f234e5a0 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml @@ -15,13 +15,11 @@ scrape_configs: replacement: "{{ lookup('vars', admin_oc_net_name ~ '_ips')[groups.seed.0] }}:9610" static_configs: {% for host in groups.get('redfish_exporter_targets', []) %} -{% if hostvars[host]["redfish_exporter_scrape_group"] | default('overcloud') == 'overcloud' %} - targets: - '{{ hostvars[host]["redfish_exporter_target_address"] }}' labels: server: '{{ host }}' env: "{{ kayobe_environment | default('openstack') }}" group: "{{ hostvars[host]['redfish_exporter_scrape_group'] | default('overcloud') }}" -{% endif %} {% endfor %} {% endif %} diff --git a/releasenotes/notes/fix-issue-with-redfish-exporter-scrape-group-b10eaac6ee1e6af3.yaml b/releasenotes/notes/fix-issue-with-redfish-exporter-scrape-group-b10eaac6ee1e6af3.yaml new file mode 100644 index 000000000..1ee5a9a41 --- /dev/null +++ b/releasenotes/notes/fix-issue-with-redfish-exporter-scrape-group-b10eaac6ee1e6af3.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes an issue where setting ``redfish_exporter_scrape_group`` to a value + other than ``overcloud`` would exclude those nodes from the redfish + exporter scrapes. 
From ecc15f5e4ca1747be9260c455b7c3b150ab61e3c Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Mon, 4 Nov 2024 10:37:48 +0000 Subject: [PATCH 09/21] Fix Rabbit upgrade script conditional evaluation --- tools/rabbitmq-quorum-migration.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/rabbitmq-quorum-migration.sh b/tools/rabbitmq-quorum-migration.sh index b24e0d446..9bd4d79d2 100755 --- a/tools/rabbitmq-quorum-migration.sh +++ b/tools/rabbitmq-quorum-migration.sh @@ -13,7 +13,7 @@ fi if [[ ! "$1" = "--skip-checks" ]]; then # Fail if clocks are not synced - if ! kayobe overcloud host command run -l controllers -b --command "timedatectl status | grep 'synchronized: yes'"; then + if ! ( kayobe overcloud host command run -l controllers -b --command "timedatectl status | grep 'synchronized: yes'" ); then echo "Failed precheck: Time not synced on controllers" echo "Use 'timedatectl status' to check sync state" echo "Either wait for sync or use 'chronyc makestep'" @@ -21,7 +21,7 @@ if [[ ! "$1" = "--skip-checks" ]]; then fi kayobe overcloud service configuration generate --node-config-dir /tmp/rabbit-migration --kolla-tags none # Fail if HA is set or quorum is not - if ! grep 'om_enable_rabbitmq_quorum_queues: true' $KOLLA_CONFIG_PATH/globals.yml || grep 'om_enable_rabbitmq_high_availability: true' $KOLLA_CONFIG_PATH/globals.yml; then + if ! ( grep 'om_enable_rabbitmq_quorum_queues: true' $KOLLA_CONFIG_PATH/globals.yml || grep 'om_enable_rabbitmq_high_availability: true' $KOLLA_CONFIG_PATH/globals.yml ); then echo "Failed precheck: om_enable_rabbitmq_quorum_queues must be enabled, om_enable_rabbitmq_high_availability must be disabled" exit 1 fi @@ -35,12 +35,12 @@ kayobe kolla ansible run rabbitmq-reset-state if [[ ! 
"$1" = "--skip-checks" ]]; then # Fail if any queues still exist sleep 20 - if kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues name --silent | grep -v '^$'"; then + if ( kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues name --silent | grep -v '^$'" ); then echo "Failed check: RabbitMQ has not stopped properly, queues still exist" exit 1 fi # Fail if any exchanges still exist (excluding those starting with 'amq.') - if kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_exchanges name --silent | grep -v '^$' | grep -v '^amq.'"; then + if ( kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_exchanges name --silent | grep -v '^$' | grep -v '^amq.'" ); then echo "Failed check: RabbitMQ has not stopped properly, exchanges still exist" exit 1 fi @@ -52,7 +52,7 @@ kayobe kolla ansible run deploy-containers -kt $RABBITMQ_SERVICES_TO_RESTART if [[ ! "$1" = "--skip-checks" ]]; then sleep 20 # Assert that at least one quorum queue exists on each controller - if kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues type | grep quorum"; then + if ( kayobe overcloud host command run -l controllers -b --command "docker exec $RABBITMQ_CONTAINER_NAME rabbitmqctl list_queues type | grep quorum" ); then echo "Queues migrated successfully" else echo "Failed post-check: A controller does not have any quorum queues" From 363c1c86141949a1b92aee617710749865fee382 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Mon, 4 Nov 2024 14:26:31 +0000 Subject: [PATCH 10/21] Redfish exporter: Decrease sensitivity of alert (#1358) The redfish exporter is prone to failed scrapes. Lets wait for mulitple failed scrapes before triggering an alert. 
This should still catch the case where it is completely dead, but reduce the false positives from failed scrapes. --- etc/kayobe/kolla/config/prometheus/prometheus.rules | 11 ++++++++++- ...ity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.rules b/etc/kayobe/kolla/config/prometheus/prometheus.rules index c9803946a..20e1b303a 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.rules +++ b/etc/kayobe/kolla/config/prometheus/prometheus.rules @@ -7,7 +7,7 @@ groups: rules: - alert: PrometheusTargetMissing - expr: up == 0 + expr: up{job!="redfish-exporter-seed"} == 0 for: 5m labels: severity: critical @@ -15,6 +15,15 @@ groups: summary: "Prometheus target missing (instance {{ $labels.instance }})" description: "A Prometheus target has disappeared. An exporter might have crashed." + - alert: PrometheusTargetMissing + expr: up{job="redfish-exporter-seed"} == 0 + for: 15m + labels: + severity: critical + annotations: + summary: "Prometheus target missing (instance {{ $labels.instance }})" + description: "A Prometheus target has disappeared. An exporter might have crashed." + - alert: PrometheusAllTargetsMissing expr: count by (job) (up) == 0 for: 1m diff --git a/releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml b/releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml new file mode 100644 index 000000000..0ba59ea7a --- /dev/null +++ b/releasenotes/notes/reduces-sensitivity-of-redfish-target-alerts-a3d77a3f0c3dac8a.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Changes the duration for which redfish exporter must continually fail + scrapes before triggering an alert to 15 minutes. This should hopefully + reduce some alert spam. 
From d229d41a84011d8f650de7526a443b5a328c686c Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Mon, 4 Nov 2024 15:06:53 +0000 Subject: [PATCH 11/21] fix!: manage the `physical` interface in `ci-aio` Treat the physical interface as network interface inside of the `ci-aio` environment. This is to ensure the network interface files are created correctly with the appropriate permissions otherwise we will lose connection after the DHCP lease expires. Also this approach enables us to include the interface within the firewall configuration ensuring we maintain SSH access through the firewall via this interface. --- .github/workflows/stackhpc-all-in-one.yml | 19 +++++-------------- .../environments/ci-aio/automated-setup.sh | 4 ---- .../environments/ci-aio/controllers.yml | 3 +++ .../group_vars/controllers/network-interfaces | 5 +++++ etc/kayobe/environments/ci-aio/networks.yml | 6 ++++++ 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index 1909129ac..7f426d3f8 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -202,23 +202,14 @@ jobs: - name: Write Terraform network config run: | cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/tf-networks.yml - - admin_oc_net_name: admin - admin_cidr: "{{ access_cidr.value }}" - admin_allocation_pool_start: 0.0.0.0 - admin_allocation_pool_end: 0.0.0.0 - admin_gateway: "{{ access_gw.value }}" - admin_bootproto: dhcp - admin_ips: + admin_oc_net_name: ethernet + ethernet_cidr: "{{ access_cidr.value }}" + ethernet_allocation_pool_start: 0.0.0.0 + ethernet_allocation_pool_end: 0.0.0.0 + ethernet_ips: controller0: "{{ access_ip_v4.value }}" EOF - - name: Write Terraform network interface config - run: | - cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/controllers/tf-network-interfaces - admin_interface: "{{ access_interface.value }}" - EOF - - 
name: Write all-in-one scenario config run: | cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/zz-aio-scenario.yml diff --git a/etc/kayobe/environments/ci-aio/automated-setup.sh b/etc/kayobe/environments/ci-aio/automated-setup.sh index 84b9b5f09..5129db015 100644 --- a/etc/kayobe/environments/ci-aio/automated-setup.sh +++ b/etc/kayobe/environments/ci-aio/automated-setup.sh @@ -72,10 +72,6 @@ fi sudo ip l set dummy1 up sudo ip l set dummy1 master breth1 -if type apt; then - sudo cp /run/systemd/network/* /etc/systemd/network -fi - export KAYOBE_VAULT_PASSWORD=$(cat $BASE_PATH/vault-pw) pushd $BASE_PATH/src/kayobe-config source kayobe-env --environment ci-aio diff --git a/etc/kayobe/environments/ci-aio/controllers.yml b/etc/kayobe/environments/ci-aio/controllers.yml index b34536705..b67cb68f1 100644 --- a/etc/kayobe/environments/ci-aio/controllers.yml +++ b/etc/kayobe/environments/ci-aio/controllers.yml @@ -6,6 +6,9 @@ # to setup the Kayobe user account. Default is {{ os_distribution }}. controller_bootstrap_user: "{{ os_distribution if os_distribution == 'ubuntu' else 'cloud-user' }}" +controller_extra_network_interfaces: + - ethernet + # Controller lvm configuration. See intentory/group_vars/controllers/lvm.yml # for the exact configuration. controller_lvm_groups: diff --git a/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces b/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces index 2f8d30103..85f318f42 100644 --- a/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces +++ b/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces @@ -2,6 +2,11 @@ ############################################################################### # Network interface definitions for the controller group. +# Ethernet interface is the `primary` or `physical` interface associated +# with the instance that the AIO deployment runs inside of. 
It is the interface used +# to reach the instance. +ethernet_interface: "{{ ansible_facts['default_ipv4']['interface'] }}" + # Controller interface on all-in-one network. aio_interface: breth1 # Use dummy1 if it exists, otherwise the bridge will have no ports. diff --git a/etc/kayobe/environments/ci-aio/networks.yml b/etc/kayobe/environments/ci-aio/networks.yml index 216696eaa..f09e1cbc2 100644 --- a/etc/kayobe/environments/ci-aio/networks.yml +++ b/etc/kayobe/environments/ci-aio/networks.yml @@ -80,6 +80,12 @@ cleaning_net_name: aio ############################################################################### # Network definitions. +# This network is required to be defined within `ci-aio` environment to ensure that +# the network interface files are created appropriately and to provide easy inclusion +# within the firewall configuration. +ethernet_bootproto: dhcp +ethernet_zone: trusted + # All-in-one network. aio_cidr: 192.168.33.0/24 aio_allocation_pool_start: 192.168.33.3 From f4f88993ee936bb8eccbf8c5d1476f22387f11be Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Mon, 4 Nov 2024 15:06:53 +0000 Subject: [PATCH 12/21] fix!: manage the `physical` interface in `ci-aio` Treat the physical interface as network interface inside of the `ci-aio` environment. This is to ensure the network interface files are created correctly with the appropriate permissions otherwise we will lose connection after the DHCP lease expires. Also this approach enables us to include the interface within the firewall configuration ensuring we maintain SSH access through the firewall via this interface. 
--- .github/workflows/stackhpc-all-in-one.yml | 19 +++++-------------- .../environments/ci-aio/automated-setup.sh | 4 ---- .../environments/ci-aio/controllers.yml | 3 +++ .../group_vars/controllers/network-interfaces | 5 +++++ etc/kayobe/environments/ci-aio/networks.yml | 6 ++++++ 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index 45a33f47d..0ec9b6d34 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -225,23 +225,14 @@ jobs: - name: Write Terraform network config run: | cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/tf-networks.yml - - admin_oc_net_name: admin - admin_cidr: "{{ access_cidr.value }}" - admin_allocation_pool_start: 0.0.0.0 - admin_allocation_pool_end: 0.0.0.0 - admin_gateway: "{{ access_gw.value }}" - admin_bootproto: dhcp - admin_ips: + admin_oc_net_name: ethernet + ethernet_cidr: "{{ access_cidr.value }}" + ethernet_allocation_pool_start: 0.0.0.0 + ethernet_allocation_pool_end: 0.0.0.0 + ethernet_ips: controller0: "{{ access_ip_v4.value }}" EOF - - name: Write Terraform network interface config - run: | - cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/controllers/tf-network-interfaces - admin_interface: "{{ access_interface.value }}" - EOF - - name: Write all-in-one scenario config run: | cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/zz-aio-scenario.yml diff --git a/etc/kayobe/environments/ci-aio/automated-setup.sh b/etc/kayobe/environments/ci-aio/automated-setup.sh index f7d34db86..82e642e2b 100644 --- a/etc/kayobe/environments/ci-aio/automated-setup.sh +++ b/etc/kayobe/environments/ci-aio/automated-setup.sh @@ -72,10 +72,6 @@ fi sudo ip l set dummy1 up sudo ip l set dummy1 master breth1 -if type apt; then - sudo cp /run/systemd/network/* /etc/systemd/network -fi - export KAYOBE_VAULT_PASSWORD=$(cat $BASE_PATH/vault-pw) pushd 
$BASE_PATH/src/kayobe-config source kayobe-env --environment ci-aio diff --git a/etc/kayobe/environments/ci-aio/controllers.yml b/etc/kayobe/environments/ci-aio/controllers.yml index b34536705..b67cb68f1 100644 --- a/etc/kayobe/environments/ci-aio/controllers.yml +++ b/etc/kayobe/environments/ci-aio/controllers.yml @@ -6,6 +6,9 @@ # to setup the Kayobe user account. Default is {{ os_distribution }}. controller_bootstrap_user: "{{ os_distribution if os_distribution == 'ubuntu' else 'cloud-user' }}" +controller_extra_network_interfaces: + - ethernet + # Controller lvm configuration. See intentory/group_vars/controllers/lvm.yml # for the exact configuration. controller_lvm_groups: diff --git a/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces b/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces index 2f8d30103..85f318f42 100644 --- a/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces +++ b/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces @@ -2,6 +2,11 @@ ############################################################################### # Network interface definitions for the controller group. +# Ethernet interface is the `primary` or `physical` interface associated +# with the instance that the AIO deployment runs inside of. It is the interface used +# to reach the instance. +ethernet_interface: "{{ ansible_facts['default_ipv4']['interface'] }}" + # Controller interface on all-in-one network. aio_interface: breth1 # Use dummy1 if it exists, otherwise the bridge will have no ports. 
diff --git a/etc/kayobe/environments/ci-aio/networks.yml b/etc/kayobe/environments/ci-aio/networks.yml index 216696eaa..f09e1cbc2 100644 --- a/etc/kayobe/environments/ci-aio/networks.yml +++ b/etc/kayobe/environments/ci-aio/networks.yml @@ -80,6 +80,12 @@ cleaning_net_name: aio ############################################################################### # Network definitions. +# This network is required to be defined within `ci-aio` environment to ensure that +# the network interface files are created appropriately and to provide easy inclusion +# within the firewall configuration. +ethernet_bootproto: dhcp +ethernet_zone: trusted + # All-in-one network. aio_cidr: 192.168.33.0/24 aio_allocation_pool_start: 192.168.33.3 From 77dce90a798640c645fdd869c8377dbab09b114f Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Fri, 25 Oct 2024 14:54:46 +0100 Subject: [PATCH 13/21] fix: test if `admin-openrc.sh` exists before deploying os-capacity During service deploys using Kayobe Automation this playbook will fail as the `admin-openrc.sh` file is not generated during the deployment process. This in turn causes the workflow to be reported as a failure even though the service deployment succeeded. 
--- .../ansible/deploy-os-capacity-exporter.yml | 104 +++++++++--------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml index 41d91bfbd..2632cab30 100644 --- a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml +++ b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml @@ -15,59 +15,61 @@ tags: os_capacity gather_facts: false tasks: - - name: Create os-capacity directory - ansible.builtin.file: - path: /opt/kayobe/os-capacity/ - state: directory - when: stackhpc_enable_os_capacity - - - name: Read admin-openrc credential file - ansible.builtin.command: - cmd: "cat {{ lookup('ansible.builtin.env', 'KOLLA_CONFIG_PATH') }}/admin-openrc.sh" + - name: Check if admin-openrc.sh exists + ansible.builtin.stat: + path: "{{ lookup('ansible.builtin.env', 'KOLLA_CONFIG_PATH') }}/admin-openrc.sh" delegate_to: localhost - register: credential - when: stackhpc_enable_os_capacity - changed_when: false + register: openrc_file_stat + run_once: true - - name: Set facts for admin credentials - ansible.builtin.set_fact: - stackhpc_os_capacity_auth_url: "{{ credential.stdout_lines | select('match', '.*OS_AUTH_URL*.') | first | split('=') | last | replace(\"'\",'') }}" - stackhpc_os_capacity_project_name: "{{ credential.stdout_lines | select('match', '.*OS_PROJECT_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" - stackhpc_os_capacity_domain_name: "{{ credential.stdout_lines | select('match', '.*OS_PROJECT_DOMAIN_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" - stackhpc_os_capacity_openstack_region_name: "{{ credential.stdout_lines | select('match', '.*OS_REGION_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" - stackhpc_os_capacity_username: "{{ credential.stdout_lines | select('match', '.*OS_USERNAME*.') | first | split('=') | last | replace(\"'\",'') }}" - stackhpc_os_capacity_password: "{{ credential.stdout_lines | select('match', 
'.*OS_PASSWORD*.') | first | split('=') | last | replace(\"'\",'') }}" - when: stackhpc_enable_os_capacity + - block: + - name: Create os-capacity directory + ansible.builtin.file: + path: /opt/kayobe/os-capacity/ + state: directory - - name: Template clouds.yml - ansible.builtin.template: - src: templates/os_capacity-clouds.yml.j2 - dest: /opt/kayobe/os-capacity/clouds.yaml - when: stackhpc_enable_os_capacity - register: clouds_yaml_result + - name: Read admin-openrc credential file + ansible.builtin.command: + cmd: "cat {{ lookup('ansible.builtin.env', 'KOLLA_CONFIG_PATH') }}/admin-openrc.sh" + delegate_to: localhost + register: credential + changed_when: false - - name: Copy CA certificate to OpenStack Capacity nodes - ansible.builtin.copy: - src: "{{ stackhpc_os_capacity_openstack_cacert }}" - dest: /opt/kayobe/os-capacity/cacert.pem - when: - - stackhpc_enable_os_capacity - - stackhpc_os_capacity_openstack_cacert | length > 0 - register: cacert_result + - name: Set facts for admin credentials + ansible.builtin.set_fact: + stackhpc_os_capacity_auth_url: "{{ credential.stdout_lines | select('match', '.*OS_AUTH_URL*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_project_name: "{{ credential.stdout_lines | select('match', '.*OS_PROJECT_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_domain_name: "{{ credential.stdout_lines | select('match', '.*OS_PROJECT_DOMAIN_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_openstack_region_name: "{{ credential.stdout_lines | select('match', '.*OS_REGION_NAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_username: "{{ credential.stdout_lines | select('match', '.*OS_USERNAME*.') | first | split('=') | last | replace(\"'\",'') }}" + stackhpc_os_capacity_password: "{{ credential.stdout_lines | select('match', '.*OS_PASSWORD*.') | first | split('=') | last | replace(\"'\",'') }}" - - name: Ensure 
os_capacity container is running - community.docker.docker_container: - name: os_capacity - image: ghcr.io/stackhpc/os-capacity:master - env: - OS_CLOUD: openstack - OS_CLIENT_CONFIG_FILE: /etc/openstack/clouds.yaml - mounts: - - type: bind - source: /opt/kayobe/os-capacity/ - target: /etc/openstack/ - network_mode: host - restart: "{{ clouds_yaml_result is changed or cacert_result is changed }}" - restart_policy: unless-stopped - become: true - when: stackhpc_enable_os_capacity + - name: Template clouds.yml + ansible.builtin.template: + src: templates/os_capacity-clouds.yml.j2 + dest: /opt/kayobe/os-capacity/clouds.yaml + register: clouds_yaml_result + + - name: Copy CA certificate to OpenStack Capacity nodes + ansible.builtin.copy: + src: "{{ stackhpc_os_capacity_openstack_cacert }}" + dest: /opt/kayobe/os-capacity/cacert.pem + when: stackhpc_os_capacity_openstack_cacert | length > 0 + register: cacert_result + + - name: Ensure os_capacity container is running + community.docker.docker_container: + name: os_capacity + image: ghcr.io/stackhpc/os-capacity:master + env: + OS_CLOUD: openstack + OS_CLIENT_CONFIG_FILE: /etc/openstack/clouds.yaml + mounts: + - type: bind + source: /opt/kayobe/os-capacity/ + target: /etc/openstack/ + network_mode: host + restart: "{{ clouds_yaml_result is changed or cacert_result is changed }}" + restart_policy: unless-stopped + become: true + when: stackhpc_enable_os_capacity and openrc_file_stat.stat.exists From 6682a6f3f90ab97f52b25d53ff0ef334b9ffe23d Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Mon, 4 Nov 2024 15:06:53 +0000 Subject: [PATCH 14/21] fix!: manage the `physical` interface in `ci-aio` Treat the physical interface as network interface inside of the `ci-aio` environment. This is to ensure the network interface files are created correctly with the appropriate permissions otherwise we will lose connection after the DHCP lease expires. 
Also this approach enables us to include the interface within the firewall configuration ensuring we maintain SSH access through the firewall via this interface. --- .github/workflows/stackhpc-all-in-one.yml | 20 +++++-------------- .../environments/ci-aio/automated-setup.sh | 4 ---- .../environments/ci-aio/controllers.yml | 3 +++ .../group_vars/controllers/network-interfaces | 5 +++++ etc/kayobe/environments/ci-aio/networks.yml | 6 ++++++ 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml index 5d9d4f125..84442121a 100644 --- a/.github/workflows/stackhpc-all-in-one.yml +++ b/.github/workflows/stackhpc-all-in-one.yml @@ -214,22 +214,12 @@ jobs: - name: Write Terraform network config run: | cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/tf-networks.yml - - admin_oc_net_name: admin - admin_cidr: "{{ access_cidr.value }}" - admin_allocation_pool_start: 0.0.0.0 - admin_allocation_pool_end: 0.0.0.0 - admin_gateway: "{{ access_gw.value }}" - admin_bootproto: dhcp - admin_ips: + admin_oc_net_name: ethernet + ethernet_cidr: "{{ access_cidr.value }}" + ethernet_allocation_pool_start: 0.0.0.0 + ethernet_allocation_pool_end: 0.0.0.0 + ethernet_ips: controller0: "{{ access_ip_v4.value }}" - admin_zone: admin - EOF - - - name: Write Terraform network interface config - run: | - cat << EOF > etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/controllers/tf-network-interfaces - admin_interface: "{{ access_interface.value }}" EOF - name: Write all-in-one scenario config diff --git a/etc/kayobe/environments/ci-aio/automated-setup.sh b/etc/kayobe/environments/ci-aio/automated-setup.sh index 441c8f967..868d408ca 100644 --- a/etc/kayobe/environments/ci-aio/automated-setup.sh +++ b/etc/kayobe/environments/ci-aio/automated-setup.sh @@ -76,10 +76,6 @@ fi sudo ip l set dummy1 up sudo ip l set dummy1 master breth1 -if type apt; then - sudo cp /run/systemd/network/* 
/etc/systemd/network -fi - export KAYOBE_VAULT_PASSWORD=$(cat $BASE_PATH/vault-pw) pushd $BASE_PATH/src/kayobe-config source kayobe-env --environment ci-aio diff --git a/etc/kayobe/environments/ci-aio/controllers.yml b/etc/kayobe/environments/ci-aio/controllers.yml index 8972187df..12fe3afcb 100644 --- a/etc/kayobe/environments/ci-aio/controllers.yml +++ b/etc/kayobe/environments/ci-aio/controllers.yml @@ -6,6 +6,9 @@ # to setup the Kayobe user account. Default is {{ os_distribution }}. controller_bootstrap_user: "{{ os_distribution if os_distribution == 'ubuntu' else 'cloud-user' }}" +controller_extra_network_interfaces: + - ethernet + # Controller lvm configuration. See intentory/group_vars/controllers/lvm.yml # for the exact configuration. controller_lvm_groups: diff --git a/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces b/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces index 2f8d30103..85f318f42 100644 --- a/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces +++ b/etc/kayobe/environments/ci-aio/inventory/group_vars/controllers/network-interfaces @@ -2,6 +2,11 @@ ############################################################################### # Network interface definitions for the controller group. +# Ethernet interface is the `primary` or `physical` interface associated +# with the instance that the AIO deployment runs inside of. It is the interface used +# to reach the instance. +ethernet_interface: "{{ ansible_facts['default_ipv4']['interface'] }}" + # Controller interface on all-in-one network. aio_interface: breth1 # Use dummy1 if it exists, otherwise the bridge will have no ports. 
diff --git a/etc/kayobe/environments/ci-aio/networks.yml b/etc/kayobe/environments/ci-aio/networks.yml index e3cc4d43d..4bf4e96cd 100644 --- a/etc/kayobe/environments/ci-aio/networks.yml +++ b/etc/kayobe/environments/ci-aio/networks.yml @@ -80,6 +80,12 @@ cleaning_net_name: aio ############################################################################### # Network definitions. +# This network is required to be defined within `ci-aio` environment to ensure that +# the network interface files are created appropriately and to provide easy inclusion +# within the firewall configuration. +ethernet_bootproto: dhcp +ethernet_zone: trusted + # All-in-one network. aio_cidr: 192.168.33.0/24 aio_allocation_pool_start: 192.168.33.3 From f0e1cf560d6387ac7d4760b1aa12579fe1a230da Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Wed, 6 Nov 2024 10:59:54 +0000 Subject: [PATCH 15/21] CIS: Stop recursively setting permissions on logs files This suffers from a time-of-check to time-of-use race condition and in general recursively setting file permissions seems to be a bad idea. --- etc/kayobe/inventory/group_vars/cis-hardening/cis | 6 ++++++ ...-longer-sets-permissions-on-logs-81a48ab8ed2d6b5f.yaml | 8 ++++++++ 2 files changed, 14 insertions(+) create mode 100644 releasenotes/notes/cis-hardening-no-longer-sets-permissions-on-logs-81a48ab8ed2d6b5f.yaml diff --git a/etc/kayobe/inventory/group_vars/cis-hardening/cis b/etc/kayobe/inventory/group_vars/cis-hardening/cis index 37d01492b..2c103cb34 100644 --- a/etc/kayobe/inventory/group_vars/cis-hardening/cis +++ b/etc/kayobe/inventory/group_vars/cis-hardening/cis @@ -51,6 +51,9 @@ rhel9cis_rule_6_1_15: false # filesystem. We do not want to change /var/lib/docker permissions. 
rhel9cis_no_world_write_adjust: false +# Prevent hardening from recursively changing permissions on log files +rhel9cis_rule_4_2_3: false + # Configure log rotation to prevent audit logs from filling the disk rhel9cis_auditd: space_left_action: syslog @@ -153,6 +156,9 @@ ubtu22cis_no_owner_adjust: false ubtu22cis_no_world_write_adjust: false ubtu22cis_suid_adjust: false +# Prevent hardening from recursively changing permissions on log files +ubtu22cis_rule_4_2_3: false + # Configure log rotation to prevent audit logs from filling the disk ubtu22cis_auditd: action_mail_acct: root diff --git a/releasenotes/notes/cis-hardening-no-longer-sets-permissions-on-logs-81a48ab8ed2d6b5f.yaml b/releasenotes/notes/cis-hardening-no-longer-sets-permissions-on-logs-81a48ab8ed2d6b5f.yaml new file mode 100644 index 000000000..e50b5b62b --- /dev/null +++ b/releasenotes/notes/cis-hardening-no-longer-sets-permissions-on-logs-81a48ab8ed2d6b5f.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + The CIS hardening scripts no longer change permissions of log files by + default. It is preferred to configure these permissions at source i.e. on + whatever is creating the files. It also suffered from a time-of-check to + time-of-use race condition. If you want the old behaviour you can change + ``rhel9cis_rule_4_2_3`` and/or ``ubtu22cis_rule_4_2_3`` to ``true``. 
From a3c08c09417db78c85b203e6c0d6b21dbc597b1b Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Fri, 8 Nov 2024 11:29:27 +0000 Subject: [PATCH 16/21] Add upgrade host configure warning for ceph nodes --- doc/source/operations/upgrading-openstack.rst | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/operations/upgrading-openstack.rst b/doc/source/operations/upgrading-openstack.rst index 23fc8b67f..700f093f7 100644 --- a/doc/source/operations/upgrading-openstack.rst +++ b/doc/source/operations/upgrading-openstack.rst @@ -969,17 +969,27 @@ would be applied: kayobe overcloud host configure --check --diff When ready to apply the changes, it may be advisable to do so in batches, or at -least start with a small number of hosts.: +least start with a small number of hosts: .. code-block:: console kayobe overcloud host configure --limit -Alternatively, to apply the configuration to all hosts: -.. code-block:: console +.. warning:: + + Take extra care when configuring Ceph hosts. Set the hosts to maintenance + mode before reconfiguring them, and unset when done: + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ceph-enter-maintenance.yml --limit + kayobe overcloud host configure --limit + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ceph-exit-maintenance.yml --limit - kayobe overcloud host configure + **Always** reconfigure hosts in small batches or one-by-one. Check the Ceph + state after each host configuration. Ensure all warnings and errors are + resolved before moving on. .. 
_building_ironic_deployment_images: From 1bdc619db52c327d3ed212c8ca763ad55711431d Mon Sep 17 00:00:00 2001 From: Jake Hutchinson <39007539+assumptionsandg@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:05:29 +0000 Subject: [PATCH 17/21] Update Blazar image (#1334) Switch to using the StackHPC fork - master branch - when building blazar --- etc/kayobe/kolla-image-tags.yml | 3 +++ etc/kayobe/kolla.yml | 4 ++++ releasenotes/notes/update-blazar-image-d176c27d55716469.yaml | 5 +++++ 3 files changed, 12 insertions(+) create mode 100644 releasenotes/notes/update-blazar-image-d176c27d55716469.yaml diff --git a/etc/kayobe/kolla-image-tags.yml b/etc/kayobe/kolla-image-tags.yml index 7132a6d1a..11faa6a44 100644 --- a/etc/kayobe/kolla-image-tags.yml +++ b/etc/kayobe/kolla-image-tags.yml @@ -8,6 +8,9 @@ kolla_image_tags: ubuntu-jammy: 2023.1-ubuntu-jammy-20240621T104542 bifrost_deploy: ubuntu-jammy: 2023.1-ubuntu-jammy-20240423T125905 + blazar: + rocky-9: 2023.1-rocky-9-20241107T140552 + ubuntu-jammy: 2023.1-ubuntu-jammy-20241107T140552 cinder: ubuntu-jammy: 2023.1-ubuntu-jammy-20240701T123544 glance: diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index ed5ef31e9..0f69948e6 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -159,6 +159,10 @@ kolla_sources: type: git location: https://github.com/stackhpc/octavia.git reference: stackhpc/{{ openstack_release }} + blazar-base: + type: git + location: https://github.com/stackhpc/blazar + reference: stackhpc/master ############################################################################### # Kolla image build configuration. 
diff --git a/releasenotes/notes/update-blazar-image-d176c27d55716469.yaml b/releasenotes/notes/update-blazar-image-d176c27d55716469.yaml new file mode 100644 index 000000000..7e53b3543 --- /dev/null +++ b/releasenotes/notes/update-blazar-image-d176c27d55716469.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Use the StackHPC fork for building Blazar images with customizations to support + flavor-based reservation. From f21cbf55cea2a0f61005168295a6e2e4060307ab Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Wed, 13 Nov 2024 15:44:35 +0000 Subject: [PATCH 18/21] Pin os-capacity to v0.5 release (#1365) Pin os-capacity to v0.5 release --------- Co-authored-by: Jake Hutchinson --- etc/kayobe/ansible/deploy-os-capacity-exporter.yml | 2 +- etc/kayobe/stackhpc-monitoring.yml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml index 2632cab30..f0a2f7c9c 100644 --- a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml +++ b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml @@ -60,7 +60,7 @@ - name: Ensure os_capacity container is running community.docker.docker_container: name: os_capacity - image: ghcr.io/stackhpc/os-capacity:master + image: ghcr.io/stackhpc/os-capacity:{{ stackhpc_os_capacity_version }} env: OS_CLOUD: openstack OS_CLIENT_CONFIG_FILE: /etc/openstack/clouds.yaml diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index e2377a13e..66354d974 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -34,6 +34,9 @@ alertmanager_packet_errors_threshold: 1 # targets being templated during deployment. stackhpc_enable_os_capacity: true +# OpenStack Capacity exporter version +stackhpc_os_capacity_version: v0.5 + # Path to a CA certificate file to trust in the OpenStack Capacity exporter. 
stackhpc_os_capacity_openstack_cacert: "" From b9fde8b40708f685499c95a1f9bd0ea172258edb Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 20 Nov 2024 23:00:02 +0100 Subject: [PATCH 19/21] docs: fix link to release train page --- doc/source/configuration/release-train.rst | 2 ++ doc/source/operations/upgrading.rst | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/configuration/release-train.rst b/doc/source/configuration/release-train.rst index 0d62fadfd..f77109aff 100644 --- a/doc/source/configuration/release-train.rst +++ b/doc/source/configuration/release-train.rst @@ -1,3 +1,5 @@ +.. _stackhpc_release_train: + ====================== StackHPC Release Train ====================== diff --git a/doc/source/operations/upgrading.rst b/doc/source/operations/upgrading.rst index 218c39bb1..b8d468b44 100644 --- a/doc/source/operations/upgrading.rst +++ b/doc/source/operations/upgrading.rst @@ -363,9 +363,8 @@ To upgrade the Ansible control host: Syncing Release Train artifacts ------------------------------- -New `StackHPC Release Train <../configuration/release-train>`_ content should -be synced to the local Pulp server. This includes host packages (Deb/RPM) and -container images. +New :ref:`stackhpc_release_train` content should be synced to the local Pulp +server. This includes host packages (Deb/RPM) and container images. .. _sync-rt-package-repos: From 6975cb113330edb28d0dae8839e8ffb89f336a99 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Fri, 22 Nov 2024 09:47:22 +0100 Subject: [PATCH 20/21] Bump kayobe-automation This is required to fix some CI jobs [1]. 
[1] https://github.com/stackhpc/kayobe-automation/pull/69 --- .automation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.automation b/.automation index b00f285be..a7de3abb3 160000 --- a/.automation +++ b/.automation @@ -1 +1 @@ -Subproject commit b00f285be240e34c643c4bd93a877e56587f71fa +Subproject commit a7de3abb3f0bf529e78c4ba9ad1cbe26d356dd3b From 94c132c6fbe7283303aa2095e7763457051418b9 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Mon, 25 Nov 2024 11:05:02 +0100 Subject: [PATCH 21/21] Rebuild Blazar images for 2024.1 --- etc/kayobe/kolla-image-tags.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/etc/kayobe/kolla-image-tags.yml b/etc/kayobe/kolla-image-tags.yml index 68c331aba..9919548cb 100644 --- a/etc/kayobe/kolla-image-tags.yml +++ b/etc/kayobe/kolla-image-tags.yml @@ -6,6 +6,9 @@ kolla_image_tags: openstack: rocky-9: 2024.1-rocky-9-20240903T113235 ubuntu-jammy: 2024.1-ubuntu-jammy-20240917T091559 + blazar: + rocky-9: 2024.1-rocky-9-20241125T093138 + ubuntu-jammy: 2024.1-ubuntu-jammy-20241125T093138 heat: rocky-9: 2024.1-rocky-9-20240805T142526 nova: