From 71c422fb21883f5a8b17d7420c42e37d53f1ae17 Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Wed, 26 Jun 2024 15:12:11 -0400 Subject: [PATCH] add prometheus scrape jobs for all units (#264) updates the scrape job generated by charm.py to include all units, not just the leader, to enable prometheus to scrape metrics from all alertmanager units. Also clarifies the terminology used when talking about netlocs, hostnames, etc. --- src/alertmanager.py | 14 +++-- src/charm.py | 68 ++++++++++++------------ tests/unit/test_self_scrape_jobs.py | 81 +++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 42 deletions(-) create mode 100644 tests/unit/test_self_scrape_jobs.py diff --git a/src/alertmanager.py b/src/alertmanager.py index c2f87311..6f54f7fc 100644 --- a/src/alertmanager.py +++ b/src/alertmanager.py @@ -79,7 +79,7 @@ def __init__( charm, *, container_name: str, - peer_addresses: List[str], + peer_netlocs: List[str], api_port: int, ha_port: int, web_external_url: str, @@ -96,7 +96,7 @@ def __init__( self._service_name = self._container_name = container_name self._container = charm.unit.get_container(container_name) - self._peer_addresses = peer_addresses + self._peer_netlocs = peer_netlocs self._api_port = api_port self._ha_port = ha_port @@ -169,17 +169,15 @@ def _alertmanager_layer(self) -> Layer: def _command(): """Returns full command line to start alertmanager.""" - # cluster listen address - empty string disables HA mode - listen_address_arg = ( - "" if len(self._peer_addresses) == 0 else f"0.0.0.0:{self._ha_port}" - ) + # cluster listen netloc - empty string disables HA mode + listen_netloc_arg = "" if len(self._peer_netlocs) == 0 else f"0.0.0.0:{self._ha_port}" # The chosen port in the cluster.listen-address flag is the port that needs to be # specified in the cluster.peer flag of the other peers. # Assuming all replicas use the same port. # Sorting for repeatability in comparing between service layers. peer_cmd_args = " ".join( - sorted([f"--cluster.peer={address}" for address in self._peer_addresses]) + sorted([f"--cluster.peer={netloc}" for netloc in self._peer_netlocs]) ) web_config_arg = ( f"--web.config.file={self._web_config_path} " if self._is_tls_enabled() else "" @@ -189,7 +187,7 @@ def _command(): f"--config.file={self._config_path} " f"--storage.path={self._storage_path} " f"--web.listen-address=:{self._api_port} " - f"--cluster.listen-address={listen_address_arg} " + f"--cluster.listen-address={listen_netloc_arg} " f"--web.external-url={self._web_external_url} " f"{web_config_arg}" f"{peer_cmd_args}" diff --git a/src/charm.py b/src/charm.py index 676af659..23aaebe0 100755 --- a/src/charm.py +++ b/src/charm.py @@ -159,10 +159,15 @@ def __init__(self, *args): # Core lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) + peer_ha_netlocs = [ + f"{hostname}:{self._ports.ha}" + for hostname in self._get_peer_hostnames(include_this_unit=False) + ] + self.alertmanager_workload = WorkloadManager( self, container_name=self._container_name, - peer_addresses=self._get_peer_addresses(), + peer_netlocs=peer_ha_netlocs, api_port=self.api_port, ha_port=self._ports.ha, web_external_url=self._internal_url, @@ -240,16 +245,15 @@ def self_scraping_job(self): # This assumption is necessary because the local CA signs CSRs with FQDN as the SAN DNS. # If prometheus were to scrape an ingress URL instead, it would error out with: # x509: cannot validate certificate. - metrics_endpoint = urlparse(self._internal_url.rstrip("/") + "/metrics") - metrics_path = metrics_endpoint.path - # Render a ':port' section only if it is explicit (e.g. 9093; without an explicit port, the - # port is deduced from the scheme). - port_str = (":" + str(metrics_endpoint.port)) if metrics_endpoint.port is not None else "" - target = f"{metrics_endpoint.hostname}{port_str}" + peer_api_netlocs = [ + f"{hostname}:{self._ports.api}" + for hostname in self._get_peer_hostnames(include_this_unit=True) + ] + config = { - "scheme": metrics_endpoint.scheme, - "metrics_path": metrics_path, - "static_configs": [{"targets": [target]}], + "scheme": self._scheme, + "metrics_path": "/metrics", + "static_configs": [{"targets": peer_api_netlocs}], } return [config] @@ -526,29 +530,25 @@ def _update_ca_certs(self): ca_cert_path.unlink(missing_ok=True) subprocess.run(["update-ca-certificates", "--fresh"], check=True) - def _get_peer_addresses(self) -> List[str]: - """Create a list of HA addresses of all peer units (all units excluding current). + def _get_peer_hostnames(self, include_this_unit=True) -> List[str]: + """Returns a list of the hostnames of the peer units. - The returned addresses include the hostname, HA port number and path, but do not include - scheme (http). + An example of the return format is: + ["alertmanager-1.alertmanager-endpoints.am.svc.cluster.local"] """ addresses = [] + if include_this_unit: + addresses.append(self._internal_url) if pr := self.peer_relation: for unit in pr.units: # pr.units only holds peers (self.unit is not included) - if api_url := pr.data[unit].get("private_address"): - parsed = urlparse(api_url) - if not (parsed.scheme in ["http", "https"] and parsed.hostname): - # This shouldn't happen - logger.error( - "Invalid peer address in relation data: '%s'; skipping. " - "Address must include scheme (http or https) and hostname.", - api_url, - ) - continue - # Drop scheme and replace API port with HA port - addresses.append(f"{parsed.hostname}:{self._ports.ha}{parsed.path}") - - return addresses + if address := pr.data[unit].get("private_address"): + addresses.append(address) + + # Save only the hostname part of the address + # Sort the hostnames in case their order is not guaranteed, to reduce unnecessary updates + hostnames = sorted([urlparse(address).hostname for address in addresses]) + + return hostnames def _is_tls_ready(self) -> bool: """Returns True if the workload is ready to operate in TLS mode.""" @@ -561,18 +561,18 @@ def _is_tls_ready(self) -> bool: @property def _internal_url(self) -> str: - """Return the fqdn dns-based in-cluster (private) address of the alertmanager api server. - - If an external (public) url is set, add in its path. - """ - scheme = "https" if self._is_tls_ready() else "http" - return f"{scheme}://{socket.getfqdn()}:{self._ports.api}" + """Return the fqdn dns-based in-cluster (private) address of the alertmanager api server.""" + return f"{self._scheme}://{socket.getfqdn()}:{self._ports.api}" @property def _external_url(self) -> str: """Return the externally-reachable (public) address of the alertmanager api server.""" return self.ingress.url or self._internal_url + @property + def _scheme(self) -> str: + return "https" if self._is_tls_ready() else "http" + @property def tracing_endpoint(self) -> Optional[str]: """Otlp http endpoint for charm instrumentation.""" diff --git a/tests/unit/test_self_scrape_jobs.py b/tests/unit/test_self_scrape_jobs.py new file mode 100644 index 00000000..67c7005a --- /dev/null +++ b/tests/unit/test_self_scrape_jobs.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +import unittest +from unittest.mock import PropertyMock, patch + +from alertmanager import WorkloadManager +from charm import AlertmanagerCharm +from helpers import k8s_resource_multipatch +from ops.testing import Harness + + +class TestWithInitialHooks(unittest.TestCase): + container_name: str = "alertmanager" + + @patch("lightkube.core.client.GenericSyncClient") + @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", "")) + @k8s_resource_multipatch + @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0")) + def setUp(self, *unused): + self.harness = Harness(AlertmanagerCharm) + self.addCleanup(self.harness.cleanup) + + self.harness.set_leader(True) + self.app_name = "am" + # Create the peer relation before running harness.begin_with_initial_hooks(), because + # otherwise it will create it for you and we don't know the rel_id + self.peer_rel_id = self.harness.add_relation("replicas", self.app_name) + + self.harness.begin_with_initial_hooks() + + @patch.object(AlertmanagerCharm, "_internal_url", new_callable=PropertyMock) + @patch.object(AlertmanagerCharm, "_scheme", new_callable=PropertyMock) + def test_self_scraping_job_with_no_peers(self, _mock_scheme, _mock_internal_url): + scheme = "https" + _mock_scheme.return_value = scheme + url_no_scheme = f"test-internal.url:{self.harness.charm._ports.api}" + _mock_internal_url.return_value = f"{scheme}://{url_no_scheme}" + jobs_expected = [ + { + "metrics_path": "/metrics", + "scheme": scheme, + "static_configs": [{"targets": [url_no_scheme]}], + } + ] + + jobs = self.harness.charm.self_scraping_job + self.assertEqual(jobs, jobs_expected) + + @patch.object(AlertmanagerCharm, "_internal_url", new_callable=PropertyMock) + @patch.object(AlertmanagerCharm, "_scheme", new_callable=PropertyMock) + def test_self_scraping_job_with_peers(self, _mock_scheme, _mock_internal_url): + scheme = "https" + _mock_scheme.return_value = scheme + + targets = [ + f"test-internal-0.url:{self.harness.charm._ports.api}", + f"test-internal-1.url:{self.harness.charm._ports.api}", + f"test-internal-2.url:{self.harness.charm._ports.api}", + ] + metrics_path = "/metrics" + _mock_internal_url.return_value = f"{scheme}://{targets[0]}" + + jobs_expected = [ + { + "metrics_path": metrics_path, + "scheme": scheme, + "static_configs": [{"targets": targets}], + } + ] + + # Add peers + for i, target in enumerate(targets[1:], 1): + unit_name = f"{self.app_name}/{i}" + self.harness.add_relation_unit(self.peer_rel_id, unit_name) + self.harness.update_relation_data( + self.peer_rel_id, unit_name, {"private_address": f"{scheme}://{target}"} + ) + + jobs = self.harness.charm.self_scraping_job + self.assertEqual(jobs_expected, jobs)