[azure][feat] Rewrite metrics collection to make it better (#2298)

Co-authored-by: Matthias Veit <[email protected]>
someengineering · Dec 12, 2024 · dd5ed52 · dd5ed52
1 parent 8fe3371
commit dd5ed52
Show file tree

Hide file tree

Showing 11 changed files with 435 additions and 405 deletions.
diff --git a/plugins/azure/fix_plugin_azure/collector.py b/plugins/azure/fix_plugin_azure/collector.py
@@ -39,6 +39,7 @@
     MicrosoftGraphOrganizationRoot,
 )
 from fix_plugin_azure.resource.monitor import resources as monitor_resources
+from fix_plugin_azure.resource.metrics import AzureMetricData
 from fix_plugin_azure.resource.mysql import AzureMysqlServerType, resources as mysql_resources
 from fix_plugin_azure.resource.network import (
     AzureNetworkExpressRoutePortsLocation,
@@ -159,6 +160,14 @@ def get_last_run() -> Optional[datetime]:
             for after_collect in builder.after_collect_actions:
                 after_collect()
 
+            if builder.config.collect_usage_metrics:
+                try:
+                    log.info(f"[Azure:{self.account.safe_name}] Collect usage metrics.")
+                    self.collect_usage_metrics(builder)
+                    builder.executor.wait_for_submitted_work()
+                except Exception as e:
+                    log.warning(f"[Azure] Failed to collect usage metrics in project {self.account.safe_name}: {e}")
+
             # connect nodes
             log.info(f"[Azure:{self.account.safe_name}] Connect resources and create edges.")
             for node, data in list(self.graph.nodes(data=True)):
@@ -184,6 +193,12 @@ def get_last_run() -> Optional[datetime]:
             self.core_feedback.progress_done(self.account.id, 1, 1, context=[self.cloud.id])
             log.info(f"[Azure:{self.account.safe_name}] Collecting resources done.")
 
+    def collect_usage_metrics(self, builder: GraphBuilder) -> None:
+        for resource in builder.graph.nodes:
+            if isinstance(resource, MicrosoftResource) and (mq := resource.collect_usage_metrics(builder)):
+                start_at = builder.created_at - builder.metrics_delta
+                AzureMetricData.query_for(builder, resource, mq, start_at, builder.created_at)
+
     def collect_resource_list(
         self, name: str, builder: GraphBuilder, resources: List[Type[MicrosoftResource]]
     ) -> Future[None]:

diff --git a/plugins/azure/fix_plugin_azure/resource/base.py b/plugins/azure/fix_plugin_azure/resource/base.py
@@ -3,9 +3,10 @@
 import logging
 from concurrent.futures import Future
 from datetime import datetime, timedelta
-from typing import Any, ClassVar, Dict, Optional, TypeVar, List, Type, Callable, cast, Union, Set
+from typing import Any, ClassVar, Dict, Optional, TypeVar, List, Type, Tuple, Callable, cast, Union, Set
 
 from attr import define, field
+from attrs import frozen
 from azure.identity import DefaultAzureCredential
 
 from fix_plugin_azure.azure_client import AzureResourceSpec, MicrosoftClient, MicrosoftRestSpec
@@ -20,6 +21,9 @@
     BaseRegion,
     ModelReference,
     PhantomBaseResource,
+    StatName,
+    MetricName,
+    MetricUnit,
 )
 from fixlib.config import current_config
 from fixlib.core.actions import CoreFeedback
@@ -187,12 +191,9 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None:
         # Default behavior: add resource to the namespace
         pass
 
-    @classmethod
-    def collect_usage_metrics(
-        cls: Type[MicrosoftResourceType], builder: GraphBuilder, collected_resources: List[MicrosoftResourceType]
-    ) -> None:
+    def collect_usage_metrics(self, builder: GraphBuilder) -> List[AzureMetricQuery]:
         # Default behavior: do nothing
-        pass
+        return []
 
     @classmethod
     def collect_resources(
@@ -203,13 +204,7 @@ def collect_resources(
         if spec := cls.api_spec:
             try:
                 items = builder.client.list(spec, **kwargs)
-                collected = cls.collect(items, builder)
-                if builder.config.collect_usage_metrics:
-                    try:
-                        cls.collect_usage_metrics(builder, collected)
-                    except Exception as e:
-                        log.warning(f"Failed to collect usage metrics for {cls.__name__}: {e}")
-                return collected
+                return cls.collect(items, builder)
             except Exception as e:
                 msg = f"Error while collecting {cls.__name__} with service {spec.service} and location: {builder.location}: {e}"
                 builder.core_feedback.info(msg, log)
@@ -1008,6 +1003,66 @@ def with_location(self, location: BaseRegion) -> GraphBuilder:
         )
 
 
+STAT_MAP: Dict[str, StatName] = {
+    "minimum": StatName.min,
+    "average": StatName.avg,
+    "maximum": StatName.max,
+}
+
+
+@frozen(kw_only=True)
+class MetricNormalization:
+    unit: MetricUnit
+    normalize_value: Callable[[float], float] = lambda x: x
+
+
+@define(hash=True, frozen=True)
+class AzureMetricQuery:
+    metric_name: str
+    metric_namespace: str
+    metric_normalization_name: MetricName
+    ref_id: str
+    instance_id: str
+    metric_id: str
+    aggregation: Tuple[str, ...]
+    normalization: MetricNormalization
+    # Optional `start_time` and `period` override defaults for query timespan and interval.
+    period: Optional[timedelta] = None
+    start_time: Optional[datetime] = None
+    unit: str = "Count"
+
+    @staticmethod
+    def create(
+        *,
+        metric_name: str,
+        metric_namespace: str,
+        metric_normalization_name: MetricName,
+        instance_id: str,
+        ref_id: str,
+        normalization: MetricNormalization,
+        aggregation: Tuple[str, ...],
+        unit: str = "Count",
+        start_time: Optional[datetime] = None,
+        period: Optional[timedelta] = None,
+        metric_id: Optional[str] = None,
+    ) -> "AzureMetricQuery":
+        metric_id = f"{instance_id}/providers/Microsoft.Insights/metrics/{metric_name}"
+        # noinspection PyTypeChecker
+        return AzureMetricQuery(
+            metric_name=metric_name,
+            metric_namespace=metric_namespace,
+            metric_normalization_name=metric_normalization_name,
+            instance_id=instance_id,
+            metric_id=metric_id,
+            aggregation=aggregation,
+            ref_id=ref_id,
+            unit=unit,
+            normalization=normalization,
+            period=period,
+            start_time=start_time,
+        )
+
+
 resources: List[Type[MicrosoftResource]] = [
     AzureResourceGroup,
 ]