cta-observatory · TjarkMiener · Oct 25, 2024 · Oct 25, 2024 · Oct 28, 2024 · Oct 28, 2024
diff --git a/docs/changes/2628.features.rst b/docs/changes/2628.features.rst
@@ -0,0 +1 @@
+Add a generic stats-calculation tool utilizing the PixelStatisticsCalculator.
diff --git a/docs/user-guide/tools.rst b/docs/user-guide/tools.rst
@@ -17,6 +17,7 @@ Data Processing Tools
 * ``ctapipe-quickstart``: create some default analysis configurations and a working directory
 * ``ctapipe-process``: Process event data in any supported format from R0/R1/DL0 to DL1 or DL2 HDF5 files.
 * ``ctapipe-apply-models``: Tool to apply machine learning models in bulk (as opposed to event by event).
+* ``ctapipe-stats-calculation``: Tool to aggregate statistics and detect outliers from DL1a image data.
 * ``ctapipe-train-disp-reconstructor`` : Train the ML models for the  `ctapipe.reco.DispReconstructor` (monoscopic reconstruction)
 * ``ctapipe-train-energy-regressor``:  Train the ML models for the `ctapipe.reco.EnergyRegressor` (energy estimation)
 * ``ctapipe-train-particle-classifier``: Train the ML models for the  `ctapipe.reco.ParticleClassifier` (gamma-hadron separation)

diff --git a/pyproject.toml b/pyproject.toml
@@ -99,6 +99,7 @@ ctapipe-process = "ctapipe.tools.process:main"
 ctapipe-merge = "ctapipe.tools.merge:main"
 ctapipe-fileinfo = "ctapipe.tools.fileinfo:main"
 ctapipe-quickstart = "ctapipe.tools.quickstart:main"
+ctapipe-stats-calculation = "ctapipe.tools.stats_calculation:main"
 ctapipe-train-energy-regressor = "ctapipe.tools.train_energy_regressor:main"
 ctapipe-train-particle-classifier = "ctapipe.tools.train_particle_classifier:main"
 ctapipe-train-disp-reconstructor = "ctapipe.tools.train_disp_reconstructor:main"

diff --git a/src/ctapipe/resources/stats_calc_config.yaml b/src/ctapipe/resources/stats_calc_config.yaml
@@ -0,0 +1,37 @@
+StatisticsCalculatorTool:
+  allowed_tels: [1,2,3,4]
+  dl1a_column_name: "image"
+  output_column_name: "statistics"
+
+PixelStatisticsCalculator:
+  stats_aggregator_type: [["type", "*", "SigmaClippingAggregator"]]
+  chunk_shift: 1000
+  faulty_pixels_fraction: 0.1
+  outlier_detector_list: [
+        {
+            "apply_to": "median",
+            "name": "MedianOutlierDetector",
+            "config": {
+                "median_range_factors": [-15, 15],
+            },
+        },
+        {
+            "apply_to": "median",
+            "name": "RangeOutlierDetector",
+            "config": {
+                "validity_range": [-20, 120],
+            }
+        },
+        {
+            "apply_to": "std",
+            "name": "StdOutlierDetector",
+            "config": {
+                "std_range_factors": [-15, 15],
+            },
+        }
+    ]
+
+SigmaClippingAggregator:
+  chunk_size: 2500
+  max_sigma: 4
+  iterations: 5
diff --git a/src/ctapipe/tools/quickstart.py b/src/ctapipe/tools/quickstart.py
@@ -15,6 +15,7 @@
     "stage1_config.yaml",
     "stage2_config.yaml",
     "ml_preprocessing_config.yaml",
+    "stats_calc_config.yaml",
     "train_energy_regressor.yaml",
     "train_particle_classifier.yaml",
     "train_disp_reconstructor.yaml",

diff --git a/src/ctapipe/tools/stats_calculation.py b/src/ctapipe/tools/stats_calculation.py
@@ -0,0 +1,179 @@
+"""
+Perform statistics calculation from DL1a image data
+"""
+
+import pathlib
+
+import numpy as np
+from astropy.table import vstack
+
+from ctapipe.core import Tool
+from ctapipe.core.tool import ToolConfigurationError
+from ctapipe.core.traits import (
+    Bool,
+    CaselessStrEnum,
+    CInt,
+    Path,
+    Set,
+    Unicode,
+    classes_with_traits,
+)
+from ctapipe.instrument import SubarrayDescription
+from ctapipe.io import write_table
+from ctapipe.io.tableloader import TableLoader
+from ctapipe.monitoring.calculator import PixelStatisticsCalculator
+
+
+class StatisticsCalculatorTool(Tool):
+    """
+    Perform statistics calculation for DL1a image data
+    """
+
+    name = "StatisticsCalculatorTool"
+    description = "Perform statistics calculation for DL1a image data"
+
+    examples = """
+    To calculate statistics of DL1a image data files:
+
+    > ctapipe-stats-calculation --input_url input.dl1.h5 --output_path /path/monitoring.h5 --overwrite
+
+    """
+
+    input_url = Path(
+        help="Input CTA HDF5 files including DL1a image data",
+        allow_none=True,
+        exists=True,
+        directory_ok=False,
+        file_ok=True,
+    ).tag(config=True)
+
+    allowed_tels = Set(
+        trait=CInt(),
+        default_value=None,
+        allow_none=True,
+        help=(
+            "List of allowed tel_ids, others will be ignored. "
+            "If None, all telescopes in the input stream will be included."
+        ),
+    ).tag(config=True)
+
+    dl1a_column_name = CaselessStrEnum(
+        ["image", "peak_time", "variance"],
+        default_value="image",
+        allow_none=False,
+        help="Column name of the DL1a image data to calculate statistics",
+    ).tag(config=True)
+
+    output_column_name = Unicode(
+        default_value="statistics",
+        allow_none=False,
+        help="Column name of the output statistics",
+    ).tag(config=True)
+
+    output_path = Path(
+        help="Output filename", default_value=pathlib.Path("monitoring.h5")
+    ).tag(config=True)
+
+    overwrite = Bool(help="Overwrite output file if it exists").tag(config=True)
+
+    aliases = {
+        ("i", "input_url"): "StatisticsCalculatorTool.input_url",
+        ("o", "output_path"): "StatisticsCalculatorTool.output_path",
+    }
+
+    flags = {
+        "overwrite": (
+            {"StatisticsCalculatorTool": {"overwrite": True}},
+            "Overwrite existing files",
+        ),
+    }
+
+    classes = classes_with_traits(PixelStatisticsCalculator)
+
+    def setup(self):
+        # Check that the input and output files are not the same
+        if self.input_url == self.output_path:
+            raise ToolConfigurationError(
+                "Input and output files are same. Fix your configuration / cli arguments."
+            )
+
+        # Load the subarray description from the input file
+        subarray = SubarrayDescription.from_hdf(self.input_url)
+        # Initialization of the statistics calculator
+        self.stats_calculator = PixelStatisticsCalculator(
+            parent=self, subarray=subarray
+        )
+        # Read the input data with the 'TableLoader'
+        input_data = TableLoader(input_url=self.input_url)
+        # Get the telescope ids from the input data or use the allowed_tels configuration
+        tel_ids = subarray.tel_ids if self.allowed_tels is None else self.allowed_tels
+        # Read the whole dl1 images
+        self.dl1_tables = input_data.read_telescope_events_by_id(
+            telescopes=tel_ids,
+            dl1_images=True,
+            dl1_parameters=False,
+            dl1_muons=False,
+            dl2=False,
+            simulated=False,
+            true_images=False,
+            true_parameters=False,
+            instrument=False,
+            pointing=False,
+        )
+
+    def start(self):
+        # Iterate over the telescope ids and their corresponding dl1 tables
+        for tel_id, dl1_table in self.dl1_tables.items():
+            # Perform the first pass of the statistics calculation
+            aggregated_stats = self.stats_calculator.first_pass(
+                table=dl1_table,
+                tel_id=tel_id,
+                col_name=self.dl1a_column_name,
+            )
+            # Check if 'chunk_shift' is selected
+            if self.stats_calculator.chunk_shift is not None:
+                # Check if there are any faulty chunks to perform a second pass over the data
+                if np.any(~aggregated_stats["is_valid"].data):
+                    # Perform the second pass of the statistics calculation
+                    aggregated_stats_secondpass = self.stats_calculator.second_pass(
+                        table=dl1_table,
+                        valid_chunks=aggregated_stats["is_valid"].data,
+                        tel_id=tel_id,
+                        col_name=self.dl1a_column_name,
+                    )
+                    # Stack the statistic values from the first and second pass
+                    aggregated_stats = vstack(
+                        [aggregated_stats, aggregated_stats_secondpass]
+                    )
+                    # Sort the stacked aggregated statistic values by starting time
+                    aggregated_stats.sort(["time_start"])
+                else:
+                    self.log.info(
+                        "No faulty chunks found for telescope 'tel_id=%d'. Skipping second pass.",
+                        tel_id,
+                    )
+            # Write the aggregated statistics and their outlier mask to the output file
+            write_table(
+                aggregated_stats,
+                self.output_path,
+                f"/dl1/monitoring/telescope/{self.output_column_name}/tel_{tel_id:03d}",
+                overwrite=self.overwrite,
+            )
+
+    def finish(self):
+        self.log.info(
+            "DL1 monitoring data was stored in '%s' under '%s'",
+            self.output_path,
+            f"/dl1/monitoring/telescope/{self.output_column_name}",
+        )
+        self.log.info("Tool is shutting down")
+
+
+def main():
+    # Run the tool
+    tool = StatisticsCalculatorTool()
+    tool.run()
+
+
+if __name__ == "main":
+    main()
diff --git a/src/ctapipe/tools/tests/test_stats_calc.py b/src/ctapipe/tools/tests/test_stats_calc.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+Test ctapipe-stats-calculation tool
+"""
+
+from traitlets.config.loader import Config
+
+from ctapipe.core import run_tool
+from ctapipe.io import read_table
+from ctapipe.tools.stats_calculation import StatisticsCalculatorTool
+
+
+def test_stats_calc_tool(tmp_path, dl1_image_file):
+    """check statistics calculation from DL1a files"""
+
+    # Create a configuration suitable for the test
+    tel_id = 3
+    config = Config(
+        {
+            "StatisticsCalculatorTool": {
+                "allowed_tels": [tel_id],
+                "dl1a_column_name": "image",
+                "output_column_name": "statistics",
+            },
+            "PixelStatisticsCalculator": {
+                "stats_aggregator_type": [
+                    ("id", tel_id, "PlainAggregator"),
+                ],
+            },
+            "PlainAggregator": {
+                "chunk_size": 1,
+            },
+        }
+    )
+    # Set the output file path
+    monitoring_file = tmp_path / "monitoring.dl1.h5"
+    # Run the tool with the configuration and the input file
+    run_tool(
+        StatisticsCalculatorTool(config=config),
+        argv=[
+            f"--input_url={dl1_image_file}",
+            f"--output_path={monitoring_file}",
+            "--overwrite",
+        ],
+        cwd=tmp_path,
+        raises=True,
+    )
+    # Check that the output file has been created
+    assert monitoring_file.exists()
+    # Check that the output file is not empty
+    assert (
+        read_table(
+            monitoring_file,
+            path=f"/dl1/monitoring/telescope/statistics/tel_{tel_id:03d}",
+        )["mean"]
+        is not None
+    )