Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stats calc tool #2628

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions docs/changes/2628.features.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add a generic stats-calculation tool utilizing the PixelStatisticsCalculator.
1 change: 1 addition & 0 deletions docs/user-guide/tools.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Data Processing Tools
* ``ctapipe-quickstart``: create some default analysis configurations and a working directory
* ``ctapipe-process``: Process event data in any supported format from R0/R1/DL0 to DL1 or DL2 HDF5 files.
* ``ctapipe-apply-models``: Tool to apply machine learning models in bulk (as opposed to event by event).
* ``ctapipe-stats-calculation``: Tool to aggregate statistics and detect outliers from DL1a image data.
* ``ctapipe-train-disp-reconstructor`` : Train the ML models for the `ctapipe.reco.DispReconstructor` (monoscopic reconstruction)
* ``ctapipe-train-energy-regressor``: Train the ML models for the `ctapipe.reco.EnergyRegressor` (energy estimation)
* ``ctapipe-train-particle-classifier``: Train the ML models for the `ctapipe.reco.ParticleClassifier` (gamma-hadron separation)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ ctapipe-process = "ctapipe.tools.process:main"
ctapipe-merge = "ctapipe.tools.merge:main"
ctapipe-fileinfo = "ctapipe.tools.fileinfo:main"
ctapipe-quickstart = "ctapipe.tools.quickstart:main"
ctapipe-stats-calculation = "ctapipe.tools.stats_calculation:main"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer a verb here, like the other tools. E.g. ctapipe-calculate-pixel-statistics

ctapipe-train-energy-regressor = "ctapipe.tools.train_energy_regressor:main"
ctapipe-train-particle-classifier = "ctapipe.tools.train_particle_classifier:main"
ctapipe-train-disp-reconstructor = "ctapipe.tools.train_disp_reconstructor:main"
Expand Down
37 changes: 37 additions & 0 deletions src/ctapipe/resources/stats_calc_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
StatisticsCalculatorTool:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we don't use yaml for anything apart from the configurations, I suggest to rename the configuration file to just tool_name.yaml, i.e. stripping _config.

allowed_tels: [1,2,3,4]
dl1a_column_name: "image"
output_column_name: "statistics"

PixelStatisticsCalculator:
maxnoe marked this conversation as resolved.
Show resolved Hide resolved
stats_aggregator_type: [["type", "*", "SigmaClippingAggregator"]]
chunk_shift: 1000
faulty_pixels_fraction: 0.1
outlier_detector_list: [
maxnoe marked this conversation as resolved.
Show resolved Hide resolved
{
"apply_to": "median",
"name": "MedianOutlierDetector",
"config": {
"median_range_factors": [-15, 15],
},
},
{
"apply_to": "median",
"name": "RangeOutlierDetector",
"config": {
"validity_range": [-20, 120],
}
},
{
"apply_to": "std",
"name": "StdOutlierDetector",
"config": {
"std_range_factors": [-15, 15],
},
}
]

SigmaClippingAggregator:
chunk_size: 2500
max_sigma: 4
iterations: 5
1 change: 1 addition & 0 deletions src/ctapipe/tools/quickstart.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"stage1_config.yaml",
"stage2_config.yaml",
"ml_preprocessing_config.yaml",
"stats_calc_config.yaml",
"train_energy_regressor.yaml",
"train_particle_classifier.yaml",
"train_disp_reconstructor.yaml",
Expand Down
179 changes: 179 additions & 0 deletions src/ctapipe/tools/stats_calculation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""
Perform statistics calculation from DL1a image data
"""

import pathlib

import numpy as np
from astropy.table import vstack

from ctapipe.core import Tool
from ctapipe.core.tool import ToolConfigurationError
from ctapipe.core.traits import (
Bool,
CaselessStrEnum,
CInt,
Path,
Set,
Unicode,
classes_with_traits,
)
from ctapipe.instrument import SubarrayDescription
from ctapipe.io import write_table
from ctapipe.io.tableloader import TableLoader
from ctapipe.monitoring.calculator import PixelStatisticsCalculator


class StatisticsCalculatorTool(Tool):
"""
Perform statistics calculation for DL1a image data
"""

name = "StatisticsCalculatorTool"
description = "Perform statistics calculation for DL1a image data"

examples = """
To calculate statistics of DL1a image data files:

> ctapipe-stats-calculation --input_url input.dl1.h5 --output_path /path/monitoring.h5 --overwrite

"""

input_url = Path(
help="Input CTA HDF5 files including DL1a image data",
allow_none=True,
exists=True,
directory_ok=False,
file_ok=True,
).tag(config=True)

allowed_tels = Set(
trait=CInt(),
default_value=None,
allow_none=True,
help=(
"List of allowed tel_ids, others will be ignored. "
"If None, all telescopes in the input stream will be included."
),
).tag(config=True)

dl1a_column_name = CaselessStrEnum(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is DL1a/b is "official"? Also, I'd perhaps use generic input_column_name similar to the output one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, in ctapipe we use DL1_IMAGES and DL1_PARAMETERS to distinguish between things that are per-pixel vs. single quantities per event.

https://ctapipe.readthedocs.io/en/latest/api/ctapipe.io.DataLevel.html

Copy link
Member

@maxnoe maxnoe Oct 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also not make this an enum. In the generic tool, users should be able to chose any column that has compatible shape. Just provide a clear error when the column is not found in the input file.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could also be a list of columns, to compute on multiple at the same time.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I changed the column name and also polish the references to DL1a data by using pixel-wise image data which is more descriptive. ToolConfigurationError is raised once the column is not found. Having list of columns seems a little bit of an overkill here, which would just make the code more complex. Maybe the aggregation config could be shared between the columns, but especially the outlier detection will be different between the columns.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ToolConfigurationError is raised once the column is not found. Having list of columns seems a little bit of an overkill here, which would just make the code more complex. Maybe the aggregation config could be shared between the columns, but especially the outlier detection will be different between the columns.

I think the case where you only want to know about a single column is quite rare, you are usually interested in multiple. So having to read all data again to compute metrics on a new column seems very limiting and a loop over columns shouldn't make the code much more complex.

["image", "peak_time", "variance"],
default_value="image",
allow_none=False,
help="Column name of the DL1a image data to calculate statistics",
).tag(config=True)

output_column_name = Unicode(
default_value="statistics",
allow_none=False,
help="Column name of the output statistics",
).tag(config=True)

output_path = Path(
help="Output filename", default_value=pathlib.Path("monitoring.h5")
).tag(config=True)

overwrite = Bool(help="Overwrite output file if it exists").tag(config=True)

aliases = {
("i", "input_url"): "StatisticsCalculatorTool.input_url",
("o", "output_path"): "StatisticsCalculatorTool.output_path",
}

flags = {
"overwrite": (
{"StatisticsCalculatorTool": {"overwrite": True}},
"Overwrite existing files",
),
}

classes = classes_with_traits(PixelStatisticsCalculator)

def setup(self):
# Check that the input and output files are not the same
if self.input_url == self.output_path:
raise ToolConfigurationError(
"Input and output files are same. Fix your configuration / cli arguments."
)

# Load the subarray description from the input file
subarray = SubarrayDescription.from_hdf(self.input_url)
# Initialization of the statistics calculator
self.stats_calculator = PixelStatisticsCalculator(
parent=self, subarray=subarray
)
# Read the input data with the 'TableLoader'
input_data = TableLoader(input_url=self.input_url)
# Get the telescope ids from the input data or use the allowed_tels configuration
tel_ids = subarray.tel_ids if self.allowed_tels is None else self.allowed_tels
# Read the whole dl1 images
self.dl1_tables = input_data.read_telescope_events_by_id(
maxnoe marked this conversation as resolved.
Show resolved Hide resolved
telescopes=tel_ids,
dl1_images=True,
dl1_parameters=False,
dl1_muons=False,
dl2=False,
simulated=False,
true_images=False,
true_parameters=False,
instrument=False,
pointing=False,
)

def start(self):
# Iterate over the telescope ids and their corresponding dl1 tables
for tel_id, dl1_table in self.dl1_tables.items():
# Perform the first pass of the statistics calculation
aggregated_stats = self.stats_calculator.first_pass(
table=dl1_table,
tel_id=tel_id,
col_name=self.dl1a_column_name,
)
# Check if 'chunk_shift' is selected
if self.stats_calculator.chunk_shift is not None:
# Check if there are any faulty chunks to perform a second pass over the data
if np.any(~aggregated_stats["is_valid"].data):
# Perform the second pass of the statistics calculation
aggregated_stats_secondpass = self.stats_calculator.second_pass(
table=dl1_table,
valid_chunks=aggregated_stats["is_valid"].data,
tel_id=tel_id,
col_name=self.dl1a_column_name,
)
# Stack the statistic values from the first and second pass
aggregated_stats = vstack(
[aggregated_stats, aggregated_stats_secondpass]
)
# Sort the stacked aggregated statistic values by starting time
aggregated_stats.sort(["time_start"])
else:
self.log.info(
"No faulty chunks found for telescope 'tel_id=%d'. Skipping second pass.",
tel_id,
)
# Write the aggregated statistics and their outlier mask to the output file
maxnoe marked this conversation as resolved.
Show resolved Hide resolved
write_table(
aggregated_stats,
self.output_path,
f"/dl1/monitoring/telescope/{self.output_column_name}/tel_{tel_id:03d}",
overwrite=self.overwrite,
)

def finish(self):
self.log.info(
"DL1 monitoring data was stored in '%s' under '%s'",
self.output_path,
f"/dl1/monitoring/telescope/{self.output_column_name}",
)
self.log.info("Tool is shutting down")


def main():
# Run the tool
tool = StatisticsCalculatorTool()
tool.run()


if __name__ == "main":
main()
57 changes: 57 additions & 0 deletions src/ctapipe/tools/tests/test_stats_calc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""
Test ctapipe-stats-calculation tool
"""

from traitlets.config.loader import Config

from ctapipe.core import run_tool
from ctapipe.io import read_table
from ctapipe.tools.stats_calculation import StatisticsCalculatorTool


def test_stats_calc_tool(tmp_path, dl1_image_file):
"""check statistics calculation from DL1a files"""

# Create a configuration suitable for the test
tel_id = 3
config = Config(
{
"StatisticsCalculatorTool": {
"allowed_tels": [tel_id],
"dl1a_column_name": "image",
"output_column_name": "statistics",
},
"PixelStatisticsCalculator": {
"stats_aggregator_type": [
("id", tel_id, "PlainAggregator"),
],
},
"PlainAggregator": {
"chunk_size": 1,
},
}
)
# Set the output file path
monitoring_file = tmp_path / "monitoring.dl1.h5"
# Run the tool with the configuration and the input file
run_tool(
StatisticsCalculatorTool(config=config),
argv=[
f"--input_url={dl1_image_file}",
f"--output_path={monitoring_file}",
"--overwrite",
],
cwd=tmp_path,
raises=True,
)
# Check that the output file has been created
assert monitoring_file.exists()
# Check that the output file is not empty
assert (
read_table(
monitoring_file,
path=f"/dl1/monitoring/telescope/statistics/tel_{tel_id:03d}",
)["mean"]
is not None
)