diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 76eb5c78b..132de5c1d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,29 +17,28 @@ env: TQDM_MININTERVAL: 100 jobs: - build-and-test: name: Test pygama with Python runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ["3.9", "3.10", "3.11"] os: [ubuntu-latest, macOS-latest] steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Get dependencies and install the package - run: | - python -m pip install --upgrade pip wheel setuptools - python -m pip install --upgrade .[test] - - name: Run unit tests - run: | - python -m pytest + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Get dependencies and install the package + run: | + python -m pip install --upgrade pip wheel setuptools + python -m pip install --upgrade .[test] + - name: Run unit tests + run: | + python -m pytest test-coverage: name: Calculate and upload test coverage @@ -50,7 +49,7 @@ jobs: fetch-depth: 2 - uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: "3.10" - name: Generate Report run: | @@ -72,7 +71,7 @@ jobs: fetch-depth: 0 - uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: "3.10" - name: Setup build environment run: | sudo apt-get install -y pandoc diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 74f3d3661..6ffddcac6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,18 +36,18 @@ repos: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: "v3.15.0" + rev: "v3.15.2" hooks: - id: pyupgrade args: ["--py38-plus"] - repo: https://github.com/psf/black - rev: "23.12.1" + rev: "24.3.0" hooks: - id: black-jupyter - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.8.0" + rev: "v1.9.0" hooks: - id: mypy files: src @@ -57,14 +57,12 @@ repos: rev: "v2.4.0" hooks: - id: pycln - exclude: ^src/pygama/pargen args: [--all] - repo: https://github.com/PyCQA/flake8 - rev: "6.1.0" + rev: "7.0.0" hooks: - id: flake8 - exclude: ^src/pygama/pargen additional_dependencies: [ flake8-bugbear>=23.1.17, flake8-print, @@ -73,7 +71,7 @@ repos: args: ["--extend-ignore", "E203,E501"] - repo: https://github.com/kynan/nbstripout - rev: "0.6.1" + rev: "0.7.1" hooks: - id: nbstripout args: ["--drop-empty-cells", @@ -93,7 +91,7 @@ repos: - tomli - repo: https://github.com/shellcheck-py/shellcheck-py - rev: "v0.9.0.6" + rev: "v0.10.0.1" hooks: - id: shellcheck diff --git a/CITATION.cff b/CITATION.cff index 90524f881..1a416f427 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,7 +2,6 @@ cff-version: 1.2.0 title: pygama doi: https://doi.org/10.5281/zenodo.10614246 date-released: 2024-02-03 -url: https://github.com/github-linguist/linguist message: "If you use this software, please cite it as below." 
authors: - family-names: Agostini diff --git a/codecov.yml b/codecov.yml index 2e66c0446..55c040463 100644 --- a/codecov.yml +++ b/codecov.yml @@ -9,7 +9,7 @@ coverage: patch: false github_checks: - annotations: false + annotations: false comment: require_changes: true diff --git a/docs/source/conf.py b/docs/source/conf.py index 3bcd7d1fc..0b86f6627 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -76,7 +76,7 @@ autodoc_default_options = {"ignore-module-all": True} # Include __init__() docstring in class docstring autoclass_content = "both" -autodoc_typehints = "both" +autodoc_typehints = "description" autodoc_typehints_description_target = "documented_params" autodoc_typehints_format = "short" diff --git a/docs/source/extensions/numbadoc.py b/docs/source/extensions/numbadoc.py index 5b4202a10..f2f1a5dad 100644 --- a/docs/source/extensions/numbadoc.py +++ b/docs/source/extensions/numbadoc.py @@ -5,6 +5,7 @@ Adapted from https://github.com/numba/numba/issues/5755#issuecomment-646587651 """ + from copy import copy from typing import Iterator, List diff --git a/pyproject.toml b/pyproject.toml index b4edbf1ef..621a67032 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,13 +35,14 @@ dependencies = [ "dspeed>=1.3", "h5py>=3.2", "iminuit", - "legend-daq2lh5>=1.2", - "legend-pydataobj>=1.5", + "legend-daq2lh5>=1.2.1", + "legend-pydataobj>=1.6", "matplotlib", "numba!=0.53.*,!=0.54.*,!=0.57", "numpy>=1.21", "pandas>=1.4.4", "pint", + "pyyaml", "scikit-learn", "scipy>=1.0.1", "tables", @@ -115,7 +116,7 @@ report.exclude_also = [ extend-ignore = "E203, E501" [tool.codespell] -ignore-words-list = "hist, gaus, nd, ans, crate, nin, puls, spms, fom" +ignore-words-list = "hist, gaus, nd, ans, crate, nin, puls, spms, fom, FOM" [tool.pytest.ini_options] minversion = "6.0" diff --git a/src/pygama/cli.py b/src/pygama/cli.py index fb05ef658..20ad5c76d 100644 --- a/src/pygama/cli.py +++ b/src/pygama/cli.py @@ -1,6 +1,7 @@ """ pygama's command line interface utilities. """ + import argparse import logging import os diff --git a/src/pygama/evt/aggregators.py b/src/pygama/evt/aggregators.py index dbcae2829..c9adee29b 100644 --- a/src/pygama/evt/aggregators.py +++ b/src/pygama/evt/aggregators.py @@ -6,135 +6,113 @@ import awkward as ak import numpy as np -from lgdo import Array, ArrayOfEqualSizedArrays, VectorOfVectors, lh5 +from lgdo import lh5, types from lgdo.lh5 import LH5Store -from numpy.typing import NDArray from . import utils def evaluate_to_first_or_last( - cumulength: NDArray, - idx: NDArray, - ids: NDArray, - f_hit: str, - f_dsp: str, - chns: list, - chns_rm: list, - expr: str, - exprl: list, - qry: str | NDArray, - nrows: int, - sorter: tuple, - var_ph: dict = None, - defv: bool | int | float = np.nan, + datainfo, + tcm, + channels, + channels_skip, + expr, + field_list, + query, + n_rows, + sorter, + pars_dict=None, + default_value=np.nan, is_first: bool = True, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> Array: +) -> types.Array: """Aggregates across channels by returning the expression of the channel with value of `sorter`. Parameters ---------- - idx - `tcm` index array. - ids - `tcm` id array. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - chns + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + tcm + TCM data arrays in an object that can be accessed by attribute. + channels list of channels to be aggregated. 
- chns_rm + channels_skip list of channels to be skipped from evaluation and set to default value. expr expression string to be evaluated. - exprl + field_list list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. - qry + query query expression to mask aggregation. - nrows + n_rows length of output array. sorter tuple of field in `hit/dsp/evt` tier to evaluate ``(tier, field)``. - var_ph + pars_dict dictionary of `evt` and additional parameters and their values. - defv + default_value default value. is_first defines if sorted by smallest or largest value of `sorter` - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - evt_group - LH5 root group in `evt` file. """ + f = utils.make_files_config(datainfo) - # define dimension of output array - out = np.full(nrows, defv, dtype=type(defv)) - outt = np.zeros(len(out)) + out = None + outt = None + store = LH5Store(keep_open=True) - store = LH5Store() + for ch in channels: + table_id = utils.get_tcm_id_by_pattern(f.hit.table_fmt, ch) - for ch in chns: # get index list for this channel to be loaded - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] - evt_ids_ch = np.searchsorted( - cumulength, - np.where(ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0], - "right", - ) + idx_ch = tcm.idx[tcm.id == table_id] # evaluate at channel - res = utils.get_data_at_channel( - ch=ch, - ids=ids, - idx=idx, - expr=expr, - exprl=exprl, - var_ph=var_ph, - is_evaluated=ch not in chns_rm, - f_hit=f_hit, - f_dsp=f_dsp, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - ) + if ch not in channels_skip: + res = utils.get_data_at_channel( + datainfo=datainfo, + ch=ch, + tcm=tcm, + expr=expr, + field_list=field_list, + pars_dict=pars_dict, + ) + + if out is None: + # define dimension of output array + out = utils.make_numpy_full(n_rows, default_value, res.dtype) + outt = np.zeros(len(out)) + else: + res = np.full(len(idx_ch), default_value) # get mask from query limarr = utils.get_mask_from_query( - qry=qry, + datainfo=datainfo, + query=query, length=len(res), ch=ch, idx_ch=idx_ch, - f_hit=f_hit, - f_dsp=f_dsp, - hit_group=hit_group, - dsp_group=dsp_group, ) # find if sorter is in hit or dsp t0 = store.read( f"{ch}/{sorter[0]}/{sorter[1]}", - f_hit if f"{hit_group}" == sorter[0] else f_dsp, + f.hit.file if f"{f.hit.group}" == sorter[0] else f.dsp.file, idx=idx_ch, )[0].view_as("np") if t0.ndim > 1: raise ValueError(f"sorter '{sorter[0]}/{sorter[1]}' must be a 1D array") + evt_ids_ch = np.searchsorted( + tcm.cumulative_length, + np.where(tcm.id == table_id)[0], + "right", + ) + if is_first: - if ch == chns[0]: + if ch == channels[0]: outt[:] = np.inf out[evt_ids_ch] = np.where( @@ -152,292 +130,236 @@ def evaluate_to_first_or_last( (t0 > outt[evt_ids_ch]) & (limarr), t0, outt[evt_ids_ch] ) - return Array(nda=out, dtype=type(defv)) + return types.Array(nda=out) def evaluate_to_scalar( - mode: str, - cumulength: NDArray, - idx: NDArray, - ids: NDArray, - f_hit: str, - f_dsp: str, - chns: list, - chns_rm: list, - expr: str, - exprl: list, - qry: str | NDArray, - nrows: int, - var_ph: dict = None, - defv: bool | int | float = np.nan, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> Array: + 
datainfo, + tcm, + mode, + channels, + channels_skip, + expr, + field_list, + query, + n_rows, + pars_dict=None, + default_value=np.nan, +) -> types.Array: """Aggregates by summation across channels. Parameters ---------- + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + tcm + TCM data arrays in an object that can be accessed by attribute. mode aggregation mode. - idx - `tcm` index array. - ids - `tcm` id array. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - chns + channels list of channels to be aggregated. - chns_rm + channels_skip list of channels to be skipped from evaluation and set to default value. expr expression string to be evaluated. - exprl + field_list list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. - qry + query query expression to mask aggregation. - nrows + n_rows length of output array - var_ph + pars_dict dictionary of `evt` and additional parameters and their values. - defv + default_value default value. - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - evt_group - LH5 root group in `evt` file. """ + f = utils.make_files_config(datainfo) + out = None - # define dimension of output array - out = np.full(nrows, defv, dtype=type(defv)) + for ch in channels: + table_id = utils.get_tcm_id_by_pattern(f.hit.table_fmt, ch) - for ch in chns: # get index list for this channel to be loaded - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] - evt_ids_ch = np.searchsorted( - cumulength, - np.where(ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0], - "right", - ) + idx_ch = tcm.idx[tcm.id == table_id] + + if ch not in channels_skip: + res = utils.get_data_at_channel( + datainfo=datainfo, + ch=ch, + tcm=tcm, + expr=expr, + field_list=field_list, + pars_dict=pars_dict, + ) - res = utils.get_data_at_channel( - ch=ch, - ids=ids, - idx=idx, - expr=expr, - exprl=exprl, - var_ph=var_ph, - is_evaluated=ch not in chns_rm, - f_hit=f_hit, - f_dsp=f_dsp, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - ) + if out is None: + # define dimension of output array + out = utils.make_numpy_full(n_rows, default_value, res.dtype) + else: + res = np.full(len(idx_ch), default_value) # get mask from query limarr = utils.get_mask_from_query( - qry=qry, + datainfo=datainfo, + query=query, length=len(res), ch=ch, idx_ch=idx_ch, - f_hit=f_hit, - f_dsp=f_dsp, - hit_group=hit_group, - dsp_group=dsp_group, + ) + + evt_ids_ch = np.searchsorted( + tcm.cumulative_length, + np.where(tcm.id == table_id)[0], + side="right", ) # switch through modes if "sum" == mode: if res.dtype == bool: res = res.astype(int) + out[evt_ids_ch] = np.where(limarr, res + out[evt_ids_ch], out[evt_ids_ch]) + if "any" == mode: if res.dtype != bool: res = res.astype(bool) + out[evt_ids_ch] = out[evt_ids_ch] | (res & limarr) + if "all" == mode: if res.dtype != bool: res = res.astype(bool) + out[evt_ids_ch] = out[evt_ids_ch] & res & limarr - return Array(nda=out, dtype=type(defv)) + return types.Array(nda=out) def evaluate_at_channel( - cumulength: NDArray, - idx: NDArray, - ids: NDArray, - f_hit: str, - f_dsp: str, - chns_rm: list, - expr: str, - exprl: list, - ch_comp: Array, - var_ph: dict = None, - defv: bool | int | float = np.nan, - tcm_id_table_pattern: str = 
"ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> Array: + datainfo, + tcm, + channels_skip, + expr, + field_list, + ch_comp, + pars_dict=None, + default_value=np.nan, +) -> types.Array: """Aggregates by evaluating the expression at a given channel. Parameters ---------- - idx - `tcm` index array. - ids - `tcm` id array. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - chns_rm + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + tcm + TCM data arrays in an object that can be accessed by attribute. + channels_skip list of channels to be skipped from evaluation and set to default value. expr expression string to be evaluated. - exprl + field_list list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. ch_comp array of rawids at which the expression is evaluated. - var_ph + pars_dict dictionary of `evt` and additional parameters and their values. - defv + default_value default value. - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - evt_group - LH5 root group in `evt` file. """ + f = utils.make_files_config(datainfo) + table_id_fmt = f.hit.table_fmt - out = np.full(len(ch_comp.nda), defv, dtype=type(defv)) + out = None for ch in np.unique(ch_comp.nda.astype(int)): + table_name = utils.get_table_name_by_pattern(table_id_fmt, ch) # skip default value - if utils.get_table_name_by_pattern(tcm_id_table_pattern, ch) not in lh5.ls( - f_hit - ): + if table_name not in lh5.ls(f.hit.file): continue - idx_ch = idx[ids == ch] - evt_ids_ch = np.searchsorted(cumulength, np.where(ids == ch)[0], "right") - res = utils.get_data_at_channel( - ch=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch), - ids=ids, - idx=idx, - expr=expr, - exprl=exprl, - var_ph=var_ph, - is_evaluated=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch) - not in chns_rm, - f_hit=f_hit, - f_dsp=f_dsp, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, + + idx_ch = tcm.idx[tcm.id == ch] + evt_ids_ch = np.searchsorted( + tcm.cumulative_length, np.where(tcm.id == ch)[0], "right" ) + if table_name not in channels_skip: + res = utils.get_data_at_channel( + datainfo=datainfo, + ch=table_name, + tcm=tcm, + expr=expr, + field_list=field_list, + pars_dict=pars_dict, + ) + else: + res = np.full(len(idx_ch), default_value) + + if out is None: + out = utils.make_numpy_full(len(ch_comp.nda), default_value, res.dtype) out[evt_ids_ch] = np.where(ch == ch_comp.nda[idx_ch], res, out[evt_ids_ch]) - return Array(nda=out, dtype=type(defv)) + return types.Array(nda=out) def evaluate_at_channel_vov( - cumulength: NDArray, - idx: NDArray, - ids: NDArray, - f_hit: str, - f_dsp: str, - expr: str, - exprl: list, - ch_comp: VectorOfVectors, - chns_rm: list, - var_ph: dict = None, - defv: bool | int | float = np.nan, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> VectorOfVectors: + datainfo, + tcm, + expr, + field_list, + ch_comp, + channels_skip, + pars_dict=None, + default_value=np.nan, +) -> types.VectorOfVectors: """Same as :func:`evaluate_at_channel` but evaluates expression at non flat channels :class:`.VectorOfVectors`. Parameters ---------- - idx - `tcm` index array. - ids - `tcm` id array. 
- f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + tcm + TCM data arrays in an object that can be accessed by attribute. expr expression string to be evaluated. - exprl + field_list list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. ch_comp array of "rawid"s at which the expression is evaluated. - chns_rm + channels_skip list of channels to be skipped from evaluation and set to default value. - var_ph + pars_dict dictionary of `evt` and additional parameters and their values. - defv + default_value default value. - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - evt_group - LH5 root group in `evt` file. """ + f = utils.make_files_config(datainfo) # blow up vov to aoesa out = ak.Array([[] for _ in range(len(ch_comp))]) - chns = np.unique(ch_comp.flattened_data.nda).astype(int) + channels = np.unique(ch_comp.flattened_data.nda).astype(int) ch_comp = ch_comp.view_as("ak") type_name = None - for ch in chns: - evt_ids_ch = np.searchsorted(cumulength, np.where(ids == ch)[0], "right") - res = utils.get_data_at_channel( - ch=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch), - ids=ids, - idx=idx, - expr=expr, - exprl=exprl, - var_ph=var_ph, - is_evaluated=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch) - not in chns_rm, - f_hit=f_hit, - f_dsp=f_dsp, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, + for ch in channels: + table_name = utils.get_table_name_by_pattern(f.hit.table_fmt, ch) + + evt_ids_ch = np.searchsorted( + tcm.cumulative_length, np.where(tcm.id == ch)[0], "right" ) + if table_name not in channels_skip: + res = utils.get_data_at_channel( + datainfo=datainfo, + ch=table_name, + tcm=tcm, + expr=expr, + field_list=field_list, + pars_dict=pars_dict, + ) + else: + idx_ch = tcm.idx[tcm.id == ch] + res = np.full(len(idx_ch), default_value) # see in which events the current channel is present mask = ak.to_numpy(ak.any(ch_comp == ch, axis=-1), allow_missing=False) @@ -448,231 +370,181 @@ def evaluate_at_channel_vov( out = ak.concatenate((out, cv), axis=-1) - if ch == chns[0]: + if ch == channels[0]: type_name = res.dtype - return VectorOfVectors(ak.values_astype(out, type_name), dtype=type_name) + return types.VectorOfVectors(ak.values_astype(out, type_name)) def evaluate_to_aoesa( - cumulength: NDArray, - idx: NDArray, - ids: NDArray, - f_hit: str, - f_dsp: str, - chns: list, - chns_rm: list, - expr: str, - exprl: list, - qry: str | NDArray, - nrows: int, - var_ph: dict = None, - defv: bool | int | float = np.nan, - missv=np.nan, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> ArrayOfEqualSizedArrays: + datainfo, + tcm, + channels, + channels_skip, + expr, + field_list, + query, + n_rows, + pars_dict=None, + default_value=np.nan, + missing_value=np.nan, +) -> types.ArrayOfEqualSizedArrays: """Aggregates by returning an :class:`.ArrayOfEqualSizedArrays` of evaluated expressions of channels that fulfill a query expression. Parameters ---------- - idx - `tcm` index array. - ids - `tcm` id array. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. 
- chns + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + tcm + TCM data arrays in an object that can be accessed by attribute. + channels list of channels to be aggregated. - chns_rm + channels_skip list of channels to be skipped from evaluation and set to default value. expr expression string to be evaluated. - exprl + field_list list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. - qry + query query expression to mask aggregation. - nrows + n_rows length of output :class:`.VectorOfVectors`. ch_comp array of "rawid"s at which the expression is evaluated. - var_ph + pars_dict dictionary of `evt` and additional parameters and their values. - defv + default_value default value. - missv + missing_value missing value. sorter sorts the entries in the vector according to sorter expression. - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - evt_group - LH5 root group in `evt` file. """ + f = utils.make_files_config(datainfo) + # define dimension of output array - out = np.full((nrows, len(chns)), missv) + dtype = None + out = None + + for i, ch in enumerate(channels): + table_id = utils.get_tcm_id_by_pattern(f.hit.table_fmt, ch) + idx_ch = tcm.idx[tcm.id == table_id] - i = 0 - for ch in chns: - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] evt_ids_ch = np.searchsorted( - cumulength, - np.where(ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0], + tcm.cumulative_length, + np.where(tcm.id == table_id)[0], "right", ) - res = utils.get_data_at_channel( - ch=ch, - ids=ids, - idx=idx, - expr=expr, - exprl=exprl, - var_ph=var_ph, - is_evaluated=ch not in chns_rm, - f_hit=f_hit, - f_dsp=f_dsp, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - ) + + if ch not in channels_skip: + res = utils.get_data_at_channel( + datainfo=datainfo, + ch=ch, + tcm=tcm, + expr=expr, + field_list=field_list, + pars_dict=pars_dict, + ) + + if dtype is None: + dtype = res.dtype + + if out is None: + out = utils.make_numpy_full( + (n_rows, len(channels)), missing_value, res.dtype + ) + else: + res = np.full(len(idx_ch), default_value) # get mask from query limarr = utils.get_mask_from_query( - qry=qry, + datainfo=datainfo, + query=query, length=len(res), ch=ch, idx_ch=idx_ch, - f_hit=f_hit, - f_dsp=f_dsp, - hit_group=hit_group, - dsp_group=dsp_group, ) out[evt_ids_ch, i] = np.where(limarr, res, out[evt_ids_ch, i]) - i += 1 - - return ArrayOfEqualSizedArrays(nda=out) + return out, dtype def evaluate_to_vector( - cumulength: NDArray, - idx: NDArray, - ids: NDArray, - f_hit: str, - f_dsp: str, - chns: list, - chns_rm: list, - expr: str, - exprl: list, - qry: str | NDArray, - nrows: int, - var_ph: dict = None, - defv: bool | int | float = np.nan, - sorter: str = None, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> VectorOfVectors: + datainfo, + tcm, + channels, + channels_skip, + expr, + field_list, + query, + n_rows, + pars_dict=None, + default_value=np.nan, + sorter=None, +) -> types.VectorOfVectors: """Aggregates by returning a :class:`.VectorOfVector` of evaluated expressions of channels that fulfill a query expression. Parameters ---------- - idx - `tcm` index array. - ids - `tcm` id array. 
- f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - chns + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + tcm + TCM data arrays in an object that can be accessed by attribute. + channels list of channels to be aggregated. - chns_rm + channels_skip list of channels to be skipped from evaluation and set to default value. expr expression string to be evaluated. - exprl + field_list list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. - qry + query query expression to mask aggregation. - nrows + n_rows length of output :class:`.VectorOfVectors`. ch_comp array of "rawids" at which the expression is evaluated. - var_ph + pars_dict dictionary of `evt` and additional parameters and their values. - defv + default_value default value. sorter sorts the entries in the vector according to sorter expression. ``ascend_by:`` results in an vector ordered ascending, ``decend_by:`` sorts descending. - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - evt_group - LH5 root group in `evt` file. """ - out = evaluate_to_aoesa( - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, - chns=chns, - chns_rm=chns_rm, + out, dtype = evaluate_to_aoesa( + datainfo=datainfo, + tcm=tcm, + channels=channels, + channels_skip=channels_skip, expr=expr, - exprl=exprl, - qry=qry, - nrows=nrows, - var_ph=var_ph, - defv=defv, - missv=np.nan, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - ).view_as("np") + field_list=field_list, + query=query, + n_rows=n_rows, + pars_dict=pars_dict, + default_value=default_value, + missing_value=np.nan, + ) # if a sorter is given sort accordingly if sorter is not None: md, fld = sorter.split(":") - s_val = evaluate_to_aoesa( - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, - chns=chns, - chns_rm=chns_rm, + s_val, _ = evaluate_to_aoesa( + datainfo=datainfo, + tcm=tcm, + channels=channels, + channels_skip=channels_skip, expr=fld, - exprl=[tuple(fld.split("."))], - qry=None, - nrows=nrows, - missv=np.nan, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - ).view_as("np") + field_list=[tuple(fld.split("."))], + query=None, + n_rows=n_rows, + missing_value=np.nan, + ) + if "ascend_by" == md: out = out[np.arange(len(out))[:, None], np.argsort(s_val)] @@ -683,7 +555,6 @@ def evaluate_to_vector( "sorter values can only have 'ascend_by' or 'descend_by' prefixes" ) - return VectorOfVectors( - ak.values_astype(ak.drop_none(ak.nan_to_none(ak.Array(out))), type(defv)), - dtype=type(defv), + return types.VectorOfVectors( + ak.values_astype(ak.drop_none(ak.nan_to_none(ak.Array(out))), dtype) ) diff --git a/src/pygama/evt/build_evt.py b/src/pygama/evt/build_evt.py index 5f7949bdb..3620dd373 100644 --- a/src/pygama/evt/build_evt.py +++ b/src/pygama/evt/build_evt.py @@ -4,59 +4,67 @@ from __future__ import annotations +import importlib import itertools -import json import logging import re -from importlib import import_module +from collections.abc import Mapping, Sequence +from typing import Any import awkward as ak import numpy as np from lgdo import Array, ArrayOfEqualSizedArrays, Table, VectorOfVectors, lh5 -from lgdo.lh5 import LH5Store +from ..utils import load_dict from . 
import aggregators, utils log = logging.getLogger(__name__) def build_evt( - f_tcm: str, - f_dsp: str, - f_hit: str, - evt_config: str | dict, - f_evt: str | None = None, + datainfo: utils.DataInfo | Mapping[str, Sequence[str, ...]], + config: str | Mapping[str, ...], wo_mode: str = "write_safe", - evt_group: str = "evt", - tcm_group: str = "hardware_tcm_1", - dsp_group: str = "dsp", - hit_group: str = "hit", - tcm_id_table_pattern: str = "ch{}", ) -> None | Table: - """Transform data from the `hit` and `dsp` levels which a channel sorted to a - event sorted data format. + r"""Transform data from hit-structured tiers to event-structured data. Parameters ---------- - f_tcm - input LH5 file of the `tcm` level. - f_dsp - input LH5 file of the `dsp` level. - f_hit - input LH5 file of the `hit` level. - evt_config + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found, + (see :obj:`.utils.DataInfo`). Example: :: + + # syntax: {"tier-name": ("file-name", "hdf5-group"[, "table-format"])} + { + "tcm": ("data-tier_tcm.lh5", "hardware_tcm_1"), + "dsp": ("data-tier_dsp.lh5", "dsp", "ch{}"), + "hit": ("data-tier_hit.lh5", "hit", "ch{}"), + "evt": ("data-tier_evt.lh5", "evt") + } + + config name of configuration file or dictionary defining event fields. Channel lists can be defined by importing a metadata module. - - ``operations`` defines the fields ``name=key``, where ``channels`` - specifies the channels used to for this field (either a string or a - list of strings), + - ``channels`` specifies the channels used for this field (either a + string or a list of strings). + - ``operations`` defines the event fields (``name=key``). If the key + contains slashes it will be interpreted as the path to the output + field inside nested sub-tables. + - ``outputs`` defines the fields that are actually included in the + output table. + + Inside the ``operations`` block: + - ``aggregation_mode`` defines how the channels should be combined + (see :func:`evaluate_expression`). - ``expression`` defnies the mathematical/special function to apply + - ``expression`` defines the expression or function call to apply (see :func:`evaluate_expression`), - ``query`` defines an expression to mask the aggregation. - ``parameters`` defines any other parameter used in expression. + - ``dtype`` defines the NumPy data type of the resulting data. + - ``initial`` defines the initial/default value. Useful with some types + of aggregators. 
For example: @@ -68,6 +76,7 @@ def build_evt( "spms_on": ["ch1057600", "ch1059201", "ch1062405"], "muon": "ch1027202", }, + "outputs": ["energy_id", "multiplicity"], "operations": { "energy_id":{ "channels": "geds_on", @@ -83,90 +92,69 @@ def build_evt( "is_muon_rejected":{ "channels": "muon", "aggregation_mode": "any", - "expression": "dsp.wf_max>a", - "parameters": {"a":15100}, + "expression": "dsp.wf_max > a", + "parameters": {"a": 15100}, "initial": false }, "multiplicity":{ "channels": ["geds_on", "geds_no_psd", "geds_ac"], "aggregation_mode": "sum", "expression": "hit.cuspEmax_ctc_cal > a", - "parameters": {"a":25}, + "parameters": {"a": 25}, "initial": 0 }, "t0":{ "aggregation_mode": "keep_at_ch:evt.energy_id", - "expression": "dsp.tp_0_est" + "expression": "dsp.tp_0_est", + "initial": "np.nan" }, "lar_energy":{ "channels": "spms_on", "aggregation_mode": "function", - "expression": ".modules.spm.get_energy(0.5, evt.t0, 48000, 1000, 5000)" + "expression": "pygama.evt.modules.spms.gather_pulse_data(<...>, observable='hit.energy_in_pe')" }, } } - f_evt - name of the output file. If ``None``, return the output :class:`.Table` - instead of writing to disk. wo_mode - writing mode. - evt group - LH5 root group name of `evt` tier. - tcm_group - LH5 root group in `tcm` file. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. - tcm_id_table_pattern - pattern to format `tcm` id values to table name in higher tiers. Must - have one placeholder which is the `tcm` id. + writing mode, see :func:`lgdo.lh5.core.write`. """ + if not isinstance(config, dict): + config = load_dict(config) - store = LH5Store() - tbl_cfg = evt_config - if not isinstance(tbl_cfg, (str, dict)): - raise TypeError() - if isinstance(tbl_cfg, str): - with open(tbl_cfg) as f: - tbl_cfg = json.load(f) - - if "channels" not in tbl_cfg.keys(): + if "channels" not in config.keys(): raise ValueError("channel field needs to be specified in the config") - if "operations" not in tbl_cfg.keys(): + if "operations" not in config.keys(): raise ValueError("operations field needs to be specified in the config") - # check tcm_id_table_pattern validity - pattern_check = re.findall(r"{([^}]*?)}", tcm_id_table_pattern) + # convert into a nice named tuple + f = utils.make_files_config(datainfo) + + # check chname_fmt validity + chname_fmt = f.hit.table_fmt + pattern_check = re.findall(r"{([^}]*?)}", chname_fmt) if len(pattern_check) != 1: - raise ValueError( - f"tcm_id_table_pattern must have exactly one placeholder. {tcm_id_table_pattern} is invalid." - ) + raise ValueError("chname_fmt must have exactly one placeholder {}") elif "{" in pattern_check[0] or "}" in pattern_check[0]: - raise ValueError( - f"tcm_id_table_pattern {tcm_id_table_pattern} has an invalid placeholder." - ) + raise ValueError(f"{chname_fmt=} has an invalid placeholder.") if ( utils.get_table_name_by_pattern( - tcm_id_table_pattern, - utils.get_tcm_id_by_pattern(tcm_id_table_pattern, lh5.ls(f_hit)[0]), + chname_fmt, + utils.get_tcm_id_by_pattern(chname_fmt, lh5.ls(f.hit.file)[0]), ) - != lh5.ls(f_hit)[0] + != lh5.ls(f.hit.file)[0] ): - raise ValueError( - f"tcm_id_table_pattern {tcm_id_table_pattern} does not match keys in data!" 
- ) + raise ValueError(f"chname_fmt {chname_fmt} does not match keys in data!") # create channel list according to config # This can be either read from the meta data # or a list of channel names - log.debug("Creating channel dictionary") + log.debug("creating channel dictionary") - chns = {} + channels = {} - for k, v in tbl_cfg["channels"].items(): + for key, v in config["channels"].items(): if isinstance(v, dict): # it is a meta module. module_name must exist if "module" not in v.keys(): @@ -175,10 +163,9 @@ def build_evt( ) attr = {} - # the time_key argument is set to the time key of the DSP file - # in case it is not provided by the config + # the time_key argument is mandatory if "time_key" not in v.keys(): - attr["time_key"] = re.search(r"\d{8}T\d{6}Z", f_dsp).group(0) + raise RuntimeError("the 'time_key' configuration field is mandatory") # if "None" do None elif "None" == v["time_key"]: @@ -186,160 +173,179 @@ def build_evt( # load module p, m = v["module"].rsplit(".", 1) - met = getattr(import_module(p, package=__package__), m) - chns[k] = met(v | attr) + met = getattr(importlib.import_module(p, package=__package__), m) + channels[key] = met(v | attr) elif isinstance(v, str): - chns[k] = [v] + channels[key] = [v] elif isinstance(v, list): - chns[k] = [e for e in v] - - nrows = store.read_n_rows(f"/{tcm_group}/cumulative_length", f_tcm) + channels[key] = [e for e in v] + + # load tcm data from disk + tcm = utils.TCMData( + id=lh5.read_as(f"/{f.tcm.group}/array_id", f.tcm.file, library="np"), + idx=lh5.read_as(f"/{f.tcm.group}/array_idx", f.tcm.file, library="np"), + cumulative_length=lh5.read_as( + f"/{f.tcm.group}/cumulative_length", f.tcm.file, library="np" + ), + ) - table = Table(size=nrows) + # get number of events in file (ask the TCM) + n_rows = len(tcm.cumulative_length) + table = Table(size=n_rows) - for k, v in tbl_cfg["operations"].items(): - log.debug("Processing field " + k) + # now loop over operations (columns in evt table) + for field, v in config["operations"].items(): + log.debug(f"processing field: '{field}'") - # if mode not defined in operation, it can only be an operation on the evt level. + # if mode not defined in operation, it can only be an operation on the + # evt level if "aggregation_mode" not in v.keys(): - var = {} - if "parameters" in v.keys(): - var = var | v["parameters"] - res = table.eval(v["expression"].replace(f"{evt_group}.", ""), var) + # compute and eventually get rid of evt. 
suffix + obj = table.eval( + v["expression"].replace("evt.", ""), v.get("parameters", {}) + ) - # add attribute if present + # add attributes if present if "lgdo_attrs" in v.keys(): - res.attrs |= v["lgdo_attrs"] - - table.add_field(k, res) + obj.attrs |= v["lgdo_attrs"] - # Else we build the event entry + # else we build the event entry else: if "channels" not in v.keys(): - chns_e = [] + channels_e = [] elif isinstance(v["channels"], str): - chns_e = chns[v["channels"]] + channels_e = channels[v["channels"]] elif isinstance(v["channels"], list): - chns_e = list( - itertools.chain.from_iterable([chns[e] for e in v["channels"]]) + channels_e = list( + itertools.chain.from_iterable([channels[e] for e in v["channels"]]) ) - chns_rm = [] + channels_skip = [] if "exclude_channels" in v.keys(): if isinstance(v["exclude_channels"], str): - chns_rm = chns[v["exclude_channels"]] + channels_skip = channels[v["exclude_channels"]] elif isinstance(v["exclude_channels"], list): - chns_rm = list( + channels_skip = list( itertools.chain.from_iterable( - [chns[e] for e in v["exclude_channels"]] + [channels[e] for e in v["exclude_channels"]] ) ) - pars, qry, defaultv, srter = None, None, np.nan, None - if "parameters" in v.keys(): - pars = v["parameters"] - if "query" in v.keys(): - qry = v["query"] - if "initial" in v.keys(): - defaultv = v["initial"] - if isinstance(defaultv, str) and ( - defaultv in ["np.nan", "np.inf", "-np.inf"] - ): - defaultv = eval(defaultv) - if "sort" in v.keys(): - srter = v["sort"] + defaultv = v.get("initial", np.nan) + if isinstance(defaultv, str) and ( + defaultv in ["np.nan", "np.inf", "-np.inf"] + ): + defaultv = eval(defaultv) obj = evaluate_expression( - f_tcm=f_tcm, - f_hit=f_hit, - f_dsp=f_dsp, - chns=chns_e, - chns_rm=chns_rm, + datainfo, + tcm, + channels=channels_e, + channels_skip=channels_skip, mode=v["aggregation_mode"], expr=v["expression"], - nrows=nrows, + n_rows=n_rows, table=table, - para=pars, - qry=qry, - defv=defaultv, - sorter=srter, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - tcm_group=tcm_group, + parameters=v.get("parameters", None), + query=v.get("query", None), + default_value=defaultv, + sorter=v.get("sort", None), ) # add attribute if present if "lgdo_attrs" in v.keys(): obj.attrs |= v["lgdo_attrs"] - table.add_field(k, obj) - - # write output fields into f_evt - if "outputs" in tbl_cfg.keys(): - if len(tbl_cfg["outputs"]) < 1: - log.warning("No output fields specified, no file will be written.") - return table - else: - clms_to_remove = [e for e in table.keys() if e not in tbl_cfg["outputs"]] - for fld in clms_to_remove: - table.remove_field(fld, True) - - if f_evt: - store.write( - obj=table, name=f"/{evt_group}/", lh5_file=f_evt, wo_mode=wo_mode - ) - else: - return table + # cast to type, if required + # hijack the poor LGDO + if "dtype" in v: + type_ = v["dtype"] + + if isinstance(obj, Array): + obj.nda = obj.nda.astype(type_) + if isinstance(obj, VectorOfVectors): + fldata_ptr = obj.flattened_data + while isinstance(fldata_ptr, VectorOfVectors): + fldata_ptr = fldata_ptr.flattened_data + + fldata_ptr.nda = fldata_ptr.nda.astype(type_) + + log.debug(f"new column {field!s} = {obj!r}") + table.add_field(field, obj) + + # might need to re-organize fields in subtables, create a new object for that + nested_tbl = Table(size=n_rows) + output_fields = config.get("outputs", table.keys()) + + for field, obj in table.items(): + # also only add fields requested by the user + if 
field not in output_fields: + continue + + # if names contain slahes, put in sub-tables + lvl_ptr = nested_tbl + subfields = field.strip("/").split("___") + for level in subfields: + # if we are at the end, just add the field + if level == subfields[-1]: + lvl_ptr.add_field(level, obj) + break + + if not level: + msg = f"invalid field name '{field}'" + raise RuntimeError(msg) + + # otherwise, increase nesting + if level not in lvl_ptr: + lvl_ptr.add_field(level, Table(size=n_rows)) + lvl_ptr = lvl_ptr[level] + + # write output fields into outfile + if output_fields: + if f.evt.file is None: + return nested_tbl + + lh5.write( + obj=nested_tbl, + name=f.evt.group, + lh5_file=f.evt.file, + wo_mode=wo_mode, + ) else: - log.warning("No output fields specified, no file will be written.") - - key = re.search(r"\d{8}T\d{6}Z", f_hit).group(0) - log.info( - f"Applied {len(tbl_cfg['operations'])} operations to key {key} and saved " - f"{len(tbl_cfg['outputs'])} evt fields across {len(chns)} channel groups" - ) + log.warning("no output fields specified, no file will be written.") + return nested_tbl def evaluate_expression( - f_tcm: str, - f_hit: str, - f_dsp: str, - chns: list, - chns_rm: list, + datainfo: utils.DataInfo | Mapping[str, Sequence[str, ...]], + tcm: utils.TCMData, + channels: Sequence[str], + channels_skip: Sequence[list], mode: str, expr: str, - nrows: int, + n_rows: int, table: Table = None, - para: dict = None, - qry: str = None, - defv: bool | int | float = np.nan, + parameters: Mapping[str, Any] = None, + query: str = None, + default_value: bool | int | float = np.nan, sorter: str = None, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", - tcm_group: str = "tcm", ) -> Array | ArrayOfEqualSizedArrays | VectorOfVectors: """Evaluates the expression defined by the user across all channels according to the mode. Parameters ---------- - f_tcm - path to `tcm` tier file. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - chns - list of channel names across which expression gets evaluated (form: - ``ch``). - chns_rm + datainfo + input and output LH5 files with HDF5 groups where tables are found. + (see :obj:`.utils.DataInfo`) + tcm + tcm data structure (see :obj:`.utils.TCMData`) + channels + list of channel names across which expression gets evaluated + channels_skip list of channels which get set to default value during evaluation. In - function mode they are removed entirely (form: ``ch``) + function mode they are removed entirely mode The mode determines how the event entry is calculated across channels. Options are: @@ -354,118 +360,131 @@ def evaluate_expression( - ``keep_at_ch:ch_field``: aggregates according to passed ch_field. - ``keep_at_idx:tcm_idx_field``: aggregates according to passed tcm index field. - - ``gather``: Channels are not combined, but result saved as + - ``gather``: channels are not combined, but result saved as :class:`.VectorOfVectors`. + - ``function``: the function call specified in `expr` is evaluated, and + the resulting column is inserted into the output table. - qry + query a query that can mask the aggregation. expr the expression. That can be any mathematical equation/comparison. If `mode` is ``function``, the expression needs to be a special processing - function defined in modules (e.g. :func:`.modules.spm.get_energy`). In - the expression parameters from either hit, dsp, evt tier (from - operations performed before this one! 
Dictionary operations order - matters), or from the ``parameters`` field can be used. - nrows + function defined in :mod:`.modules`. In the expression, parameters from + either `evt` or lower tiers (from operations performed before this one! + Dictionary operations order matters), or from the ``parameters`` field + can be used. Fields can be prefixed with the tier id (e.g. + ``evt.energy`` or `hit.quality_flag``). + n_rows number of rows to be processed. table table of `evt` tier data. - para + parameters dictionary of parameters defined in the ``parameters`` field in the configuration dictionary. - defv + default_value default value of evaluation. sorter can be used to sort vector outputs according to sorter expression (see - :func:`evaluate_to_vector`). - tcm_id_table_pattern - pattern to format tcm id values to table name in higher tiers. Must have one - placeholder which is the `tcm` id. - evt group - LH5 root group name of `evt` tier. - tcm_group - LH5 root group in `tcm` file. - dsp_group - LH5 root group in `dsp` file. - hit_group - LH5 root group in `hit` file. + :func:`.evaluate_to_vector`). + + Note + ---- + The specification of custom functions that can be used as expression is + documented in :mod:`.modules`. """ + f = utils.make_files_config(datainfo) - store = LH5Store() + # build dictionary of parameter names and their values + # a parameter can be a column in the existing table... + pars_dict = {} - # find parameters in evt file or in parameters - exprl = re.findall( - rf"({evt_group}|{hit_group}|{dsp_group}).([a-zA-Z_$][\w$]*)", expr - ) - var_ph = {} - if table: - var_ph = var_ph | { - e: table[e].view_as("ak") - for e in table.keys() - if isinstance(table[e], (Array, ArrayOfEqualSizedArrays, VectorOfVectors)) + if table is not None: + pars_dict = { + k: v for k, v in table.items() if isinstance(v, (Array, VectorOfVectors)) } - if para: - var_ph = var_ph | para + + # ...or defined through the configuration + if parameters: + pars_dict = pars_dict | parameters if mode == "function": - # evaluate expression - func, params = expr.split("(") - params = ( - params.replace(f"{dsp_group}.", f"{dsp_group}_") - .replace(f"{hit_group}.", f"{hit_group}_") - .replace(f"{evt_group}.", "") + # syntax: + # + # pygama.evt.modules.spms.my_func([...], arg1=val, arg2=val) + + # get arguments list passed to the function (outermost parentheses) + args_str = re.search(r"\((.*)\)$", expr.strip()).group(1) + + # handle tier scoping: evt.<> + args_str = args_str.replace("evt.", "") + + good_chns = [x for x in channels if x not in channels_skip] + + # replace stuff before first comma with list of mandatory args + full_args_str = "datainfo, tcm, table_names," + ",".join( + args_str.split(",")[1:] ) - params = [ - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - [x for x in chns if x not in chns_rm], - ] + [utils.num_and_pars(e, var_ph) for e in params[:-1].split(",")] - - # load function dynamically - p, m = func.rsplit(".", 1) - met = getattr(import_module(p, package=__package__), m) - return met(*params) + + # get module and function names + func_call = expr.strip().split("(")[0] + subpackage, func = func_call.rsplit(".", 1) + package = subpackage.split(".")[0] + + # import function into current namespace + log.debug(f"importing module {subpackage}") + importlib.import_module(subpackage, package=__package__) + + # declare imported package as globals (see eval() call later) + globs = { + package: importlib.import_module(package), + } + + # lookup dictionary 
for variables used in function arguments (see eval() call later) + locs = {"datainfo": f, "tcm": tcm, "table_names": good_chns} | pars_dict + + # evil eval() to avoid annoying args casting logic + call_str = f"{func_call}({full_args_str})" + log.debug(f"evaluating {call_str}") + log.debug(f"...globals={globs} and locals={locs}") + log.debug(f"...locals={locs}") + + return eval(call_str, globs, locs) else: + # find parameters in evt file or in parameters + field_list = re.findall( + rf"({'|'.join(f._asdict().keys())}).([a-zA-Z_$][\w$]*)", expr + ) + # check if query is either on channel basis or evt basis (and not a mix) - qry_mask = qry - if qry is not None: - if f"{evt_group}." in qry and ( - f"{hit_group}." in qry or f"{dsp_group}." in qry - ): + query_mask = query + if query is not None: + hit_tiers = [k for k in f._asdict() if k != "evt"] + if "evt." in query and (any([t in query for t in hit_tiers])): raise ValueError( - f"Query can't be a mix of {evt_group} tier and lower tiers." + f"Query can't be a mix of {f.evt.group} tier and lower tiers." ) # if it is an evt query we can evaluate it directly here - if table and f"{evt_group}." in qry: - qry_mask = eval(qry.replace(f"{evt_group}.", ""), table) - - # load TCM data to define an event - ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") - idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") - cumulength = store.read(f"/{tcm_group}/cumulative_length", f_tcm)[0].view_as( - "np" - ) + if table and "evt." in query: + query_mask = eval(query.replace("evt.", ""), table) # switch through modes - if table and (("keep_at_ch:" == mode[:11]) or ("keep_at_idx:" == mode[:12])): - if "keep_at_ch:" == mode[:11]: - ch_comp = table[mode[11:].replace(f"{evt_group}.", "")] + if table and ( + mode.startswith("keep_at_ch:") or mode.startswith("keep_at_idx:") + ): + if mode.startswith("keep_at_ch:"): + ch_comp = table[mode[11:].replace("evt.", "")] else: - ch_comp = table[mode[12:].replace(f"{evt_group}.", "")] + ch_comp = table[mode[12:].replace("evt.", "")] if isinstance(ch_comp, Array): - ch_comp = Array(nda=ids[ch_comp.view_as("np")]) + ch_comp = Array(tcm.id[ch_comp.view_as("np")]) elif isinstance(ch_comp, VectorOfVectors): ch_comp = ch_comp.view_as("ak") ch_comp = VectorOfVectors( - array=ak.unflatten( - ids[ak.flatten(ch_comp)], ak.count(ch_comp, axis=-1) + ak.unflatten( + tcm.id[ak.flatten(ch_comp)], ak.count(ch_comp, axis=-1) ) ) else: @@ -476,114 +495,82 @@ def evaluate_expression( if isinstance(ch_comp, Array): return aggregators.evaluate_at_channel( - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, - chns_rm=chns_rm, + datainfo=datainfo, + tcm=tcm, + channels_skip=channels_skip, expr=expr, - exprl=exprl, + field_list=field_list, ch_comp=ch_comp, - var_ph=var_ph, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, + pars_dict=pars_dict, + default_value=default_value, ) - elif isinstance(ch_comp, VectorOfVectors): + + if isinstance(ch_comp, VectorOfVectors): return aggregators.evaluate_at_channel_vov( - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, + datainfo=datainfo, + tcm=tcm, expr=expr, - exprl=exprl, + field_list=field_list, ch_comp=ch_comp, - chns_rm=chns_rm, - var_ph=var_ph, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, - ) - else: - raise NotImplementedError( - type(ch_comp) - + " not supported 
(only Array and VectorOfVectors are supported)" + channels_skip=channels_skip, + pars_dict=pars_dict, + default_value=default_value, ) - elif "first_at:" in mode or "last_at:" in mode: + + raise NotImplementedError( + f"{type(ch_comp).__name__} not supported " + "(only Array and VectorOfVectors are supported)" + ) + + if "first_at:" in mode or "last_at:" in mode: sorter = tuple( re.findall( - rf"({evt_group}|{hit_group}|{dsp_group}).([a-zA-Z_$][\w$]*)", + rf"({'|'.join(f._asdict().keys())}).([a-zA-Z_$][\w$]*)", mode.split("first_at:")[-1], )[0] ) return aggregators.evaluate_to_first_or_last( - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, - chns=chns, - chns_rm=chns_rm, + datainfo=datainfo, + tcm=tcm, + channels=channels, + channels_skip=channels_skip, expr=expr, - exprl=exprl, - qry=qry_mask, - nrows=nrows, + field_list=field_list, + query=query_mask, + n_rows=n_rows, sorter=sorter, - var_ph=var_ph, - defv=defv, + pars_dict=pars_dict, + default_value=default_value, is_first=True if "first_at:" in mode else False, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, ) - elif mode in ["sum", "any", "all"]: + + if mode in ["sum", "any", "all"]: return aggregators.evaluate_to_scalar( + datainfo=datainfo, + tcm=tcm, mode=mode, - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, - chns=chns, - chns_rm=chns_rm, + channels=channels, + channels_skip=channels_skip, expr=expr, - exprl=exprl, - qry=qry_mask, - nrows=nrows, - var_ph=var_ph, - defv=defv, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, + field_list=field_list, + query=query_mask, + n_rows=n_rows, + pars_dict=pars_dict, + default_value=default_value, ) - elif "gather" == mode: + if mode == "gather": return aggregators.evaluate_to_vector( - cumulength=cumulength, - idx=idx, - ids=ids, - f_hit=f_hit, - f_dsp=f_dsp, - chns=chns, - chns_rm=chns_rm, + datainfo=datainfo, + tcm=tcm, + channels=channels, + channels_skip=channels_skip, expr=expr, - exprl=exprl, - qry=qry_mask, - nrows=nrows, - var_ph=var_ph, - defv=defv, + field_list=field_list, + query=query_mask, + n_rows=n_rows, + pars_dict=pars_dict, + default_value=default_value, sorter=sorter, - tcm_id_table_pattern=tcm_id_table_pattern, - evt_group=evt_group, - hit_group=hit_group, - dsp_group=dsp_group, ) - else: - raise ValueError(mode + " not a valid mode") + + raise ValueError(f"'{mode}' is not a valid mode") diff --git a/src/pygama/evt/modules/__init__.py b/src/pygama/evt/modules/__init__.py index bd80462f8..a17d33d7a 100644 --- a/src/pygama/evt/modules/__init__.py +++ b/src/pygama/evt/modules/__init__.py @@ -1,21 +1,34 @@ -""" -Contains submodules for evt processing -""" +"""This subpackage provides some custom processors to process hit-structured +data into event-structured data. + +Custom processors must adhere to the following signature: :: + + def my_evt_processor( + datainfo, + tcm, + table_names, + *, # all following arguments are keyword-only + arg1, + arg2, + ... + ) -> LGDO: + # ... -from .spm import ( - get_energy, - get_energy_dplms, - get_etc, - get_majority, - get_majority_dplms, - get_time_shift, -) +The first three arguments are automatically supplied by :func:`.build_evt`, +when the function is called from the :func:`.build_evt` configuration. 
-__all__ = [ - "get_energy", - "get_majority", - "get_energy_dplms", - "get_majority_dplms", - "get_etc", - "get_time_shift", -] +- `datainfo`: a :obj:`.DataInfo` object that specifies tier names, file names, + HDF5 groups in which data is found and pattern used by hit table names to + encode the channel identifier (e.g. ``ch{}``). +- `tcm`: :obj:`.TCMData` object that holds the TCM data, to be used for event + reconstruction. +- `table_names`: a list of hit table names to read the data from. + +The remaining arguments are characteristic to the processor and can be supplied +in the function call from the :func:`.build_evt` configuration. + +The function must return an :class:`~lgdo.types.lgdo.LGDO` object suitable for +insertion in the final table with event data. + +For examples, have a look at the existing processors provided by this subpackage. +""" diff --git a/src/pygama/evt/modules/geds.py b/src/pygama/evt/modules/geds.py new file mode 100644 index 000000000..131cdeea8 --- /dev/null +++ b/src/pygama/evt/modules/geds.py @@ -0,0 +1,73 @@ +"""Event processors for HPGe data.""" + +from __future__ import annotations + +from collections.abc import Sequence + +import numpy as np +from lgdo import lh5, types + +from .. import utils + + +def apply_recovery_cut( + datainfo: utils.DataInfo, + tcm: utils.TCMData, + table_names: Sequence[str], + *, + timestamps: types.Array, + flag: types.Array, + time_window: float, +) -> types.Array: + + discharge_timestamps = timestamps.nda[flag.nda == 1] + is_recovering = np.full(len(timestamps.nda), False) + for tstamp in discharge_timestamps: + is_recovering = is_recovering | np.where( + ( + ((timestamps.nda - tstamp) < time_window) + & ((timestamps.nda - tstamp) > 0) + ), + True, + False, + ) + + # return the result as LGDO + return types.Array(is_recovering) + + +def apply_xtalk_correction( + datainfo: utils.DataInfo, + tcm: utils.TCMData, + table_names: Sequence[str], + *, + energy_observable: types.VectorOfVectors, + rawids: types.VectorOfVectors, + xtalk_matrix_filename: str, +) -> types.VectorOfVectors: + """Applies the cross-talk correction to the energy observable. + + The format of `xtalk_matrix_filename` should be... + + Parameters + ---------- + datainfo, tcm, table_names + positional arguments automatically supplied by :func:`.build_evt`. + energy_observable + array of energy values to correct, one event per row. The detector + identifier is stored in `rawids`, which has the same layout. + rawids + array of detector identifiers for each energy in `energy_observable`. + xtalk_matrix_filename + name of the file containing the cross-talk matrices. + """ + # read in xtalk matrices + lh5.read_as("", xtalk_matrix_filename, "ak") + + # do the correction + energies_corr = ... + + # return the result as LGDO + return types.VectorOfVectors( + energies_corr, attrs=utils.copy_lgdo_attrs(energy_observable) + ) diff --git a/src/pygama/evt/modules/larveto.py b/src/pygama/evt/modules/larveto.py new file mode 100644 index 000000000..429076a84 --- /dev/null +++ b/src/pygama/evt/modules/larveto.py @@ -0,0 +1,160 @@ +"""Routines to evaluate the correlation between HPGe and SiPM signals.""" + +from __future__ import annotations + +import awkward as ak +import numpy as np +import scipy +from numpy.typing import ArrayLike + + +def l200_combined_test_stat( + t0: ak.Array, + amp: ak.Array, + geds_t0: ak.Array, +) -> ak.Array: + """Combined L200 LAr veto classifier. + + Where combined means taking channel-specific parameters into account. 
+ + `t0` and `amp` must be in the format of a 3-dimensional Awkward array, + where the innermost dimension labels the SiPM pulse, the second one labels + the SiPM channel and the outermost one labels the event. + + Parameters + ---------- + t0 + arrival times of pulses in ns, split by channel. + amp + amplitude of pulses in p.e., split by channel. + geds_t0 + t0 (ns) of the HPGe signal. + """ + # flatten the data in the last axis (i.e. merge all channels together) + # TODO: implement channel distinction + t0 = ak.flatten(t0, axis=-1) + amp = ak.flatten(amp, axis=-1) + + # subtract the HPGe t0 from the SiPM pulse t0s + # HACK: remove 16 when units will be fixed + rel_t0 = 16 * t0 - geds_t0 + + return l200_test_stat(rel_t0, amp) + + +def l200_test_stat(relative_t0, amp): + """Compute the test statistics. + + Parameters + ---------- + relative_t0 + t0 (ns) of the SiPM pulses relative to the HPGe t0. + amp + amplitude in p.e. of the SiPM pulses. + """ + return -ak.sum(ak.transform(_ak_l200_test_stat_terms, relative_t0, amp), axis=-1) + + +# need to define this function and use it with ak.transform() because scipy +# routines are not NumPy universal functions +def _ak_l200_test_stat_terms(layouts, **kwargs): + """Awkward transform to compute the per-pulse terms of the test statistics. + + The two arguments are the pulse times `t0` and their amplitude `amp`. The + function has to be invoked as ``ak.transform(_ak_l200_test_stat_terms, t0, amp, + ...)``. + """ + # sanity check + assert len(layouts) == 2 + + if not all([lay.is_numpy for lay in layouts]): + return + + # these are the two supported arguments + t0 = layouts[0].data + amp = layouts[1].data + + # sanity check + assert len(t0) == len(amp) + + # if there are no pulses return NaN + if len(t0) == 0 or any(np.isnan(t0)): + return ak.contents.NumpyArray([np.nan]) + + # convert to integer number of pes + n_pes = pulse_amp_round(amp) + n_pe_tot = np.sum(n_pes) + + t_stat = n_pes * np.log(l200_tc_time_pdf(t0)) / n_pe_tot + np.log( + l200_rc_amp_pdf(n_pe_tot) + ) + + return ak.contents.NumpyArray(t_stat) + + +def pulse_amp_round(amp: float | ArrayLike): + """Get the most likely (integer) number of photo-electrons.""" + # promote all amps < 1 to 1. standard rounding to nearest for + # amps > 1 + return ak.where(amp < 1, np.ceil(amp), np.floor(amp + 0.5)) + + +def l200_tc_time_pdf( + t0: float | ArrayLike, + *, + domain_ns: tuple[float] = (-1_000, 5_000), + tau_singlet_ns: float = 6, + tau_triplet_ns: float = 1100, + sing2trip_ratio: float = 1 / 3, + t0_res_ns: float = 35, + t0_bias_ns: float = -80, + bkg_prob: float = 0.42, +) -> float | ArrayLike: + """The L200 experimental LAr scintillation pdf + + The theoretical scintillation pdf convoluted with a Normal distribution + (experimental effects) and summed to a uniform distribution (uncorrelated + pulses). + + Parameters + ---------- + t0 + arrival times of the SiPM pulses in ns. + tau_singlet_ns + The lifetime of the LAr singlet state in ns. + tau_triplet_ns + The lifetime of the LAr triplet state in ns. + sing2trip_ratio + The singlet-to-triplet excitation probability ratio. + t0_res_ns + sigma (ns) of the normal distribution. + t0_bias_ns + mean (ns) of the normal distribution. + bkg_prob + probability for a pulse coming from some uncorrelated physics (uniform + distribution). + """ + if not np.all(t0 <= domain_ns[1] and t0 >= domain_ns[0]): + msg = f"{t0=} out of bounds for {domain_ns=}" + raise ValueError(msg) + + # TODO: make this a true pdf, i.e. 
normalize to integral 1 + return ( + # the triplet + (1 - sing2trip_ratio) + * scipy.stats.exponnorm.pdf( + t0, tau_triplet_ns / t0_res_ns, loc=t0_bias_ns, scale=t0_res_ns + ) + # the singlet + + sing2trip_ratio + * scipy.stats.exponnorm.pdf( + t0, tau_singlet_ns / t0_res_ns, loc=t0_bias_ns, scale=t0_res_ns + ) + # the random coincidences (uniform pdf) + + bkg_prob + * scipy.stats.uniform.pdf(t0, domain_ns[0], domain_ns[1] - domain_ns[0]) + ) + + +def l200_rc_amp_pdf(n): + return np.exp(-n) diff --git a/src/pygama/evt/modules/legend.py b/src/pygama/evt/modules/legend.py index 2ee2d7e8e..5ad620bda 100644 --- a/src/pygama/evt/modules/legend.py +++ b/src/pygama/evt/modules/legend.py @@ -1,6 +1,7 @@ """ Module provides LEGEND internal functions """ + from importlib import import_module from lgdo.lh5 import utils diff --git a/src/pygama/evt/modules/spm.py b/src/pygama/evt/modules/spm.py deleted file mode 100644 index 6e7140d17..000000000 --- a/src/pygama/evt/modules/spm.py +++ /dev/null @@ -1,527 +0,0 @@ -""" -Module for special event level routines for SiPMs - -functions must take as the first 8 args in order: -- path to the hit file -- path to the dsp ak.Array: - if isinstance(trgr, Array): - return ak.fill_none(ak.nan_to_none(trgr.view_as("ak")), tdefault) - - elif isinstance(trgr, (VectorOfVectors)): - return ak.fill_none( - ak.min(ak.fill_none(trgr.view_as("ak"), tdefault), axis=-1), tdefault - ) - - elif isinstance(trgr, (ak.Array, ak.highlevel.Array)): - if trgr.ndim == 1: - return ak.fill_none(ak.nan_to_none(trgr), tdefault) - elif trgr.ndim == 2: - return ak.fill_none( - ak.min(ak.fill_none(ak.nan_to_none(trgr), tdefault), axis=-1), tdefault - ) - else: - raise ValueError(f"Too many dimensions: {trgr.ndim}") - elif isinstance(trgr, (float, int)) and isinstance(length, int): - return ak.Array([trgr] * length) - else: - raise ValueError(f"Can't deal with t0 of type {type(trgr)}") - - -# get SiPM coincidence window mask -def get_spm_mask( - lim: float, trgr: ak.Array, tmin: float, tmax: float, pe: ak.Array, times: ak.Array -) -> ak.Array: - if trgr.ndim != 1: - raise ValueError("trigger array muse be 1 dimensional!") - if (len(trgr) != len(pe)) or (len(trgr) != len(times)): - raise ValueError( - f"All arrays must have same dimension across first axis len(pe)={len(pe)}, len(times)={len(times)}, len(trgr)={len(trgr)}" - ) - - tmi = trgr - tmin - tma = trgr + tmax - - mask = ( - ((times * 16.0) < tma[:, None]) & ((times * 16.0) > tmi[:, None]) & (pe > lim) - ) - return mask - - -# get LAr indices according to mask per event over all channels -# mode 0 -> return pulse indices -# mode 1 -> return tcm indices -# mode 2 -> return rawids -# mode 3 -> return tcm_idx -def get_masked_tcm_idx( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - mode=0, -) -> VectorOfVectors: - # load TCM data to define an event - store = LH5Store() - ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") - idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") - - arr_lst = [] - - if isinstance(trgr, (float, int)): - tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) - else: - tge = cast_trigger(trgr, tdefault, length=None) - - for ch in chs: - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] - - pe = store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[0].view_as( - "np" - ) - tmp = np.full((np.max(idx) + 1, len(pe[0])), np.nan) - tmp[idx_ch] = pe - pe = 
ak.drop_none(ak.nan_to_none(ak.Array(tmp))) - - # times are in sample units - times = store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ - 0 - ].view_as("np") - tmp = np.full((np.max(idx) + 1, len(times[0])), np.nan) - tmp[idx_ch] = times - times = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) - - mask = get_spm_mask(lim, tge, tmin, tmax, pe, times) - - if mode == 0: - out_idx = ak.local_index(mask)[mask] - - elif mode == 1: - out_idx = np.full((np.max(idx) + 1), np.nan) - out_idx[idx_ch] = np.where( - ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch) - )[0] - out_idx = ak.drop_none(ak.nan_to_none(ak.Array(out_idx)[:, None])) - out_idx = out_idx[mask[mask] - 1] - - elif mode == 2: - out_idx = ak.Array( - [utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] * len(mask) - ) - out_idx = out_idx[:, None][mask[mask] - 1] - - elif mode == 3: - out_idx = np.full((np.max(idx) + 1), np.nan) - out_idx[idx_ch] = idx_ch - out_idx = ak.drop_none(ak.nan_to_none(ak.Array(out_idx)[:, None])) - out_idx = out_idx[mask[mask] - 1] - - else: - raise ValueError("Unknown mode") - - arr_lst.append(out_idx) - - return VectorOfVectors(array=ak.concatenate(arr_lst, axis=-1)) - - -def get_spm_ene_or_maj( - f_hit, - f_tcm, - hit_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - mode, -): - if mode not in ["energy_hc", "energy_dplms", "majority_hc", "majority_dplms"]: - raise ValueError("Unknown mode") - - # load TCM data to define an event - store = LH5Store() - ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") - idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") - out = np.zeros(np.max(idx) + 1) - - if isinstance(trgr, (float, int)): - tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) - else: - tge = cast_trigger(trgr, tdefault, length=None) - - for ch in chs: - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] - - if mode in ["energy_dplms", "majority_dplms"]: - pe = ak.drop_none( - ak.nan_to_none( - store.read( - f"{ch}/{hit_group}/energy_in_pe_dplms", f_hit, idx=idx_ch - )[0].view_as("ak") - ) - ) - - # times are in sample units - times = ak.drop_none( - ak.nan_to_none( - store.read( - f"{ch}/{hit_group}/trigger_pos_dplms", f_hit, idx=idx_ch - )[0].view_as("ak") - ) - ) - - else: - pe = ak.drop_none( - ak.nan_to_none( - store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[ - 0 - ].view_as("ak") - ) - ) - - # times are in sample units - times = ak.drop_none( - ak.nan_to_none( - store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ - 0 - ].view_as("ak") - ) - ) - - mask = get_spm_mask(lim, tge[idx_ch], tmin, tmax, pe, times) - pe = pe[mask] - - if mode in ["energy_hc", "energy_dplms"]: - out[idx_ch] = out[idx_ch] + ak.to_numpy(ak.nansum(pe, axis=-1)) - - else: - out[idx_ch] = out[idx_ch] + ak.to_numpy( - ak.where(ak.nansum(pe, axis=-1) > lim, 1, 0) - ) - - return Array(nda=out) - - -# get LAr energy per event over all channels -def get_energy( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, -) -> Array: - return get_spm_ene_or_maj( - f_hit, - f_tcm, - hit_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - "energy_hc", - ) - - -# get LAr majority per event over all channels -def get_majority( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - 
tmax, -) -> Array: - return get_spm_ene_or_maj( - f_hit, - f_tcm, - hit_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - "majority_hc", - ) - - -# get LAr energy per event over all channels -def get_energy_dplms( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, -) -> Array: - return get_spm_ene_or_maj( - f_hit, - f_tcm, - hit_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - "energy_dplms", - ) - - -# get LAr majority per event over all channels -def get_majority_dplms( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, -) -> Array: - return get_spm_ene_or_maj( - f_hit, - f_tcm, - hit_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - "majority_dplms", - ) - - -# Calculate the ETC in different trailing modes: -# trail = 0: Singlet window = [tge,tge+swin] -# trail = 1: Singlet window = [t_first_lar_pulse, t_first_lar_pulse+ swin] -# trail = 2: Like trail = 1, but t_first_lar_pulse <= tge is ensured -# min_first_pls_ene sets the minimum energy of the first pulse (only used in trail > 0) -# max_per_channel, maximum number of pes a channel is allowed to have, if above it gets excluded -def get_etc( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, - swin, - trail, - min_first_pls_ene, - max_per_channel, -) -> Array: - # load TCM data to define an event - store = LH5Store() - ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") - idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") - pe_lst = [] - time_lst = [] - - if isinstance(trgr, (float, int)): - tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) - else: - tge = cast_trigger(trgr, tdefault, length=None) - - for ch in chs: - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] - - pe = store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[0].view_as( - "np" - ) - tmp = np.full((np.max(idx) + 1, len(pe[0])), np.nan) - tmp[idx_ch] = pe - pe = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) - - # times are in sample units - times = store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ - 0 - ].view_as("np") - tmp = np.full((np.max(idx) + 1, len(times[0])), np.nan) - tmp[idx_ch] = times - times = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) - - mask = get_spm_mask(lim, tge, tmin, tmax, pe, times) - - pe = pe[mask] - - # max pe mask - max_pe_mask = ak.nansum(pe, axis=-1) < max_per_channel - pe = ak.drop_none( - ak.nan_to_none(ak.where(max_pe_mask, pe, ak.Array([[np.nan]]))) - ) - pe_lst.append(pe) - - times = times[mask] * 16 - times = ak.drop_none( - ak.nan_to_none(ak.where(max_pe_mask, times, ak.Array([[np.nan]]))) - ) - time_lst.append(times) - - pe_all = ak.concatenate(pe_lst, axis=-1) - time_all = ak.concatenate(time_lst, axis=-1) - - if trail > 0: - t1d = ak.min(time_all[pe_all > min_first_pls_ene], axis=-1) - - if trail == 2: - t1d = ak.where(t1d > tge, tge, t1d) - - mask_total = time_all > t1d - mask_singlet = (time_all > t1d) & (time_all < t1d + swin) - - else: - mask_total = time_all > tge - mask_singlet = (time_all > tge) & (time_all < tge + swin) - - pe_singlet = ak.to_numpy( - ak.fill_none(ak.nansum(pe_all[mask_singlet], axis=-1), 0), allow_missing=False - ) - 
pe_total = ak.to_numpy( - ak.fill_none(ak.nansum(pe_all[mask_total], axis=-1), 0), allow_missing=False - ) - etc = np.divide( - pe_singlet, pe_total, out=np.full_like(pe_total, np.nan), where=pe_total != 0 - ) - - return Array(nda=etc) - - -# returns relative time shift of the first LAr pulse relative to the Ge trigger -def get_time_shift( - f_hit, - f_dsp, - f_tcm, - hit_group, - dsp_group, - tcm_group, - tcm_id_table_pattern, - chs, - lim, - trgr, - tdefault, - tmin, - tmax, -) -> Array: - store = LH5Store() - # load TCM data to define an event - ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") - idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") - time_all = ak.Array([[] for x in range(np.max(idx) + 1)]) - - if isinstance(trgr, (float, int)): - tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) - else: - tge = cast_trigger(trgr, tdefault, length=None) - - for ch in chs: - idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] - - pe = store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[0].view_as( - "np" - ) - tmp = np.full((np.max(idx) + 1, len(pe[0])), np.nan) - tmp[idx_ch] = pe - pe = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) - - # times are in sample units - times = store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ - 0 - ].view_as("np") - tmp = np.full((np.max(idx) + 1, len(times[0])), np.nan) - tmp[idx_ch] = times - times = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) - - mask = get_spm_mask(lim, tge, tmin, tmax, pe, times) - - # apply mask and convert sample units to ns - times = times[mask] * 16 - - time_all = ak.concatenate((time_all, times), axis=-1) - - out = ak.min(time_all, axis=-1) - - # Convert to 1D numpy array - out = ak.to_numpy(ak.fill_none(out, np.inf), allow_missing=False) - tge = ak.to_numpy(tge, allow_missing=False) - - return Array(out - tge) diff --git a/src/pygama/evt/modules/spms.py b/src/pygama/evt/modules/spms.py new file mode 100644 index 000000000..2f40c472a --- /dev/null +++ b/src/pygama/evt/modules/spms.py @@ -0,0 +1,381 @@ +"""Event processors for SiPM data.""" + +from __future__ import annotations + +from collections.abc import Sequence + +import awkward as ak +import numpy as np +from lgdo import lh5, types + +from .. import utils +from . import larveto + + +def gather_pulse_data( + datainfo: utils.DataInfo, + tcm: utils.TCMData, + table_names: Sequence[str], + *, + observable: str, + pulse_mask: types.VectorOfVectors = None, + a_thr_pe: float = None, + t_loc_ns: float = None, + dt_range_ns: Sequence[float] = None, + t_loc_default_ns: float = None, + drop_empty: bool = True, +) -> types.VectorOfVectors: + """Gathers SiPM pulse data into a 3D :class:`~lgdo.types.vectorofvectors.VectorOfVectors`. + + The returned data structure specifies the event in the first axis, the SiPM + channel in the second and the pulse index in the last. + + Pulse data can be optionally masked with `pulse_mask` or a mask can be + built on the fly from the `a_thr_pe`, `t_loc_ns`, `dt_range_ns`, + `t_loc_default_ns` arguments (see :func:`make_pulse_data_mask`). + + If `pulse_mask`, `a_thr_pe`, `t_loc_ns`, `dt_range_ns`, `t_loc_default_ns` + are all ``None``, no masking is applied and the full data set is returned. + + Parameters + ---------- + datainfo, tcm, table_names + positional arguments automatically supplied by :func:`.build_evt`. + observable + name of the pulse parameter to be gathered, optionally prefixed by tier + name (e.g. ``hit.energy_in_pe``). 
If no tier is specified, it defaults + to ``hit``. + pulse_mask + 3D mask object used to filter out pulse data. See + :func:`make_pulse_data_mask`. + a_thr_pe + amplitude threshold (in photoelectrons) used to build a pulse mask with + :func:`make_pulse_data_mask`, if `pulse_mask` is ``None``. The output + pulse data will be such that the pulse amplitude is above this value. + t_loc_ns + location of the time window in which pulses must sit. If a 1D array is + provided, it is interpreted as a list of locations for each event (can + be employed to e.g. provide the actual HPGe pulse position) + dt_range_ns + tuple with dimension of the time window in which pulses must sit + relative to `t_loc_ns`. If, for example, `t_loc_ns` is 48000 ns and + `dt_range_ns` is (-1000, 5000) ns, the resulting window will be (47000, + 53000) ns. + t_loc_default_ns + default value for `t_loc_ns`, in case the supplied value is + :any:`numpy.nan`. + drop_empty + if ``True``, drop empty arrays at the last axis (the pulse axis), i.e. + drop channels with no pulse data. The filtering is applied after the + application of the mask. + """ + # parse observables string. default to hit tier + p = observable.split(".") + tier = p[0] if len(p) > 1 else "hit" + column = p[1] if len(p) > 1 else p[0] + + tierinfo = datainfo._asdict()[tier] + + # loop over selected table_names and load hit data + concatme = [] + for channel in table_names: + table_id = utils.get_tcm_id_by_pattern(tierinfo.table_fmt, channel) + + # determine list of indices found in the TCM that we want to load for channel + idx = tcm.idx[tcm.id == table_id] + + # read the data in + lgdo_obj = lh5.read( + f"/{channel}/{tierinfo.group}/{column}", tierinfo.file, idx=idx + ) + data = lgdo_obj.view_as(library="ak") + + # remove nans (this happens when SiPM data is stored as ArrayOfEqualSizedArrays) + data = ak.drop_none(ak.nan_to_none(data)) + + # increase the dimensionality by one (events) + data = ak.unflatten(data, np.full(data.layout.length, 1, dtype="uint8")) + + concatme.append(data) + + # concatenate along the event axes (i.e. gather table_names together) + data = ak.concatenate(concatme, axis=1) + + # check if user wants to apply a mask + if pulse_mask is None and any( + [kwarg is not None for kwarg in (a_thr_pe, t_loc_ns, dt_range_ns)] + ): + # generate the time/amplitude mask from parameters + pulse_mask = make_pulse_data_mask( + datainfo, + tcm, + table_names, + a_thr_pe=a_thr_pe, + t_loc_ns=t_loc_ns, + dt_range_ns=dt_range_ns, + t_loc_default_ns=t_loc_default_ns, + ) + + if pulse_mask is not None: + if not isinstance(pulse_mask, ak.Array): + pulse_mask = pulse_mask.view_as("ak") + + # apply the mask + data = data[pulse_mask] + + # remove empty arrays = table_names with no pulses + if drop_empty: + data = data[ak.count(data, axis=-1) > 0] + + return types.VectorOfVectors(data, attrs=utils.copy_lgdo_attrs(lgdo_obj)) + + +def gather_tcm_data( + datainfo: utils.DataInfo, + tcm: utils.TCMData, + table_names: Sequence[str], + *, + tcm_field="id", + pulse_mask=None, + a_thr_pe=None, + t_loc_ns=None, + dt_range_ns=None, + t_loc_default_ns=None, + drop_empty=True, +) -> types.VectorOfVectors: + """Gather TCM data into a 2D :class:`~lgdo.types.vectorofvectors.VectorOfVectors`. + + The returned data structure specifies the event on the first axis and the + TCM data (`id` or `idx`) on the second. Can be used to filter out data from + :func:`gather_pulse_data` based on SiPM channel provenance (`id`) or to + load hit data from lower tiers (with `idx`). 
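To make the event x channel x pulse layout concrete, a hypothetical reduction
of the :func:`gather_pulse_data` output (summing pulse amplitudes per channel
and then per event) could look like this; ``datainfo``, ``tcm`` and
``table_names`` are assumed to be the objects supplied automatically by
:func:`.build_evt`::

    import awkward as ak

    # 3D VectorOfVectors: event x channel x pulse
    energies = gather_pulse_data(
        datainfo,
        tcm,
        table_names,
        observable="hit.energy_in_pe",
        drop_empty=True,
    ).view_as("ak")

    pe_per_channel = ak.sum(energies, axis=-1)      # event x channel
    pe_per_event = ak.sum(pe_per_channel, axis=-1)  # one number per event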
+ + If `drop_empty` is ``True``, channel ids with no pulse data associated are + removed. + + See :func:`gather_pulse_data` for documentation about the other function + arguments. + """ + # unflatten the tcm data with cumulative_length, i.e. make a VoV + tcm_vov = {} + for field in ("id", "idx"): + tcm_vov[field] = types.VectorOfVectors( + flattened_data=tcm._asdict()[field], cumulative_length=tcm.cumulative_length + ).view_as("ak") + + # list user wanted table names + table_ids = [ + utils.get_tcm_id_by_pattern(datainfo.hit.table_fmt, id) for id in table_names + ] + # find them in tcm.id (we'll filter the rest out) + locs = np.isin(tcm_vov["id"], table_ids) + + # select tcm field requested by the user + data = tcm_vov[tcm_field] + + # apply mask + # NOTE: need to cast to irregular axes, otherwise the masking result is + # non-nested + data = data[ak.from_regular(locs)] + + # check if user wants to apply a custom mask + if drop_empty: + if pulse_mask is None: + # generate the time/amplitude mask from parameters + # if all parameters are None, a dummy mask (the identity) will be made + pulse_mask = make_pulse_data_mask( + datainfo, + tcm, + table_names, + a_thr_pe=a_thr_pe, + t_loc_ns=t_loc_ns, + dt_range_ns=dt_range_ns, + t_loc_default_ns=t_loc_default_ns, + ) + + if not isinstance(pulse_mask, ak.Array): + pulse_mask = pulse_mask.view_as("ak") + + if pulse_mask.ndim != 3: + msg = "pulse_mask must be 3D" + raise ValueError(msg) + + # convert the 3D mask to a 2D mask (can be used to filter table_ids) + ch_mask = ak.sum(pulse_mask, axis=-1) > 0 + + # apply the mask + data = data[ch_mask] + + return types.VectorOfVectors(data) + + +# NOTE: the mask never gets the empty arrays removed +def make_pulse_data_mask( + datainfo: utils.DataInfo, + tcm: utils.TCMData, + table_names: Sequence[str], + *, + a_thr_pe=None, + t_loc_ns=None, + dt_range_ns=None, + t_loc_default_ns=None, +) -> types.VectorOfVectors: + """Calculate a 3D :class:`~lgdo.types.vectorofvectors.VectorOfVectors` pulse data mask. + + Useful to filter any pulse data based on pulse amplitude and start time. + + Parameters + ---------- + datainfo, tcm, table_names + positional arguments automatically supplied by :func:`.build_evt`. + a_thr_pe + amplitude threshold (in photoelectrons) used to build a pulse mask with + :func:`make_pulse_data_mask`, if `pulse_mask` is ``None``. The output + pulse data will be such that the pulse amplitude is above this value. + t_loc_ns + location of the time window in which pulses must sit. If a 1D array is + provided, it is interpreted as a list of locations for each event (can + be employed to e.g. provide the actual HPGe pulse position) + dt_range_ns + tuple with dimension of the time window in which pulses must sit + relative to `t_loc_ns`. If, for example, `t_loc_ns` is 48000 ns and + `dt_range_ns` is (-1000, 5000) ns, the resulting window will be (47000, + 53000) ns. + t_loc_default_ns + default value for `t_loc_ns`, in case the supplied value is + :any:`numpy.nan`. 
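Since the resulting mask is accepted by both :func:`gather_pulse_data` and
:func:`gather_tcm_data`, it can be built once and reused, e.g. to keep only
pulses in a window around the HPGe t0. A sketch, assuming ``geds_t0_ns`` is a
per-event :class:`~lgdo.types.array.Array` available from the configuration::

    mask = make_pulse_data_mask(
        datainfo,
        tcm,
        table_names,
        a_thr_pe=None,
        t_loc_ns=geds_t0_ns,
        dt_range_ns=(-1_000, 5_000),
        t_loc_default_ns=48_000,
    )

    amps = gather_pulse_data(
        datainfo,
        tcm,
        table_names,
        observable="hit.energy_in_pe",
        pulse_mask=mask,
        drop_empty=True,
    )
    chan_ids = gather_tcm_data(
        datainfo,
        tcm,
        table_names,
        tcm_field="id",
        pulse_mask=mask,
        drop_empty=True,
    )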
+ """ + # get the t0 of each single pulse + pulse_t0 = gather_pulse_data( + datainfo, + tcm, + table_names, + observable="hit.trigger_pos", + drop_empty=False, + ) + + # HACK: handle units + # HACK: remove me once units are fixed in the dsp tier + if "units" in pulse_t0.attrs and pulse_t0.attrs["units"] == "ns": + pulse_t0_ns = pulse_t0.view_as("ak") + else: + pulse_t0_ns = pulse_t0.view_as("ak") * 16 + + pulse_amp = gather_pulse_data( + datainfo, + tcm, + table_names, + observable="hit.energy_in_pe", + drop_empty=False, + ).view_as("ak") + + # (HPGe) trigger position can vary among events! + if isinstance(t_loc_ns, types.Array): + t_loc_ns = t_loc_ns.view_as("ak") + + if isinstance(t_loc_ns, ak.Array): + if t_loc_ns.ndim != 1: + msg = "t_loc_ns must be 0- or 1-dimensional" + raise ValueError(msg) + + # NOTE: the assumption is that t0 is np.nan when missing -> replace + # with default value + t_loc_ns = ak.fill_none(ak.nan_to_none(t_loc_ns), t_loc_default_ns) + + # start with all-true mask + mask = pulse_t0_ns == pulse_t0_ns + + # apply p.e. threshold + if a_thr_pe is not None: + mask = mask & (pulse_amp > a_thr_pe) + + # apply time windowing + if t_loc_ns is not None and dt_range_ns is not None: + if not isinstance(dt_range_ns, (tuple, list)): + msg = "dt_range_ns must be a tuple" + raise ValueError(msg) + + mask = mask & ( + (pulse_t0_ns < (t_loc_ns + dt_range_ns[1])) + & (pulse_t0_ns > (t_loc_ns + dt_range_ns[0])) + ) + + return types.VectorOfVectors(mask) + + +def geds_coincidence_classifier( + datainfo: utils.DataInfo, + tcm: utils.TCMData, + table_names: Sequence[str], + *, + geds_t0_ns: types.Array, +) -> types.Array: + """Calculate the HPGe / SiPMs coincidence classifier. + + The value represents the likelihood of a physical correlation between HPGe + and SiPM signals. + + Parameters + ---------- + datainfo, tcm, table_names + positional arguments automatically supplied by :func:`.build_evt`. + """ + # mask for windowing data around the HPGe t0 + pulse_mask = make_pulse_data_mask( + datainfo, + tcm, + table_names, + a_thr_pe=None, + t_loc_ns=geds_t0_ns, + dt_range_ns=(-1_000, 5_000), + t_loc_default_ns=48_000, + ) + + # load the data + data = {} + for k, obs in {"amp": "hit.energy_in_pe", "t0": "hit.trigger_pos"}.items(): + data[k] = gather_pulse_data( + datainfo, + tcm, + table_names, + observable=obs, + pulse_mask=pulse_mask, + drop_empty=True, + ).view_as("ak") + + # load the channel info + # rawids = spms.gather_tcm_id_data( + # datainfo, + # tcm, + # table_names, + # pulse_mask=pulse_mask, + # drop_empty=True, + # ) + + # (HPGe) trigger position can vary among events! 
+ if isinstance(geds_t0_ns, types.Array): + geds_t0_ns = geds_t0_ns.view_as("ak") + + ts_data = larveto.l200_combined_test_stat(data["t0"], data["amp"], geds_t0_ns) + + return types.Array(ts_data) + + +# REMOVE ME: not needed anymore with VectorOfVectors DSP outputs +def gather_is_valid_hit(datainfo, tcm, table_names): + data = {} + for field in ("is_valid_hit", "trigger_pos"): + data[field] = gather_pulse_data( + datainfo, + tcm, + table_names, + observable=f"hit.{field}", + pulse_mask=None, + drop_empty=False, + ).view_as("ak") + + return types.VectorOfVectors( + data["is_valid_hit"][ + ak.local_index(data["is_valid_hit"]) < ak.num(data["trigger_pos"], axis=-1) + ] + ) diff --git a/src/pygama/evt/utils.py b/src/pygama/evt/utils.py index 175cd868a..30d14639a 100644 --- a/src/pygama/evt/utils.py +++ b/src/pygama/evt/utils.py @@ -4,192 +4,189 @@ from __future__ import annotations +import copy import re +from collections import namedtuple import awkward as ak import numpy as np -from lgdo.lh5 import LH5Store +from lgdo import lh5 from numpy.typing import NDArray +H5DataLoc = namedtuple( + "H5DataLoc", ("file", "group", "table_fmt"), defaults=3 * (None,) +) -def get_tcm_id_by_pattern(tcm_id_table_pattern: str, ch: str) -> int: - pre = tcm_id_table_pattern.split("{")[0] - post = tcm_id_table_pattern.split("}")[1] +DataInfo = namedtuple( + "DataInfo", ("raw", "tcm", "dsp", "hit", "evt"), defaults=5 * (None,) +) + +TCMData = namedtuple("TCMData", ("id", "idx", "cumulative_length")) + + +def make_files_config(data: dict): + if not isinstance(data, DataInfo): + return DataInfo( + *[ + H5DataLoc(*data[tier]) if tier in data else H5DataLoc() + for tier in DataInfo._fields + ] + ) + + return data + + +def make_numpy_full(size, fill_value, try_dtype): + if np.can_cast(fill_value, try_dtype): + return np.full(size, fill_value, dtype=try_dtype) + else: + return np.full(size, fill_value) + + +def copy_lgdo_attrs(obj): + attrs = copy.copy(obj.attrs) + attrs.pop("datatype") + return attrs + + +def get_tcm_id_by_pattern(table_id_fmt: str, ch: str) -> int: + pre = table_id_fmt.split("{")[0] + post = table_id_fmt.split("}")[1] return int(ch.strip(pre).strip(post)) -def get_table_name_by_pattern(tcm_id_table_pattern: str, ch_id: int) -> str: - # check tcm_id_table_pattern validity - pattern_check = re.findall(r"{([^}]*?)}", tcm_id_table_pattern)[0] +def get_table_name_by_pattern(table_id_fmt: str, ch_id: int) -> str: + # check table_id_fmt validity + pattern_check = re.findall(r"{([^}]*?)}", table_id_fmt)[0] if pattern_check == "" or ":" == pattern_check[0]: - return tcm_id_table_pattern.format(ch_id) + return table_id_fmt.format(ch_id) else: raise NotImplementedError( - "Only empty placeholders with format specifications are currently implemented" + "only empty placeholders {} in format specifications are currently supported" ) -def num_and_pars(value: str, par_dic: dict): - # function tries to convert a string to a int, float, bool - # or returns the value if value is a key in par_dic - if value in par_dic.keys(): - return par_dic[value] - try: - value = int(value) - except ValueError: - try: - value = float(value) - except ValueError: - try: - value = bool(value) - except ValueError: - pass - return value - - def find_parameters( - f_hit: str, - f_dsp: str, - ch: str, - idx_ch: NDArray, - exprl: list, - hit_group: str = "hit", - dsp_group: str = "dsp", + datainfo, + ch, + idx_ch, + field_list, ) -> dict: - """Wraps :func:`load_vars_to_nda` to return parameters from `hit` and `dsp` - tiers. 
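The named tuples introduced above in ``evt/utils.py`` replace the long lists of
file/group/pattern arguments used previously. A small sketch of how a
``DataInfo`` object can be assembled from a plain dict with
``make_files_config`` (the file names are made up)::

    from pygama.evt.utils import make_files_config

    # each entry is (file, group, table_fmt); missing tiers default to an empty H5DataLoc
    datainfo = make_files_config(
        {
            "dsp": ("f_dsp.lh5", "dsp", "ch{}"),
            "hit": ("f_hit.lh5", "hit", "ch{}"),
            "evt": (None, "evt", None),
        }
    )

    datainfo.hit.file       # "f_hit.lh5"
    datainfo.hit.table_fmt  # "ch{}"
    datainfo.raw            # H5DataLoc(file=None, group=None, table_fmt=None)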
+ """Finds and returns parameters from `hit` and `dsp` tiers. Parameters ---------- - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. ch "rawid" in the tiers. idx_ch - index array of entries to be read from files. - exprl + index array of entries to be read from datainfo. + field_list list of tuples ``(tier, field)`` to be found in the `hit/dsp` tiers. - dsp_group - LH5 root group in dsp file. - hit_group - LH5 root group in hit file. """ + f = make_files_config(datainfo) # find fields in either dsp, hit - dsp_flds = [e[1] for e in exprl if e[0] == dsp_group] - hit_flds = [e[1] for e in exprl if e[0] == hit_group] + dsp_flds = [e[1] for e in field_list if e[0] == f.dsp.group] + hit_flds = [e[1] for e in field_list if e[0] == f.hit.group] - store = LH5Store() hit_dict, dsp_dict = {}, {} + if len(hit_flds) > 0: - hit_ak = store.read( - f"{ch.replace('/','')}/{hit_group}/", f_hit, field_mask=hit_flds, idx=idx_ch - )[0].view_as("ak") + hit_ak = lh5.read_as( + f"{ch.replace('/','')}/{f.hit.group}/", + f.hit.file, + field_mask=hit_flds, + idx=idx_ch, + library="ak", + ) + hit_dict = dict( - zip([f"{hit_group}_" + e for e in ak.fields(hit_ak)], ak.unzip(hit_ak)) + zip([f"{f.hit.group}_" + e for e in ak.fields(hit_ak)], ak.unzip(hit_ak)) ) + if len(dsp_flds) > 0: - dsp_ak = store.read( - f"{ch.replace('/','')}/{dsp_group}/", f_dsp, field_mask=dsp_flds, idx=idx_ch - )[0].view_as("ak") + dsp_ak = lh5.read_as( + f"{ch.replace('/','')}/{f.dsp.group}/", + f.dsp.file, + field_mask=dsp_flds, + idx=idx_ch, + library="ak", + ) + dsp_dict = dict( - zip([f"{dsp_group}_" + e for e in ak.fields(dsp_ak)], ak.unzip(dsp_ak)) + zip([f"{f.dsp.group}_" + e for e in ak.fields(dsp_ak)], ak.unzip(dsp_ak)) ) return hit_dict | dsp_dict def get_data_at_channel( - ch: str, - ids: NDArray, - idx: NDArray, - expr: str, - exprl: list, - var_ph: dict, - is_evaluated: bool, - f_hit: str, - f_dsp: str, - defv, - tcm_id_table_pattern: str = "ch{}", - evt_group: str = "evt", - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> np.ndarray: + datainfo, + ch, + tcm, + expr, + field_list, + pars_dict, +) -> NDArray: """Evaluates an expression and returns the result. Parameters ---------- + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. ch "rawid" of channel to be evaluated. - idx - `tcm` index array. - ids - `tcm` id array. + tcm + TCM data arrays in an object that can be accessed by attribute. expr expression to be evaluated. - exprl + field_list list of parameter-tuples ``(root_group, field)`` found in the expression. - var_ph + pars_dict dict of additional parameters that are not channel dependent. is_evaluated if false, the expression does not get evaluated but an array of default values is returned. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - defv + default_value default value. - tcm_id_table_pattern - Pattern to format tcm id values to table name in higher tiers. Must have one - placeholder which is the tcm id. - dsp_group - LH5 root group in dsp file. - hit_group - LH5 root group in hit file. - evt_group - LH5 root group in evt file. 
""" + f = make_files_config(datainfo) + table_id = get_tcm_id_by_pattern(f.hit.table_fmt, ch) # get index list for this channel to be loaded - idx_ch = idx[ids == get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + idx_ch = tcm.idx[tcm.id == table_id] outsize = len(idx_ch) - if not is_evaluated: - res = np.full(outsize, defv, dtype=type(defv)) - elif "tcm.array_id" == expr: - res = np.full( - outsize, get_tcm_id_by_pattern(tcm_id_table_pattern, ch), dtype=int - ) - elif "tcm.index" == expr: - res = np.where(ids == get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0] + if expr == "tcm.array_id": + res = np.full(outsize, table_id, dtype=int) + elif expr == "tcm.array_idx": + res = idx_ch + elif expr == "tcm.index": + res = np.where(tcm.id == table_id)[0] else: var = find_parameters( - f_hit=f_hit, - f_dsp=f_dsp, + datainfo=datainfo, ch=ch, idx_ch=idx_ch, - exprl=exprl, - hit_group=hit_group, - dsp_group=dsp_group, + field_list=field_list, ) - if var_ph is not None: - var = var | var_ph + if pars_dict is not None: + var = var | pars_dict # evaluate expression # move tier+dots in expression to underscores (e.g. evt.foo -> evt_foo) res = eval( - expr.replace(f"{dsp_group}.", f"{dsp_group}_") - .replace(f"{hit_group}.", f"{hit_group}_") - .replace(f"{evt_group}.", ""), + expr.replace(f"{f.dsp.group}.", f"{f.dsp.group}_") + .replace(f"{f.hit.group}.", f"{f.hit.group}_") + .replace(f"{f.evt.group}.", ""), var, ) # in case the expression evaluates to a single value blow it up - if (not hasattr(res, "__len__")) or (isinstance(res, str)): + if not hasattr(res, "__len__") or isinstance(res, str): return np.full(outsize, res) # the resulting arrays need to be 1D from the operation, @@ -200,27 +197,28 @@ def get_data_at_channel( # in this method only 1D values are allowed if res.ndim > 1: raise ValueError( - f"expression '{expr}' must return 1D array. If you are using VectorOfVectors or ArrayOfEqualSizedArrays, use awkward reduction functions to reduce the dimension" + f"expression '{expr}' must return 1D array. If you are using " + "VectorOfVectors or ArrayOfEqualSizedArrays, use awkward " + "reduction functions to reduce the dimension" ) return res def get_mask_from_query( - qry: str | NDArray, - length: int, - ch: str, - idx_ch: NDArray, - f_hit: str, - f_dsp: str, - hit_group: str = "hit", - dsp_group: str = "dsp", -) -> np.ndarray: + datainfo, + query, + length, + ch, + idx_ch, +) -> NDArray: """Evaluates a query expression and returns a mask accordingly. Parameters ---------- - qry + datainfo + input and output LH5 datainfo with HDF5 groups where tables are found. + query query expression. length length of the return mask. @@ -228,33 +226,23 @@ def get_mask_from_query( "rawid" of channel to be evaluated. idx_ch channel indices to be read. - f_hit - path to `hit` tier file. - f_dsp - path to `dsp` tier file. - hit_group - LH5 root group in hit file. - dsp_group - LH5 root group in dsp file. 
""" + f = make_files_config(datainfo) # get sub evt based query condition if needed - if isinstance(qry, str): - qry_lst = re.findall(r"(hit|dsp).([a-zA-Z_$][\w$]*)", qry) - qry_var = find_parameters( - f_hit=f_hit, - f_dsp=f_dsp, + if isinstance(query, str): + query_lst = re.findall(r"(hit|dsp).([a-zA-Z_$][\w$]*)", query) + query_var = find_parameters( + datainfo=datainfo, ch=ch, idx_ch=idx_ch, - exprl=qry_lst, - hit_group=hit_group, - dsp_group=dsp_group, + field_list=query_lst, ) limarr = eval( - qry.replace(f"{dsp_group}.", f"{dsp_group}_").replace( - f"{hit_group}.", f"{hit_group}_" + query.replace(f"{f.dsp.group}.", f"{f.dsp.group}_").replace( + f"{f.hit.group}.", f"{f.hit.group}_" ), - qry_var, + query_var, ) # in case the expression evaluates to a single value blow it up @@ -264,12 +252,14 @@ def get_mask_from_query( limarr = ak.to_numpy(limarr, allow_missing=False) if limarr.ndim > 1: raise ValueError( - f"query '{qry}' must return 1D array. If you are using VectorOfVectors or ArrayOfEqualSizedArrays, use awkward reduction functions to reduce the dimension" + f"query '{query}' must return 1D array. If you are using " + "VectorOfVectors or ArrayOfEqualSizedArrays, use awkward " + "reduction functions to reduce the dimension" ) # or forward the array - elif isinstance(qry, np.ndarray): - limarr = qry + elif isinstance(query, np.ndarray): + limarr = query # if no condition, it must be true else: diff --git a/src/pygama/flow/data_loader.py b/src/pygama/flow/data_loader.py index 7e5c38616..aa65d2a3c 100644 --- a/src/pygama/flow/data_loader.py +++ b/src/pygama/flow/data_loader.py @@ -17,7 +17,7 @@ from lgdo.lh5 import LH5Iterator, LH5Store from lgdo.lh5.utils import expand_vars from lgdo.types import Array, Struct, Table -from lgdo.types.vectorofvectors import build_cl, explode_arrays, explode_cl +from lgdo.types.vovutils import build_cl, explode_arrays, explode_cl from tqdm.auto import tqdm from . import utils diff --git a/src/pygama/flow/file_db.py b/src/pygama/flow/file_db.py index fdca65b2d..954c07f50 100644 --- a/src/pygama/flow/file_db.py +++ b/src/pygama/flow/file_db.py @@ -1,4 +1,5 @@ """Utilities for LH5 file inventory.""" + from __future__ import annotations import json diff --git a/src/pygama/hit/build_hit.py b/src/pygama/hit/build_hit.py index 2a6d6a066..7a8c6a241 100644 --- a/src/pygama/hit/build_hit.py +++ b/src/pygama/hit/build_hit.py @@ -1,9 +1,9 @@ """ This module implements routines to evaluate expressions to columnar data. """ + from __future__ import annotations -import json import logging import os from collections import OrderedDict @@ -13,6 +13,8 @@ import numpy as np from lgdo.lh5 import LH5Iterator, LH5Store, ls +from .. 
import utils + log = logging.getLogger(__name__) @@ -95,20 +97,17 @@ def build_hit( tbl_cfg = lh5_tables_config # sanitize config if isinstance(tbl_cfg, str): - with open(tbl_cfg) as f: - tbl_cfg = json.load(f) + tbl_cfg = utils.load_dict(tbl_cfg) for k, v in tbl_cfg.items(): if isinstance(v, str): - with open(v) as f: - tbl_cfg[k] = json.load(f) + tbl_cfg[k] = utils.load_dict(v) lh5_tables_config = tbl_cfg else: if isinstance(hit_config, str): # sanitize config - with open(hit_config) as f: - hit_config = json.load(f) + hit_config = utils.load_dict(hit_config) if lh5_tables is None: lh5_tables_config = {} diff --git a/src/pygama/logging.py b/src/pygama/logging.py index 8271dc08e..7c494bafe 100644 --- a/src/pygama/logging.py +++ b/src/pygama/logging.py @@ -1,4 +1,5 @@ """This module implements some helpers for setting up logging.""" + import logging import colorlog diff --git a/src/pygama/math/binned_fitting.py b/src/pygama/math/binned_fitting.py index aef242e74..81f6455d3 100644 --- a/src/pygama/math/binned_fitting.py +++ b/src/pygama/math/binned_fitting.py @@ -90,7 +90,9 @@ def fit_binned(func: Callable, hist: np.ndarray, bins: np.ndarray, var: np.ndarr m = Minuit(cost_func, *guess) if bounds is not None: - m.limits = bounds + if isinstance(bounds, dict): + for key, val in bounds.items(): + m.limits[key] = val if fixed is not None: for fix in fixed: m.fixed[fix] = True diff --git a/src/pygama/math/functions/exgauss.py b/src/pygama/math/functions/exgauss.py index eaa54a446..6af84269f 100644 --- a/src/pygama/math/functions/exgauss.py +++ b/src/pygama/math/functions/exgauss.py @@ -60,6 +60,8 @@ def nb_gauss_tail_exact(x: float, mu: float, sigma: float, tau: float, tmp: floa abstau = np.absolute(tau) if tmp < limit: tmp = tmp else: tmp = limit + if sigma ==0 or abstau ==0: + return x*0 z = (x-mu)/sigma tail_f = (1/(2*abstau)) * np.exp(tmp) * erfc( (tau*z + sigma)/(np.sqrt(2)*abstau)) return tail_f @@ -88,7 +90,10 @@ def nb_gauss_tail_approx(x: np.ndarray, mu: float, sigma: float, tau: float) -> -------- :func:`nb_exgauss_pdf` """ - + if sigma ==0: + return x*0 + elif (sigma + tau*(x-mu)/sigma)==0: + return x*0 den = 1/(sigma + tau*(x-mu)/sigma) tail_f = sigma * nb_gauss_pdf(x, mu, sigma) * den * (1.-tau*tau*den*den) return tail_f @@ -125,11 +130,14 @@ def nb_exgauss_pdf(x: np.ndarray, mu: float, sigma: float, tau: float) -> np.nda x = np.asarray(x) tail_f = np.empty_like(x, dtype=np.float64) for i in nb.prange(x.shape[0]): - tmp = ((x[i]-mu)/tau) + ((sigma**2)/(2*tau**2)) - if tmp < limit: - tail_f[i] = nb_gauss_tail_exact(x[i], mu, sigma, tau, tmp) + if tau == 0: + tail_f[i] = np.nan else: - tail_f[i] = nb_gauss_tail_approx(x[i], mu, sigma, tau) + tmp = ((x[i]-mu)/tau) + ((sigma**2)/(2*tau**2)) + if tmp < limit: + tail_f[i] = nb_gauss_tail_exact(x[i], mu, sigma, tau, tmp) + else: + tail_f[i] = nb_gauss_tail_approx(x[i], mu, sigma, tau) return tail_f @@ -163,12 +171,15 @@ def nb_exgauss_cdf(x: np.ndarray, mu: float, sigma: float, tau: float) -> np.nda cdf = np.empty_like(x, dtype=np.float64) for i in nb.prange(x.shape[0]): - cdf[i] = (tau/(2*abstau)) * erf((tau*(x[i]-mu) )/(np.sqrt(2)*sigma*abstau)) - tmp = ((x[i]-mu)/tau) + ((sigma**2)/(2*tau**2)) - if tmp < limit: - cdf[i] += tau*nb_gauss_tail_exact(x[i], mu, sigma, tau, tmp) + 0.5 # This is duplicated code from the pdf, but putting it in parallel makes it run faster! 
+ if tau == 0: + cdf[i] = np.nan else: - cdf[i] += tau*nb_gauss_tail_approx(x[i], mu, sigma, tau) + 0.5 + cdf[i] = (tau/(2*abstau)) * erf((tau*(x[i]-mu) )/(np.sqrt(2)*sigma*abstau)) + tmp = ((x[i]-mu)/tau) + ((sigma**2)/(2*tau**2)) + if tmp < limit: + cdf[i] += tau*nb_gauss_tail_exact(x[i], mu, sigma, tau, tmp) + 0.5 # This is duplicated code from the pdf, but putting it in parallel makes it run faster! + else: + cdf[i] += tau*nb_gauss_tail_approx(x[i], mu, sigma, tau) + 0.5 return cdf diff --git a/src/pygama/math/functions/hpge_peak.py b/src/pygama/math/functions/hpge_peak.py index 48151eb53..6adad4dfc 100644 --- a/src/pygama/math/functions/hpge_peak.py +++ b/src/pygama/math/functions/hpge_peak.py @@ -48,7 +48,7 @@ from pygama.math.functions.gauss_on_exgauss import gauss_on_exgauss from pygama.math.functions.step import step -from pygama.math.hpge_peak_fitting import hpge_peak_fwhm +from pygama.math.hpge_peak_fitting import hpge_peak_fwhm, hpge_peak_fwfm, hpge_peak_mode (x_lo, x_hi, n_sig, mu, sigma, frac1, tau, n_bkg, hstep) = range(9) @@ -83,10 +83,12 @@ def hpge_get_fwhm(self, pars: np.ndarray, cov: np.ndarray = None) -> tuple: htail_idx = np.where(req_args == "htail")[0][0] tau_idx = np.where(req_args == "tau")[0][0] # We need to ditch the x_lo and x_hi columns and rows - cov = np.array(cov) - dropped_cov = cov[:, 2:][2:, :] - - return hpge_peak_fwhm(pars[sigma_idx], pars[htail_idx], pars[tau_idx], dropped_cov) + if cov is not None: + cov = np.array(cov) + dropped_cov = cov[:, 2:][2:, :] + return hpge_peak_fwhm(pars[sigma_idx], pars[htail_idx], pars[tau_idx], dropped_cov) + else: + return hpge_peak_fwhm(pars[sigma_idx], pars[htail_idx], pars[tau_idx]) else: if cov is None: @@ -94,5 +96,91 @@ def hpge_get_fwhm(self, pars: np.ndarray, cov: np.ndarray = None) -> tuple: else: return pars[sigma_idx]*2*np.sqrt(2*np.log(2)), np.sqrt(cov[sigma_idx][sigma_idx])*2*np.sqrt(2*np.log(2)) +# This is defined here as to avoid a circular import inside `sum_dists` +def hpge_get_fwfm(self, pars: np.ndarray, frac_max=0.5, cov: np.ndarray = None) -> tuple: + r""" + Get the fwhm value from the output of a fit quickly + Need to overload this to use hpge_peak_fwhm (to avoid a circular import) for when self is an hpge peak, + and otherwise returns 2sqrt(-2log(frac_max))*sigma + + Parameters + ---------- + pars + Array of fit parameters + cov + Optional, array of covariances for calculating error on the fwhm + + + Returns + ------- + fwhm, error + the value of the fwhm and its error + """ + req_args = np.array(self.required_args()) + sigma_idx = np.where(req_args == "sigma")[0][0] + + if ("htail" in req_args) and ("hstep" in req_args): #having both the htail and hstep means it is an exgauss on a step + htail_idx = np.where(req_args == "htail")[0][0] + tau_idx = np.where(req_args == "tau")[0][0] + # We need to ditch the x_lo and x_hi columns and rows + if cov is not None: + cov = np.array(cov) + dropped_cov = cov[:, 2:][2:, :] + return hpge_peak_fwfm(pars[sigma_idx], pars[htail_idx], pars[tau_idx], frac_max=frac_max, cov=dropped_cov) + else: + return hpge_peak_fwfm(pars[sigma_idx], pars[htail_idx], pars[tau_idx], frac_max=frac_max) + + else: + if cov is None: + return pars[sigma_idx]*2*np.sqrt(2*np.log(2)) + else: + return pars[sigma_idx]*2*np.sqrt(-2*np.log(frac_max)), np.sqrt(cov[sigma_idx][sigma_idx])*2*np.sqrt(-2*np.log(frac_max)) + +# This is defined here as to avoid a circular import inside `sum_dists` +def hpge_get_mode(self, pars: np.ndarray, cov: np.ndarray = None) -> tuple: + r""" + Get the fwhm value 
from the output of a fit quickly + Need to overload this to use hpge_peak_fwhm (to avoid a circular import) for when self is an hpge peak, + and otherwise returns 2sqrt(2log(2))*sigma + + Parameters + ---------- + pars + Array of fit parameters + cov + Optional, array of covariances for calculating error on the fwhm + + + Returns + ------- + fwhm, error + the value of the fwhm and its error + """ + req_args = np.array(self.required_args()) + sigma_idx = np.where(req_args == "sigma")[0][0] + mu_idx = np.where(req_args == "mu")[0][0] + + if ("htail" in req_args) and ("hstep" in req_args): #having both the htail and hstep means it is an exgauss on a step + htail_idx = np.where(req_args == "htail")[0][0] + tau_idx = np.where(req_args == "tau")[0][0] + # We need to ditch the x_lo and x_hi columns and rows + if cov is not None: + cov = np.array(cov) + dropped_cov = cov[:, 2:][2:, :] + + return hpge_peak_mode(pars[mu_idx], pars[sigma_idx], pars[htail_idx], + pars[tau_idx], dropped_cov) + else: + return hpge_peak_mode(pars[mu_idx], pars[sigma_idx], pars[htail_idx], + pars[tau_idx]) + + else: + if cov is None: + return pars[mu_idx] + else: + return np.sqrt(cov[mu_idx][mu_idx]) + # hpge_peak.get_fwhm = hpge_get_fwhm +hpge_peak.get_fwfm = hpge_get_fwfm.__get__(hpge_peak) +hpge_peak.get_mode = hpge_get_mode.__get__(hpge_peak) hpge_peak.get_fwhm = hpge_get_fwhm.__get__(hpge_peak) \ No newline at end of file diff --git a/src/pygama/math/functions/polynomial.py b/src/pygama/math/functions/polynomial.py index d888d53dd..808436083 100644 --- a/src/pygama/math/functions/polynomial.py +++ b/src/pygama/math/functions/polynomial.py @@ -6,7 +6,7 @@ @nb.njit(**nb_defaults(parallel=False)) def nb_poly(x: np.ndarray, pars: np.ndarray) -> np.ndarray: r""" - A polynomial function with pars following the polyfit convention. It computes: + A polynomial function with pars following the numpy polynomial convention. It computes: .. math:: @@ -21,7 +21,7 @@ def nb_poly(x: np.ndarray, pars: np.ndarray) -> np.ndarray: x Input data pars - Coefficients of the polynomial, in polyfit convention + Coefficients of the polynomial, in numpy polynomial convention Returns ------- @@ -30,13 +30,14 @@ def nb_poly(x: np.ndarray, pars: np.ndarray) -> np.ndarray: Notes ----- - This follows the :func:`numpy.polyfit` convention of :math:`a_n x^n + ... + a_1 x + a_0` + This follows the :func:`numpy.polynomial` convention of :math:`a_0 + a_1 x +.... 
+ a_n x^n` """ result = x*0 # do x*0 to keep shape of x (scalar or array) if len(pars) == 0: return result - result += pars[-1] + result += pars[0] + y=x for i in nb.prange(1, len(pars)): - result += pars[-i-1]*x - x = x*x + result += pars[i]*x + x = x*y return result diff --git a/src/pygama/math/functions/sum_dists.py b/src/pygama/math/functions/sum_dists.py index 6c4f770a4..fa2e69a70 100644 --- a/src/pygama/math/functions/sum_dists.py +++ b/src/pygama/math/functions/sum_dists.py @@ -500,6 +500,38 @@ def get_mu(self, pars: np.ndarray, cov:np.ndarray = None, errors:np.ndarray = No """ + req_args = np.array(self.required_args()) + mu_idx = np.where(req_args == "mu")[0][0] + mu = pars[mu_idx] + + if errors is not None: + return mu, errors[mu_idx] + elif cov is not None: + return mu, np.sqrt(cov[mu_idx][mu_idx]) + else: + return mu + + def get_mode(self, pars: np.ndarray, cov:np.ndarray = None, errors:np.ndarray = None) -> tuple: + r""" + Get the mode value from the output of a fit quickly + Need to overload this to use hpge_peak_fwhm (to avoid a circular import) for when self is an hpge peak + + Parameters + ---------- + pars + Array of fit parameters + cov + Array of covariances + errors + Array of erros + + Returns + ------- + mu, error + where mu is the fit value, and error is either from the covariance matrix or directly passed + """ + + req_args = np.array(self.required_args()) mu_idx = np.where(req_args == "mu")[0][0] mu = pars[mu_idx] @@ -539,6 +571,34 @@ def get_fwhm(self, pars: np.ndarray, cov: np.ndarray = None) -> tuple: return pars[sigma_idx]*2*np.sqrt(2*np.log(2)) else: return pars[sigma_idx]*2*np.sqrt(2*np.log(2)), np.sqrt(cov[sigma_idx][sigma_idx])*2*np.sqrt(2*np.log(2)) + + def get_fwfm(self, pars: np.ndarray, cov: np.ndarray = None, frac_max=0.5) -> tuple: + r""" + Get the fwfm value from the output of a fit quickly + Need to overload this to use hpge_peak_fwfm (to avoid a circular import) for when self is an hpge peak, + and otherwise returns 2sqrt(2log(2))*sigma + + Parameters + ---------- + pars + Array of fit parameters + cov + Optional, array of covariances for calculating error on the fwhm + + + Returns + ------- + fwhm, error + the value of the fwhm and its error + """ + + req_args = np.array(self.required_args()) + sigma_idx = np.where(req_args == "sigma")[0][0] + + if cov is None: + return pars[sigma_idx]*2*np.sqrt(-2*np.log(frac_max)) + else: + return pars[sigma_idx]*2*np.sqrt(-2*np.log(frac_max)), np.sqrt(cov[sigma_idx][sigma_idx])*2*np.sqrt(-2*np.log(frac_max)) def get_total_events(self, pars: np.ndarray, cov: np.ndarray = None, errors: np.ndarray =None) -> tuple: diff --git a/src/pygama/math/histogram.py b/src/pygama/math/histogram.py index ff3c0cd22..45cb89c7b 100644 --- a/src/pygama/math/histogram.py +++ b/src/pygama/math/histogram.py @@ -91,7 +91,11 @@ def get_hist(data: np.ndarray, bins: Optional[Union[int, np.ndarray, str]] = Non if wts is not None and np.shape(wts) == (): wts = np.full_like(data, wts) # initialize the boost_histogram object - boost_histogram = bh.Histogram(bh.axis.Regular(bins=bins, start=range[0], stop=range[1]), storage=bh.storage.Weight()) + if isinstance(bins, int): + boost_histogram = bh.Histogram(bh.axis.Regular(bins=bins, start=range[0], stop=range[1]), storage=bh.storage.Weight()) + else: + # if bins are specified need to use variable + boost_histogram = bh.Histogram(bh.axis.Variable(bins), storage=bh.storage.Weight()) # create the histogram boost_histogram.fill(data, weight=wts) # read out the histogram, bins, and variances @@ 
-628,3 +632,72 @@ def get_bin_estimates(pars: np.ndarray, func: Callable, bins: np.ndarray, is_int else: # func can be an integral functions return func(bins[1:], *pars, **kwargs) - func(bins[:-1], *pars, **kwargs) + +def get_i_local_extrema(data, delta): + """Get lists of indices of the local maxima and minima of data + + The "local" extrema are those maxima / minima that have heights / depths of + at least delta. + Converted from MATLAB script at: http://billauer.co.il/peakdet.html + + Parameters + ---------- + data : array-like + the array of data within which extrema will be found + delta : scalar + the absolute level by which data must vary (in one direction) about an + extremum in order for it to be tagged + + Returns + ------- + imaxes, imins : 2-tuple ( array, array ) + A 2-tuple containing arrays of variable length that hold the indices of + the identified local maxima (first tuple element) and minima (second + tuple element) + """ + + # prepare output + imaxes, imins = [], [] + + # sanity checks + data = np.asarray(data) + if not np.isscalar(delta): + log.error("get_i_local_extrema: Input argument delta must be a scalar") + return np.array(imaxes), np.array(imins) + if delta <= 0: + log.error(f"get_i_local_extrema: delta ({delta}) must be positive") + return np.array(imaxes), np.array(imins) + + # now loop over data + imax, imin = 0, 0 + find_max = True + for i in range(len(data)): + if data[i] > data[imax]: + imax = i + if data[i] < data[imin]: + imin = i + + if find_max: + # if the sample is less than the current max by more than delta, + # declare the previous one a maximum, then set this as the new "min" + if data[i] < data[imax] - delta: + imaxes.append(imax) + imin = i + find_max = False + else: + # if the sample is more than the current min by more than delta, + # declare the previous one a minimum, then set this as the new "max" + if data[i] > data[imin] + delta: + imins.append(imin) + imax = i + find_max = True + + return np.array(imaxes), np.array(imins) + + +def get_i_local_maxima(data, delta): + return get_i_local_extrema(data, delta)[0] + + +def get_i_local_minima(data, delta): + return get_i_local_extrema(data, delta)[1] diff --git a/src/pygama/math/hpge_peak_fitting.py b/src/pygama/math/hpge_peak_fitting.py index d44700f2d..27e61efca 100644 --- a/src/pygama/math/hpge_peak_fitting.py +++ b/src/pygama/math/hpge_peak_fitting.py @@ -75,7 +75,7 @@ def hpge_peak_peak_bgfree_halfmax(E, sigma, htail, tau, half_max): if cov is None: return upper_hm - lower_hm #calculate uncertainty - #amp set to 1, mu to 0, hstep+bg set to 0 + #nsig set to 1, mu to 0, hstep+nbkg set to 0 pars = [1,0, sigma, htail, tau, 0,0] step_norm = 1 gradmax = hpge_peak_parameter_gradient(Emax, pars, step_norm) @@ -93,6 +93,95 @@ def hpge_peak_peak_bgfree_halfmax(E, sigma, htail, tau, half_max): return upper_hm - lower_hm, fwfm_unc +def hpge_peak_fwfm(sigma, htail, tau, frac_max = 0.5, cov = None): + """ + Return the FWHM of the radford_peak function, ignoring background and step + components. If calculating error also need the normalisation for the step + function. 
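The ``delta``-based extremum search added above in ``math/histogram.py`` can be
exercised with a tiny hand-made array::

    import numpy as np

    from pygama.math.histogram import get_i_local_maxima, get_i_local_minima

    data = np.array([0, 1, 5, 1, 0, 2, 7, 2, 0])

    get_i_local_maxima(data, delta=3)  # array([2, 6])
    get_i_local_minima(data, delta=3)  # array([4])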
+ """ + # optimize this to find max value + def neg_radford_peak_bgfree(E, sigma, htail, tau): + return -gauss_on_exgauss.get_pdf(np.array([E]), 0, sigma, htail, tau)[0] + + if htail<0 or htail>1: + raise ValueError("htail outside allowed limits of 0 and 1") + + res = minimize_scalar( neg_radford_peak_bgfree, + args=(sigma, htail, tau), + bounds=(-sigma-htail, sigma+htail) ) + Emax = res.x + val_frac_max = -neg_radford_peak_bgfree(Emax, sigma, htail, tau)*frac_max + + # root find this to find the half-max energies + def radford_peak_bgfree_fracmax(E, sigma, htail, tau, val_frac_max): + return gauss_on_exgauss.get_pdf(np.array([E]), 0, sigma, htail, tau)[0] - val_frac_max + + try: + lower_hm = brentq( radford_peak_bgfree_fracmax, + -(2.5*sigma/2 + htail*tau), Emax, + args = (sigma, htail, tau, val_frac_max) ) + except: + lower_hm = brentq( radford_peak_bgfree_fracmax, + -(5*sigma + htail*tau), Emax, + args = (sigma, htail, tau, val_frac_max) ) + try: + upper_hm = brentq( radford_peak_bgfree_fracmax, + Emax, 2.5*sigma/2, + args = (sigma, htail, tau, val_frac_max) ) + except: + upper_hm = brentq( radford_peak_bgfree_fracmax, + Emax, 5*sigma, + args = (sigma, htail, tau, val_frac_max) ) + + if cov is None: return upper_hm - lower_hm + #calculate uncertainty + #nsig set to 1, mu to 0, hstep+nbkg set to 0 + pars = [1,0, sigma, htail, tau,0,0] + + rng = np.random.default_rng(1) + par_b = rng.multivariate_normal(pars, cov, size=100) + y_b = np.zeros(len(par_b)) + for p in par_b: + try: + y_b[i] = hpge_peak_fwfm(p[2],p[3],p[4], frac_max=frac_max) + except Exception: + y_b[i] = np.nan + yerr_boot = np.nanstd(y_b, axis=0) + + return upper_hm - lower_hm, yerr_boot + +def hpge_peak_mode(mu, sigma, htail, tau, cov = None): + + if htail<0 or htail>1: + if cov is not None: + return np.nan, np.nan + else: + return np.nan + + try: + mode = brentq(hpge_peak_peakshape_derivative, + mu-2*sigma - htail*tau, mu+2*sigma + htail*tau, + args = ([1,mu,sigma,htail,tau,0,0],1 )) + except ValueError: + try: + mode = brentq(hpge_peak_peakshape_derivative, + mu-4*sigma - htail*tau, mu+4*sigma + htail*tau, + args = ([1,mu,sigma,htail,tau,0,0],1 )) + except ValueError: + mode = np.nan + + if cov is None: return mode + else: + #nsig set to 1, hstep+nbkg set to 0 + pars = np.array([1, mu, sigma, htail, tau,0,0]) + rng = np.random.default_rng(1) + par_b = rng.multivariate_normal(pars, cov, size=10000) + modes = np.array([hpge_peak_mode(p[1],p[2],p[3],p[4]) for p in par_b]) + mode_err_boot = np.nanstd(modes, axis=0) + + return mode, mode_err_boot + + def hpge_peak_peakshape_derivative(E: np.ndarray, pars: np.ndarray, step_norm: float) -> np.ndarray: """ Computes the derivative of the hpge_peak peak shape diff --git a/src/pygama/math/unbinned_fitting.py b/src/pygama/math/unbinned_fitting.py index 1f06219f9..93d58a40d 100644 --- a/src/pygama/math/unbinned_fitting.py +++ b/src/pygama/math/unbinned_fitting.py @@ -50,10 +50,16 @@ def fit_unbinned(func: Callable, data: np.ndarray, guess:np.ndarray =None, else: cost_func = cost.UnbinnedNLL(data, func) - - m = Minuit(cost_func, *guess) + if isinstance(guess, dict): + m = Minuit(cost_func, **guess) + else: + m = Minuit(cost_func, *guess) if bounds is not None: - m.limits = bounds + if isinstance(bounds, dict): + for arg, key in bounds: + m.limits[arg] = key + else: + m.limits = bounds if fixed is not None: for fix in fixed: m.fixed[fix] = True @@ -61,5 +67,5 @@ def fit_unbinned(func: Callable, data: np.ndarray, guess:np.ndarray =None, m.simplex().migrad() else: m.migrad() - m.minos() + 
m.hesse() return m.values, m.errors, m.covariance diff --git a/src/pygama/pargen/AoE_cal.py b/src/pygama/pargen/AoE_cal.py index 227aec4e2..d5bb124bc 100644 --- a/src/pygama/pargen/AoE_cal.py +++ b/src/pygama/pargen/AoE_cal.py @@ -4,755 +4,291 @@ from __future__ import annotations -import json import logging -import os -import pathlib import re from datetime import datetime from typing import Callable -import matplotlib as mpl - -mpl.use("agg") -import lgdo.lh5 as lh5 -import matplotlib.cm as cmx -import matplotlib.colors as mcolors import matplotlib.dates as mdates import matplotlib.pyplot as plt import numpy as np import pandas as pd -from iminuit import Minuit, cost, util -from matplotlib.backends.backend_pdf import PdfPages +from iminuit import Minuit, cost from matplotlib.colors import LogNorm from scipy.stats import chi2 +import pygama.math.binned_fitting as pgf import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -from pygama.math.peak_fitting import nb_erfc -from pygama.pargen.energy_cal import get_i_local_maxima -from pygama.pargen.utils import * +import pygama.pargen.energy_cal as pgc +from pygama.math.distributions import ( + exgauss, + gauss_on_exgauss, + gauss_on_step, + gaussian, + hpge_peak, + nb_erfc, +) +from pygama.math.functions.hpge_peak import hpge_get_fwfm, hpge_get_fwhm, hpge_get_mode +from pygama.math.functions.sum_dists import sum_dists +from pygama.pargen.utils import convert_to_minuit, return_nans log = logging.getLogger(__name__) +(x_lo, x_hi, n_sig, mu, sigma, n_bkg, tau) = range(7) +par_array = [(gaussian, [mu, sigma]), (exgauss, [mu, sigma, tau])] +aoe_peak = sum_dists( + par_array, + [n_sig, n_bkg], + "areas", + parameter_names=["x_lo", "x_hi", "n_sig", "mu", "sigma", "n_bkg", "tau"], + name="aoe_peak", +) + + +(x_lo, x_hi, n_sig, mu, sigma, frac1, tau, n_bkg, tau_bkg) = range(9) +par_array = [ + (gauss_on_exgauss, [mu, sigma, frac1, tau]), + (exgauss, [mu, sigma, tau_bkg]), +] +aoe_peak_with_high_tail = sum_dists( + par_array, + [n_sig, n_bkg], + "areas", + parameter_names=[ + "x_lo", + "x_hi", + "n_sig", + "mu", + "sigma", + "htail", + "tau_sig", + "n_bkg", + "tau", + ], + name="aoe_peak_with_high_tail", +) +aoe_peak_with_high_tail.get_fwfm = hpge_get_fwfm.__get__(aoe_peak_with_high_tail) +aoe_peak_with_high_tail.get_mode = hpge_get_mode.__get__(aoe_peak_with_high_tail) +aoe_peak_with_high_tail.get_fwhm = hpge_get_fwhm.__get__(aoe_peak_with_high_tail) + + +def aoe_peak_guess(func, hist, bins, var, **kwargs): + bin_centers = (bins[:-1] + bins[1:]) / 2 + + mu = bin_centers[np.argmax(hist)] + try: + _, sigma, _ = pgh.get_gaussian_guess(hist, bins) + except Exception: + pars, cov = pgf.gauss_mode_width_max(hist, bins, var, mode_guess=mu, n_bins=20) + _, sigma, _ = pars + ls_guess = 2 * np.sum(hist[(bin_centers > mu) & (bin_centers < (mu + 2.5 * sigma))]) -class PDF: - - """ - Base class for A/E pdfs. 
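# [Illustrative note, not part of the patch] The module-level models above
# replace the PDF classes removed below. `aoe_peak` is a sum_dists object:
# a Gaussian signal plus an exponentially modified Gaussian background that
# share "mu" and "sigma", with areas "n_sig" and "n_bkg", so its required
# arguments are
#
#     ("x_lo", "x_hi", "n_sig", "mu", "sigma", "n_bkg", "tau")
#
# `aoe_peak_with_high_tail` additionally gives the signal an exgauss tail
# controlled by "htail" and "tau_sig", and borrows get_fwhm / get_fwfm /
# get_mode from the hpge_peak helpers.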
- """ - - def pdf(x): - return - - def _replace_values(dic, **kwargs): - for item, value in kwargs.items(): - dic[item] = value - return dic - - -class standard_aoe(PDF): - def pdf( - x: np.array, - n_sig: float, - mu: float, - sigma: float, - n_bkg: float, - tau_bkg: float, - lower_range: float = np.inf, - upper_range: float = np.inf, - components: bool = False, - ) -> np.array: - """ - PDF for A/E consists of a gaussian signal with gaussian tail background - """ - try: - sig = n_sig * pgf.gauss_norm(x, mu, sigma) - bkg = n_bkg * pgf.gauss_tail_norm( - x, mu, sigma, tau_bkg, lower_range, upper_range - ) - except: - sig = np.full_like(x, np.nan) - bkg = np.full_like(x, np.nan) - - if components == False: - return sig + bkg - else: - return sig, bkg - - def extended_pdf( - x: np.array, - n_sig: float, - mu: float, - sigma: float, - n_bkg: float, - tau_bkg: float, - lower_range: float = np.inf, - upper_range: float = np.inf, - components: bool = False, - ) -> tuple(float, np.array): - """ - Extended PDF for A/E consists of a gaussian signal with gaussian tail background - """ - if components == True: - sig, bkg = standard_aoe.pdf( - x, - n_sig, - mu, - sigma, - n_bkg, - tau_bkg, - lower_range, - upper_range, - components, - ) - return n_sig + n_bkg, sig, bkg - else: - return n_sig + n_bkg, standard_aoe.pdf( - x, - n_sig, - mu, - sigma, - n_bkg, - tau_bkg, - lower_range, - upper_range, - components, - ) - - def guess(hist, bins, var, **kwargs): - bin_centers = (bins[:-1] + bins[1:]) / 2 - - mu = bin_centers[np.argmax(hist)] - try: - _, sigma, _ = pgh.get_gaussian_guess(hist, bins) - except: - pars, cov = pgf.gauss_mode_width_max( - hist, bins, var, mode_guess=mu, n_bins=20 - ) - _, sigma, _ = pars - ls_guess = 2 * np.sum( - hist[(bin_centers > mu) & (bin_centers < (mu + 2.5 * sigma))] - ) - + if func == aoe_peak: guess_dict = { + "x_lo": bins[0], + "x_hi": bins[-1], "n_sig": ls_guess, "mu": mu, "sigma": sigma, "n_bkg": np.sum(hist) - ls_guess, - "tau_bkg": 0.1, - "lower_range": np.nanmin(bins), - "upper_range": np.nanmax(bins), - "components": 0, + "tau": 0.1, } for key, guess in guess_dict.items(): if np.isnan(guess): guess_dict[key] = 0 - return standard_aoe._replace_values(guess_dict, **kwargs) - - def bounds(guess, **kwargs): - bounds_dict = { - "n_sig": (0, None), - "mu": (None, None), - "sigma": (0, None), - "n_bkg": (0, None), - "tau_bkg": (0, None), - "lower_range": (None, None), - "upper_range": (None, None), - "components": (None, None), - } - - return [ - bound - for field, bound in standard_aoe._replace_values( - bounds_dict, **kwargs - ).items() - ] - - def fixed(**kwargs): - fixed_dict = { - "n_sig": False, - "mu": False, - "sigma": False, - "n_bkg": False, - "tau_bkg": False, - "lower_range": True, - "upper_range": True, - "components": True, - } - - return [ - fixed - for field, fixed in standard_aoe._replace_values( - fixed_dict, **kwargs - ).items() - ] - - def width(pars, errs, cov): - return pars["sigma"], errs["sigma"] - - def centroid(pars, errs, cov): - return pars["mu"], errs["mu"] - - -class standard_aoe_with_high_tail(PDF): - def pdf( - x: np.array, - n_sig: float, - mu: float, - sigma: float, - htail: float, - tau_sig: float, - n_bkg: float, - tau_bkg: float, - lower_range: float = np.inf, - upper_range: float = np.inf, - components: bool = False, - ) -> np.array: - """ - PDF for A/E consists of a gaussian signal with tail with gaussian tail background - """ - try: - sig = n_sig * ( - (1 - htail) * pgf.gauss_norm(x, mu, sigma) - + htail - * pgf.gauss_tail_norm(x, 
mu, sigma, tau_sig, lower_range, upper_range) - ) - bkg = n_bkg * pgf.gauss_tail_norm( - x, mu, sigma, tau_bkg, lower_range, upper_range - ) - except: - sig = np.full_like(x, np.nan) - bkg = np.full_like(x, np.nan) - - if components == False: - return sig + bkg - else: - return sig, bkg - - def extended_pdf( - x: np.array, - n_sig: float, - mu: float, - sigma: float, - htail: float, - tau_sig: float, - n_bkg: float, - tau_bkg: float, - lower_range: float = np.inf, - upper_range: float = np.inf, - components: bool = False, - ) -> tuple(float, np.array): - """ - Extended PDF for A/E consists of a gaussian signal with gaussian tail background - """ - if components == True: - sig, bkg = standard_aoe_with_high_tail.pdf( - x, - n_sig, - mu, - sigma, - htail, - tau_sig, - n_bkg, - tau_bkg, - lower_range, - upper_range, - components, - ) - return n_sig + n_bkg, sig, bkg - else: - return n_sig + n_bkg, standard_aoe_with_high_tail.pdf( - x, - n_sig, - mu, - sigma, - htail, - tau_sig, - n_bkg, - tau_bkg, - lower_range, - upper_range, - components, - ) - - def guess(hist, bins, var, **kwargs): - bin_centers = (bins[:-1] + bins[1:]) / 2 - mu = bin_centers[np.argmax(hist)] - try: - _, sigma, _ = pgh.get_gaussian_guess(hist, bins) - except: - pars, cov = pgf.gauss_mode_width_max( - hist, bins, var, mode_guess=mu, n_bins=20 - ) - _, sigma, _ = pars - ls_guess = 2 * np.sum( - hist[(bin_centers > mu) & (bin_centers < (mu + 2.5 * sigma))] - ) - + elif func == aoe_peak_with_high_tail: guess_dict = { + "x_lo": bins[0], + "x_hi": bins[-1], "n_sig": ls_guess, "mu": mu, "sigma": sigma, "htail": 0.1, "tau_sig": -0.1, "n_bkg": np.sum(hist) - ls_guess, - "tau_bkg": 0.1, - "lower_range": np.nanmin(bins), - "upper_range": np.nanmax(bins), - "components": 0, + "tau": 0.1, } for key, guess in guess_dict.items(): if np.isnan(guess): guess_dict[key] = 0 - return standard_aoe_with_high_tail._replace_values(guess_dict, **kwargs) - - def bounds(guess, **kwargs): - bounds_dict = { - "n_sig": (0, None), - "mu": (None, None), - "sigma": (0, None), - "htail": (0, 1), - "tau_sig": (None, 0), - "n_bkg": (0, None), - "tau_bkg": (0, None), - "lower_range": (None, None), - "upper_range": (None, None), - "components": (None, None), - } - - return [ - bound - for field, bound in standard_aoe_with_high_tail._replace_values( - bounds_dict, **kwargs - ).items() - ] - - def fixed(**kwargs): - fixed_dict = { - "n_sig": False, - "mu": False, - "sigma": False, - "htail": False, - "tau_sig": False, - "n_bkg": False, - "tau_bkg": False, - "lower_range": True, - "upper_range": True, - "components": True, - } - - return [ - fixed - for field, fixed in standard_aoe_with_high_tail._replace_values( - fixed_dict, **kwargs - ).items() - ] - - def width(pars, errs, cov): - fwhm, fwhm_err = pgf.radford_fwhm( - pars[2], pars[3], np.abs(pars[4]), cov=cov[:7, :7] - ) - return fwhm / 2.355, fwhm_err / 2.355 - - def centroid(pars, errs, cov): - return pars["mu"], errs["mu"] - - -class standard_aoe_bkg(PDF): - def pdf( - x: np.array, - n_events: float, - mu: float, - sigma: float, - tau_bkg: float, - lower_range: float = np.inf, - upper_range: float = np.inf, - ) -> np.array: - """ - PDF for A/E consists of a gaussian signal with tail with gaussian tail background - """ - try: - sig = n_events * pgf.gauss_tail_norm( - x, mu, sigma, tau_bkg, lower_range, upper_range - ) - except: - sig = np.full_like(x, np.nan) - - return sig - - def extended_pdf( - x: np.array, - n_events: float, - mu: float, - sigma: float, - tau_bkg: float, - lower_range: float = np.inf, - 
upper_range: float = np.inf, - ) -> tuple(float, np.array): - """ - Extended PDF for A/E consists of a gaussian signal with gaussian tail background - """ - return n_events, standard_aoe_bkg.pdf( - x, n_events, mu, sigma, tau_bkg, lower_range, upper_range - ) - - def guess(hist, bins, var, **kwargs): - bin_centers = (bins[:-1] + bins[1:]) / 2 - - mu = bin_centers[np.argmax(hist)] - try: - _, sigma, _ = pgh.get_gaussian_guess(hist, bins) - except: - pars, cov = pgf.gauss_mode_width_max( - hist, bins, var, mode_guess=mu, n_bins=20 - ) - _, sigma, _ = pars - ls_guess = 2 * np.sum( - hist[(bin_centers > mu) & (bin_centers < (mu + 2.5 * sigma))] - ) - + elif func == exgauss: guess_dict = { - "n_events": np.sum(hist) - ls_guess, + "x_lo": bins[0], + "x_hi": bins[-1], + "area": np.sum(hist) - ls_guess, "mu": mu, "sigma": sigma, - "tau_bkg": 0.1, - "lower_range": np.nanmin(bins), - "upper_range": np.nanmax(bins), + "tau": 0.1, } for key, guess in guess_dict.items(): if np.isnan(guess): guess_dict[key] = 0 - return standard_aoe_bkg._replace_values(guess_dict, **kwargs) - - def bounds(guess, **kwargs): - bounds_dict = { - "n_events": (0, None), - "mu": (None, None), - "sigma": (0, None), - "tau_bkg": (0, None), - "lower_range": (None, None), - "upper_range": (None, None), - } - - return [ - bound - for field, bound in standard_aoe_bkg._replace_values( - bounds_dict, **kwargs - ).items() - ] - - def fixed(**kwargs): - fixed_dict = { - "n_bkg": False, - "mu": False, - "sigma": False, - "tau_bkg": False, - "lower_range": True, - "upper_range": True, + elif func == gaussian: + guess_dict = { + "x_lo": bins[0], + "x_hi": bins[-1], + "area": ls_guess, + "mu": mu, + "sigma": sigma, } - - return [ - fixed - for field, fixed in standard_aoe_bkg._replace_values( - fixed_dict, **kwargs - ).items() - ] - - -class gaussian(PDF): - def pdf(x: np.array, n_events: float, mu: float, sigma: float) -> np.array: - """ - PDF for A/E consists of a gaussian signal with tail with gaussian tail background - """ - try: - sig = n_events * pgf.gauss_norm(x, mu, sigma) - except: - sig = np.full_like(x, np.nan) - - return sig - - def extended_pdf( - x: np.array, n_events: float, mu: float, sigma: float - ) -> tuple(float, np.array): - """ - Extended PDF for A/E consists of a gaussian signal with gaussian tail background - """ - return n_events, gaussian.pdf(x, n_events, mu, sigma) - - def guess(hist, bins, var, **kwargs): - bin_centers = (bins[:-1] + bins[1:]) / 2 - mu = bin_centers[np.argmax(hist)] - try: - _, sigma, _ = pgh.get_gaussian_guess(hist, bins) - except: - pars, cov = pgf.gauss_mode_width_max( - hist, bins, var, mode_guess=mu, n_bins=20 - ) - _, sigma, _ = pars - ls_guess = 2 * np.sum( - hist[(bin_centers > mu) & (bin_centers < (mu + 2.5 * sigma))] - ) - - guess_dict = {"n_events": ls_guess, "mu": mu, "sigma": sigma} for key, guess in guess_dict.items(): if np.isnan(guess): guess_dict[key] = 0 - return gaussian._replace_values(guess_dict, **kwargs) + for item, value in kwargs.items(): + guess_dict[item] = value - def bounds(gpars, **kwargs): - bounds_dict = {"n_events": (0, None), "mu": (None, None), "sigma": (0, None)} + return convert_to_minuit(guess_dict, func).values - return [ - bound - for field, bound in gaussian._replace_values(bounds_dict, **kwargs).items() - ] - def fixed(**kwargs): - fixed_dict = { - "n_events": False, - "mu": False, - "sigma": False, +def aoe_peak_bounds(func, guess, **kwargs): + if func == aoe_peak: + bounds_dict = { + "x_lo": (None, None), + "x_hi": (None, None), + "n_sig": (0, None), + 
"mu": (guess["x_lo"], guess["x_hi"]), + "sigma": (0, None), + "n_bkg": (0, None), + "tau": (0, None), } - - return [ - fixed - for field, fixed in gaussian._replace_values(fixed_dict, **kwargs).items() - ] - - -class drift_time_distribution(PDF): - def pdf( - x, - n_sig1, - mu1, - sigma1, - htail1, - tau1, - n_sig2, - mu2, - sigma2, - htail2, - tau2, - components, - ): - gauss1 = n_sig1 * pgf.gauss_with_tail_pdf(x, mu1, sigma1, htail1, tau1) - gauss2 = n_sig2 * pgf.gauss_with_tail_pdf(x, mu2, sigma2, tau2, htail2) - if components is True: - return gauss1, gauss2 - else: - return gauss1 + gauss2 - - def extended_pdf( - x, - n_sig1, - mu1, - sigma1, - htail1, - tau1, - n_sig2, - mu2, - sigma2, - htail2, - tau2, - components, - ): - if components is True: - gauss1, gauss2 = drift_time_distribution.pdf( - x, - n_sig1, - mu1, - sigma1, - htail1, - tau1, - n_sig2, - mu2, - sigma2, - htail2, - tau2, - components, - ) - return n_sig1 + n_sig2, gauss1, gauss2 - - else: - return n_sig1 + n_sig2, drift_time_distribution.pdf( - x, - n_sig1, - mu1, - sigma1, - htail1, - tau1, - n_sig2, - mu2, - sigma2, - htail2, - tau2, - components, - ) - - def guess(hist: np.array, bins: np.array, var: np.array, **kwargs) -> list: - """ - Guess for fitting dt spectrum - """ - bcs = pgh.get_bin_centers(bins) - mus = get_i_local_maxima(hist / (np.sqrt(var) + 10**-99), 5) - if len(mus) > 2: - mus = get_i_local_maxima(hist / (np.sqrt(var) + 10**-99), 8) - elif len(mus) < 2: - mus = get_i_local_maxima(hist / (np.sqrt(var) + 10**-99), 3) - mu1 = bcs[mus[0]] - mu2 = bcs[mus[-1]] - - pars, cov = pgf.gauss_mode_width_max( - hist, - bins, - var=None, - mode_guess=mu1, - n_bins=10, - cost_func="Least Squares", - inflate_errors=False, - gof_method="var", - ) - mu1, sigma1, amp = pars - ix = np.where(bcs < mu1 + 3 * sigma1)[0][-1] - n_sig1 = np.sum(hist[:ix]) - pars2, cov2 = pgf.gauss_mode_width_max( - hist, - bins, - var=None, - mode_guess=mu2, - n_bins=10, - cost_func="Least Squares", - inflate_errors=False, - gof_method="var", - ) - mu2, sigma2, amp2 = pars2 - - guess_dict = { - "n_sig1": n_sig1, - "mu1": mu1, - "sigma1": sigma1, - "htail1": 0.5, - "tau1": 0.1, - "n_sig2": np.sum(hist) - n_sig1, - "mu2": mu2, - "sigma2": sigma2, - "htail2": 0.5, - "tau2": 0.1, - "components": 0, + elif func == aoe_peak_with_high_tail: + bounds_dict = { + "x_lo": (None, None), + "x_hi": (None, None), + "n_sig": (0, None), + "mu": (guess["x_lo"], guess["x_hi"]), + "sigma": (0, None), + "htail": (0, 1), + "tau_sig": (None, 0), + "n_bkg": (0, None), + "tau": (0, None), } - for key, guess in guess_dict.items(): - if np.isnan(guess): - guess_dict[key] = 0 - - return drift_time_distribution._replace_values(guess_dict, **kwargs) - - def bounds(guess, **kwargs): + elif func == exgauss: + bounds_dict = { + "x_lo": (None, None), + "x_hi": (None, None), + "area": (0, None), + "mu": (guess["x_lo"], guess["x_hi"]), + "sigma": (0, None), + "tau": (0, None), + } + elif func == gaussian: bounds_dict = { - "n_sig1": (0, None), - "mu1": (None, None), - "sigma1": (0, None), - "htail1": (0, 1), - "tau1": (None, None), - "n_sig2": (0, None), - "mu2": (None, None), - "sigma2": (0, None), - "htail2": (0, 1), - "tau2": (None, None), - "components": (None, None), + "x_lo": (None, None), + "x_hi": (None, None), + "area": (0, None), + "mu": (guess["x_lo"], guess["x_hi"]), + "sigma": (0, None), } - return [ - bound - for field, bound in drift_time_distribution._replace_values( - bounds_dict, **kwargs - ).items() - ] + for item, value in kwargs.items(): + bounds_dict[item] 
= value + return bounds_dict - def fixed(**kwargs): - fixed_dict = { - "n_sig1": False, - "mu1": False, - "sigma1": False, - "htail1": False, - "tau1": False, - "n_sig2": False, - "mu2": False, - "sigma2": False, - "htail2": False, - "tau2": False, - "components": True, - } - return [ - fixed - for field, fixed in drift_time_distribution._replace_values( - fixed_dict, **kwargs - ).items() - ] +def aoe_peak_fixed(func, **kwargs): + if func == aoe_peak: + fixed = ["x_lo", "x_hi"] + elif func == aoe_peak_with_high_tail: + fixed = ["x_lo", "x_hi"] + elif func == exgauss: + fixed = ["x_lo", "x_hi"] + elif func == gaussian: + fixed = ["x_lo", "x_hi"] + mask = ~np.in1d(func.required_args(), fixed) + return fixed, mask -class pol1: +class Pol1: + @staticmethod def func(x, a, b): return x * a + b + @staticmethod def string_func(input_param): return f"{input_param}*a+b" + @staticmethod def guess(bands, means, mean_errs): return [-1e-06, 5e-01] -class sigma_fit: +class SigmaFit: + @staticmethod def func(x, a, b, c): return np.sqrt(a + (b / (x + 10**-99)) ** c) + @staticmethod def string_func(input_param): return f"(a+(b/({input_param}+10**-99))**c)**(0.5)" + @staticmethod def guess(bands, sigmas, sigma_errs): return [np.nanpercentile(sigmas, 50) ** 2, 2, 2] -class sigmoid_fit: +class SigmoidFit: + @staticmethod def func(x, a, b, c, d): return (a + b * x) * nb_erfc(c * x + d) + @staticmethod def guess(xs, ys, y_errs): return [np.nanmax(ys) / 2, 0, 1, 1.5] def unbinned_aoe_fit( - aoe: np.array, pdf=standard_aoe, display: int = 0, verbose: bool = False + aoe: np.array, + pdf=aoe_peak, + display: int = 0, ) -> tuple(np.array, np.array): """ - Fitting function for A/E, first fits just a gaussian before using the full pdf to fit + Fitting function for A/E, first fits just a Gaussian before using the full pdf to fit if fails will return NaN values + + Args: + aoe: np.array + A/E values + pdf: PDF + PDF to fit to + display: int + Level of display + + Returns: tuple(np.array, np.array) + Tuple of fit values and errors """ + if not isinstance(aoe, np.ndarray): + aoe = np.array(aoe) + + bin_width = ( + 2 + * (np.nanpercentile(aoe, 75) - np.nanpercentile(aoe, 25)) + * len(aoe) ** (-1 / 3) + ) + nbins = int(np.ceil((np.nanmax(aoe) - np.nanmin(aoe)) / bin_width)) hist, bins, var = pgh.get_hist(aoe, bins=500) - gpars = gaussian.guess(hist, bins, var) + gpars = aoe_peak_guess(gaussian, hist, bins, var) c1_min = gpars["mu"] - 2 * gpars["sigma"] c1_max = gpars["mu"] + 3 * gpars["sigma"] # Initial fit just using Gaussian - c1 = cost.UnbinnedNLL(aoe[(aoe < c1_max) & (aoe > c1_min)], gaussian.pdf) + c1 = cost.ExtendedUnbinnedNLL( + aoe[(aoe < c1_max) & (aoe > c1_min)], gaussian.pdf_ext + ) - m1 = Minuit(c1, **gpars) - m1.limits = [ - (0, len(aoe[(aoe < c1_max) & (aoe > c1_min)])), - (gpars["mu"] * 0.8, gpars["mu"] * 1.2), - (0.8 * gpars["sigma"], gpars["sigma"] * 1.2), - ] - m1.fixed = gaussian.fixed() - m1.migrad() + m1 = Minuit(c1, *gpars) - if verbose: - print(m1) + bounds = aoe_peak_bounds(gaussian, gpars) + for arg, val in bounds.items(): + m1.limits[arg] = val + for fix in aoe_peak_fixed(gaussian)[0]: + m1.fixed[fix] = True + m1.migrad() # Range to fit over, below this tail behaviour more exponential, few events above fmin = m1.values["mu"] - 15 * m1.values["sigma"] @@ -761,80 +297,89 @@ def unbinned_aoe_fit( fmax_bkg = m1.values["mu"] - 5 * m1.values["sigma"] fmax = m1.values["mu"] + 5 * m1.values["sigma"] - n_bkg_guess = len(aoe[(aoe < fmax) & (aoe > fmin)]) - m1.values["n_events"] + n_bkg_guess = len(aoe[(aoe < 
fmax) & (aoe > fmin)]) - m1.values["area"] - bkg_guess = standard_aoe_bkg.guess( + bkg_guess = aoe_peak_guess( + exgauss, hist, bins, var, - n_events=n_bkg_guess, + area=n_bkg_guess, mu=m1.values["mu"], sigma=m1.values["sigma"], - lower_range=fmin, - upper_range=fmax_bkg, + x_lo=fmin, + x_hi=fmax_bkg, ) - c2 = cost.ExtendedUnbinnedNLL( - aoe[(aoe < fmax_bkg) & (aoe > fmin)], standard_aoe_bkg.extended_pdf - ) - m2 = Minuit(c2, **bkg_guess) - m2.fixed = standard_aoe_bkg.fixed(mu=True) - m2.limits = standard_aoe_bkg.bounds( - bkg_guess, n_events=(0, 2 * len(aoe[(aoe < fmax_bkg) & (aoe > fmin)])) - ) + c2 = cost.ExtendedUnbinnedNLL(aoe[(aoe < fmax_bkg) & (aoe > fmin)], exgauss.pdf_ext) + m2 = Minuit(c2, *bkg_guess) + + bounds = aoe_peak_bounds(exgauss, bkg_guess) + + for arg, val in bounds.items(): + m2.limits[arg] = val + for fix in aoe_peak_fixed(exgauss)[0]: + m2.fixed[fix] = True m2.simplex().migrad() m2.hesse() - x0 = pdf.guess( + x0 = aoe_peak_guess( + pdf, hist, bins, var, - n_sig=m1.values["n_events"], + n_sig=m1.values["area"], mu=m1.values["mu"], sigma=m1.values["sigma"], - n_bkg=m2.values["n_events"], - tau_bkg=m2.values["tau_bkg"], - lower_range=fmin, - upper_range=fmax, - ) - if verbose: - print(x0) - - # Full fit using gaussian signal with gaussian tail background - c = cost.ExtendedUnbinnedNLL(aoe[(aoe < fmax) & (aoe > fmin)], pdf.extended_pdf) - m = Minuit(c, **x0) - m.limits = pdf.bounds( - x0, - n_sig=(0, 2 * len(aoe[(aoe < fmax) & (aoe > fmin)])), - n_bkg=(0, 2 * len(aoe[(aoe < fmax) & (aoe > fmin)])), + n_bkg=m2.values["area"], + tau=m2.values["tau"], + x_lo=fmin, + x_hi=fmax, ) - m.fixed = pdf.fixed() + + bounds = aoe_peak_bounds(pdf, x0) + + # Full fit using Gaussian signal with Gaussian tail background + c = cost.ExtendedUnbinnedNLL(aoe[(aoe < fmax) & (aoe > fmin)], pdf.pdf_ext) + m = Minuit(c, *x0) + for arg, val in bounds.items(): + m.limits[arg] = val + fixed, mask = aoe_peak_fixed(pdf) + for fix in fixed: + m.fixed[fix] = True m.migrad() m.hesse() - if verbose: - print(m) - if np.isnan(m.errors).all(): try: m.simplex.migrad() m.hesse() - except: + except Exception: return return_nans(pdf) if display > 1: + aoe = aoe[(aoe < fmax) & (aoe > fmin)] + bin_width = ( + 2 + * (np.nanpercentile(aoe, 75) - np.nanpercentile(aoe, 25)) + * len(aoe) ** (-1 / 3) + ) + nbins = int(np.ceil((np.nanmax(aoe) - np.nanmin(aoe)) / bin_width)) # *5 + plt.figure() xs = np.linspace(fmin, fmax, 1000) - counts, bins, bars = plt.hist( - aoe[(aoe < fmax) & (aoe > fmin)], bins=200, histtype="step", label="Data" - ) + counts, bins, bars = plt.hist(aoe, bins=nbins, histtype="step", label="Data") dx = np.diff(bins) - plt.plot(xs, pdf.pdf(xs, *m.values) * dx[0], label="Full fit") - sig, bkg = pdf.pdf(xs, *m.values[:-1], True) + plt.plot(xs, pdf.get_pdf(xs, *m.values) * dx[0], label="Full fit") + pdf.components = True + sig, bkg = pdf.get_pdf(xs, *m.values) + pdf.components = False plt.plot(xs, sig * dx[0], label="Signal") plt.plot(xs, bkg * dx[0], label="Background") - plt.plot(xs, gaussian.pdf(xs, *m1.values) * dx[0], label="Initial Gaussian") - plt.plot(xs, standard_aoe_bkg.pdf(xs, *m2.values) * dx[0], label="Bkg guess") + plt.plot( + xs, gaussian.pdf_ext(xs, *m1.values)[1] * dx[0], label="Initial Gaussian" + ) + plt.plot(xs, exgauss.pdf_ext(xs, *m2.values)[1] * dx[0], label="Bkg guess") plt.xlabel("A/E") plt.ylabel("Counts") plt.legend(loc="upper left") @@ -856,257 +401,129 @@ def unbinned_aoe_fit( return m.values, m.errors, m.covariance -def fit_time_means(tstamps, means, reses): +def 
fit_time_means(tstamps, means, sigmas): + """ + Fit the time dependence of the means of the A/E distribution + + Args: + + tstamps: np.array + Timestamps of the data + means: np.array + Means of the A/E distribution + sigmas: np.array + Sigmas of the A/E distribution + + Returns: dict + Dictionary of the time dependence of the means + """ out_dict = {} current_tstamps = [] current_means = [] - current_reses = [] + current_sigmas = [] rolling_mean = means[ np.where( - (np.abs(np.diff(means)) < (0.4 * np.array(reses)[1:])) - & (~np.isnan(np.abs(np.diff(means)) < (0.4 * np.array(reses)[1:]))) + (np.abs(np.diff(means)) < (0.4 * np.array(sigmas)[1:])) + & (~np.isnan(np.abs(np.diff(means)) < (0.4 * np.array(sigmas)[1:]))) )[0][0] ] for i, tstamp in enumerate(tstamps): if ( ( - np.abs(means[i] - rolling_mean) > 0.4 * reses[i] + np.abs(means[i] - rolling_mean) > 0.4 * sigmas[i] and np.abs(means[i] - rolling_mean) > rolling_mean * 0.01 ) or np.isnan(means[i]) - or np.isnan(reses[i]) + or np.isnan(sigmas[i]) ): if i + 1 == len(means): out_dict[tstamp] = np.nan else: - if (np.abs(means[i + 1] - means[i]) < 0.4 * reses[i + 1]) and not ( + if (np.abs(means[i + 1] - means[i]) < 0.4 * sigmas[i + 1]) and not ( np.isnan(means[i]) or np.isnan(means[i + 1]) - or np.isnan(reses[i]) - or np.isnan(reses[i + 1]) + or np.isnan(sigmas[i]) + or np.isnan(sigmas[i + 1]) ): for ts in current_tstamps: out_dict[ts] = rolling_mean rolling_mean = means[i] current_means = [means[i]] current_tstamps = [tstamp] - current_reses = [reses[i]] + current_sigmas = [sigmas[i]] else: out_dict[tstamp] = np.nan else: current_tstamps.append(tstamp) current_means.append(means[i]) - current_reses.append(reses[i]) + current_sigmas.append(sigmas[i]) rolling_mean = np.average( - current_means, weights=1 / np.array(current_reses) + current_means, weights=1 / np.array(current_sigmas) ) for tstamp in current_tstamps: out_dict[tstamp] = rolling_mean return out_dict -def energy_guess(hist, bins, var, func_i, peak, eres, fit_range): +def energy_guess(energy, func_i, fit_range=None, bin_width=1, peak=None, eres=None): """ Simple guess for peak fitting """ - if func_i == pgf.extended_radford_pdf: - bin_cs = (bins[1:] + bins[:-1]) / 2 - sigma = eres / 2.355 - i_0 = np.nanargmax(hist) - mu = peak - height = hist[i_0] - bg0 = np.mean(hist[-10:]) - step = np.mean(hist[:10]) - bg0 - htail = 1.0 / 5 - tau = 0.5 * sigma - - hstep = step / (bg0 + np.mean(hist[:10])) - dx = np.diff(bins)[0] - n_bins_range = int((3 * sigma) // dx) - nsig_guess = np.sum(hist[i_0 - n_bins_range : i_0 + n_bins_range]) - ( - (n_bins_range * 2) * (bg0 - step / 2) + if fit_range is None: + fit_range = (np.nanmin(energy), np.nanmax(energy)) + if func_i == hpge_peak or func_i == gauss_on_step: + parguess = pgc.get_hpge_energy_peak_par_guess( + energy, func_i, fit_range=fit_range ) - nbkg_guess = np.sum(hist) - nsig_guess - if nbkg_guess < 0: - nbkg_guess = 0 - if nsig_guess < 0: - nsig_guess = 0 - parguess = [ - nsig_guess, - mu, - sigma, - htail, - tau, - nbkg_guess, - hstep, - fit_range[0], - fit_range[1], - 0, - ] - for i, guess in enumerate(parguess): - if np.isnan(guess): - parguess[i] = 0 - return parguess - elif func_i == pgf.extended_gauss_step_pdf: - mu = peak - sigma = eres / 2.355 - i_0 = np.argmax(hist) - bg = np.mean(hist[-10:]) - step = bg - np.mean(hist[:10]) - hstep = step / (bg + np.mean(hist[:10])) - dx = np.diff(bins)[0] - n_bins_range = int((3 * sigma) // dx) - nsig_guess = np.sum(hist[i_0 - n_bins_range : i_0 + n_bins_range]) - nbkg_guess = np.sum(hist) - 
nsig_guess - if nbkg_guess < 0: - nbkg_guess = 0 - if nsig_guess < 0: - nsig_guess = 0 - - parguess = [ - nsig_guess, - mu, - sigma, - nbkg_guess, - hstep, - fit_range[0], - fit_range[1], - 0, - ] + if peak is not None: + parguess["mu"] = peak + + if eres is not None: + parguess["sigma"] = eres / 2.355 + for i, guess in enumerate(parguess): if np.isnan(guess): parguess[i] = 0 - return parguess + else: + log.error(f"energy_guess not implemented for {func_i}") + return None + return parguess -def unbinned_energy_fit( - energy: np.array, - peak: float, - eres: list, - simplex=False, - guess=None, - display=0, - verbose: bool = False, -) -> tuple(np.array, np.array): + +def fix_all_but_nevents(func): """ - Fitting function for energy peaks used to calculate survival fractions + Returns: Sequence list of fixed indexes for fitting and mask for parameters """ - try: - hist, bins, var = pgh.get_hist( - energy, dx=0.5, range=(np.nanmin(energy), np.nanmax(energy)) - ) - except ValueError: - pars, errs, cov = return_nans(pgf.radford_pdf) - return pars, errs - sigma = eres / 2.355 - if guess is None: - x0 = energy_guess( - hist, - bins, - var, - pgf.extended_gauss_step_pdf, - peak, - eres, - (np.nanmin(energy), np.nanmax(energy)), - ) - c = cost.ExtendedUnbinnedNLL(energy, pgf.extended_gauss_step_pdf) - m = Minuit(c, *x0) - m.limits = [ - (0, 2 * np.sum(hist)), - (peak - 1, peak + 1), - (0, None), - (0, 2 * np.sum(hist)), - (-1, 1), - (None, None), - (None, None), - (None, None), - ] - m.fixed[-3:] = True - m.simplex().migrad() - m.hesse() - x0 = m.values[:3] - x0 += [0.2, 0.2 * m.values[2]] - x0 += m.values[3:] - if verbose: - print(m) - bounds = [ - (0, 2 * np.sum(hist)), - (peak - 1, peak + 1), - (0, None), - (0, 1), - (0, None), - (0, 2 * np.sum(hist)), - (-1, 1), - (None, None), - (None, None), - (None, None), - ] - fixed = [7, 8, 9] - else: - x0 = guess - x1 = energy_guess( - hist, - bins, - var, - pgf.extended_radford_pdf, - peak, - eres, - (np.nanmin(energy), np.nanmax(energy)), - ) - x0[0] = x1[0] - x0[5] = x1[5] - bounds = [ - (0, 2 * np.sum(hist)), - (guess[1] - 0.5, guess[1] + 0.5), - sorted((0.8 * guess[2], 1.2 * guess[2])), - sorted((0.8 * guess[3], 1.2 * guess[3])), - sorted((0.8 * guess[4], 1.2 * guess[4])), - (0, 2 * np.sum(hist)), - sorted((0.8 * guess[6], 1.2 * guess[6])), - (None, None), - (None, None), - (None, None), - ] - fixed = [1, 2, 3, 4, 6, 7, 8, 9] - if len(x0) == 0: - pars, errs, cov = return_nans(pgf.extended_radford_pdf) - return pars, errs - - if verbose: - print(x0) - c = cost.ExtendedUnbinnedNLL(energy, pgf.extended_radford_pdf) - m = Minuit(c, *x0) - m.limits = bounds - for fix in fixed: - m.fixed[fix] = True - if simplex == True: - m.simplex().migrad() + + if func == gauss_on_step: + # pars are: n_sig, mu, sigma, n_bkg, hstep, lower, upper, components + fixed = ["x_lo", "x_hi", "mu", "sigma", "hstep"] + + elif func == hpge_peak: + # pars are: , components + fixed = ["x_lo", "x_hi", "mu", "sigma", "htail", "tau", "hstep"] + else: - m.migrad() + log.error(f"get_hpge_E_fixed not implemented for {func}") + return None, None + mask = ~np.in1d(func.required_args(), fixed) + return fixed, mask - m.hesse() - if verbose: - print(m) - if display > 1: - plt.figure() - bcs = (bins[1:] + bins[:-1]) / 2 - plt.step(bcs, hist, where="mid") - plt.plot(bcs, pgf.radford_pdf(bcs, *x0) * np.diff(bcs)[0]) - plt.plot(bcs, pgf.radford_pdf(bcs, *m.values) * np.diff(bcs)[0]) - plt.show() - if not np.isnan(m.errors[:-3]).all(): - return m.values, m.errors +def get_bounds(func, parguess): + 
if func == hpge_peak or func == gauss_on_step: + bounds = pgc.get_hpge_energy_bounds(func, parguess) + + bounds["mu"] = (parguess["mu"] - 1, parguess["mu"] + 1) + bounds["n_sig"] = (0, 2 * (parguess["n_sig"] + parguess["n_bkg"])) + bounds["n_bkg"] = (0, 2 * (parguess["n_sig"] + parguess["n_bkg"])) + else: - try: - m.simplex().migrad() - m.minos() - if not np.isnan(m.errors[:-3]).all(): - return m.values, m.errors - except: - pars, errs, cov = return_nans(pgf.extended_radford_pdf) - return pars, errs + log.error(f"get_bounds not implemented for {func}") + return None + return bounds def get_peak_label(peak: float) -> str: @@ -1122,22 +539,60 @@ def get_peak_label(peak: float) -> str: return "Tl FEP @" +def update_guess(func, parguess, energies): + if func == gauss_on_step: + total_events = len(energies) + parguess["n_sig"] = len( + energies[ + (energies > parguess["mu"] - 2 * parguess["sigma"]) + & (energies < parguess["mu"] + 2 * parguess["sigma"]) + ] + ) + parguess["n_bkg"] = total_events - parguess["n_sig"] + return parguess + + if func == hpge_peak: + total_events = len(energies) + parguess["n_sig"] = len( + energies[ + (energies > parguess["mu"] - 2 * parguess["sigma"]) + & (energies < parguess["mu"] + 2 * parguess["sigma"]) + ] + ) + parguess["n_bkg"] = total_events - parguess["n_sig"] + return parguess + + else: + log.error(f"update_guess not implemented for {func}") + return parguess + + def get_survival_fraction( energy, cut_param, cut_val, peak, eres_pars, + fit_range=None, high_cut=None, guess_pars_cut=None, guess_pars_surv=None, dt_mask=None, mode="greater", + func=hpge_peak, display=0, ): if dt_mask is None: dt_mask = np.full(len(cut_param), True, dtype=bool) + if not isinstance(energy, np.ndarray): + energy = np.array(energy) + if not isinstance(cut_param, np.ndarray): + cut_param = np.array(cut_param) + + if fit_range is None: + fit_range = (np.nanmin(energy), np.nanmax(energy)) + nan_idxs = np.isnan(cut_param) if high_cut is not None: idxs = (cut_param > cut_val) & (cut_param < high_cut) & dt_mask @@ -1150,31 +605,47 @@ def get_survival_fraction( raise ValueError("mode not recognised") if guess_pars_cut is None or guess_pars_surv is None: - pars, errs = unbinned_energy_fit(energy, peak, eres_pars, simplex=True) + (pars, errs, cov, _, func, _, _, _) = pgc.unbinned_staged_energy_fit( + energy, + func, + guess_func=energy_guess, + bounds_func=get_bounds, + guess_kwargs={"peak": peak, "eres": eres_pars}, + fit_range=fit_range, + ) + guess_pars_cut = pars guess_pars_surv = pars - - cut_pars, ct_errs = unbinned_energy_fit( + # add update guess here for n_sig and n_bkg + guess_pars_cut = update_guess(func, guess_pars_cut, energy[(~nan_idxs) & (~idxs)]) + (cut_pars, cut_errs, cut_cov, _, _, _, _, _) = pgc.unbinned_staged_energy_fit( energy[(~nan_idxs) & (~idxs)], - peak, - eres_pars, + func, guess=guess_pars_cut, - simplex=False, - display=display, - verbose=False, + guess_func=energy_guess, + bounds_func=get_bounds, + fixed_func=fix_all_but_nevents, + guess_kwargs={"peak": peak, "eres": eres_pars}, + lock_guess=True, + allow_tail_drop=False, + fit_range=fit_range, ) - - surv_pars, surv_errs = unbinned_energy_fit( + guess_pars_surv = update_guess(func, guess_pars_cut, energy[(~nan_idxs) & (idxs)]) + (surv_pars, surv_errs, surv_cov, _, _, _, _, _) = pgc.unbinned_staged_energy_fit( energy[(~nan_idxs) & (idxs)], - peak, - eres_pars, + func, guess=guess_pars_surv, - simplex=False, - display=display, + guess_func=energy_guess, + bounds_func=get_bounds, + fixed_func=fix_all_but_nevents, 
+ guess_kwargs={"peak": peak, "eres": eres_pars}, + lock_guess=True, + allow_tail_drop=False, + fit_range=fit_range, ) ct_n = cut_pars["n_sig"] - ct_err = ct_errs["n_sig"] + ct_err = cut_errs["n_sig"] surv_n = surv_pars["n_sig"] surv_err = surv_errs["n_sig"] @@ -1189,13 +660,15 @@ def get_survival_fraction( def get_sf_sweep( energy: np.array, cut_param: np.array, - final_cut_value: float, - peak: float, - eres_pars: list, + final_cut_value: float = None, + peak: float = 1592.5, + eres_pars: list = None, dt_mask=None, cut_range=(-5, 5), - n_samples=51, + n_samples=26, mode="greater", + fit_range=None, + debug_mode=False, ) -> tuple(pd.DataFrame, float, float): """ Calculates survival fraction for gamma lines using fitting method as in cut determination @@ -1204,26 +677,68 @@ def get_sf_sweep( if dt_mask is None: dt_mask = np.full(len(cut_param), True, dtype=bool) + if not isinstance(energy, np.ndarray): + energy = np.array(energy) + if not isinstance(cut_param, np.ndarray): + cut_param = np.array(cut_param) + cut_vals = np.linspace(cut_range[0], cut_range[1], n_samples) - out_df = pd.DataFrame(columns=["cut_val", "sf", "sf_err"]) + out_df = pd.DataFrame() + + (pars, _, _, _, func, _, _, _) = pgc.unbinned_staged_energy_fit( + energy, + hpge_peak, + guess_func=energy_guess, + bounds_func=get_bounds, + guess_kwargs={"peak": peak, "eres": eres_pars}, + fit_range=fit_range, + ) + guess_pars_cut = pars + guess_pars_surv = pars + for cut_val in cut_vals: try: - sf, err, cut_pars, surv_pars = get_survival_fraction( - energy, cut_param, cut_val, peak, eres_pars, dt_mask=dt_mask, mode=mode + sf, err, _, _ = get_survival_fraction( + energy, + cut_param, + cut_val, + peak, + eres_pars, + fit_range=fit_range, + dt_mask=dt_mask, + mode=mode, + guess_pars_cut=guess_pars_cut, + guess_pars_surv=guess_pars_surv, + func=func, ) out_df = pd.concat( [out_df, pd.DataFrame([{"cut_val": cut_val, "sf": sf, "sf_err": err}])] ) - except: - pass + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif debug_mode: + raise (e) out_df.set_index("cut_val", inplace=True) - sf, sf_err, cut_pars, surv_pars = get_survival_fraction( - energy, cut_param, final_cut_value, peak, eres_pars, dt_mask=dt_mask, mode=mode - ) + if final_cut_value is not None: + sf, sf_err, cut_pars, surv_pars = get_survival_fraction( + energy, + cut_param, + final_cut_value, + peak, + eres_pars, + fit_range=fit_range, + dt_mask=dt_mask, + mode=mode, + guess_pars_cut=guess_pars_cut, + guess_pars_surv=guess_pars_surv, + func=func, + ) + else: + sf = None + sf_err = None return ( - out_df.query( - f'sf_err<5*{np.nanpercentile(out_df["sf_err"], 50)}& sf_err==sf_err & sf<=100' - ), + out_df, sf, sf_err, ) @@ -1233,6 +748,9 @@ def compton_sf(cut_param, low_cut_val, high_cut_val=None, mode="greater", dt_mas if dt_mask is None: dt_mask = np.full(len(cut_param), True, dtype=bool) + if not isinstance(cut_param, np.ndarray): + cut_param = np.array(cut_param) + if high_cut_val is not None: mask = (cut_param > low_cut_val) & (cut_param < high_cut_val) & dt_mask else: @@ -1243,12 +761,21 @@ def compton_sf(cut_param, low_cut_val, high_cut_val=None, mode="greater", dt_mas else: raise ValueError("mode not recognised") - sf = 100 * len(cut_param[mask]) / len(cut_param) - sf_err = sf * np.sqrt((1 / len(cut_param)) + 1 / (len(cut_param[mask]) + 10**-99)) + ct_n = len(cut_param[~mask]) + ct_err = np.sqrt(len(cut_param[~mask])) + surv_n = len(cut_param[mask]) + surv_err = np.sqrt(len(cut_param[mask])) + + pc_n = ct_n + surv_n + pc_err = 
np.sqrt(surv_err**2 + ct_err**2) + + sf = (surv_n / pc_n) * 100 + err = sf * np.sqrt((pc_err / pc_n) ** 2 + (surv_err / surv_n) ** 2) + return { "low_cut": low_cut_val, "sf": sf, - "sf_err": sf_err, + "sf_err": err, "high_cut": high_cut_val, } @@ -1258,7 +785,7 @@ def compton_sf_sweep( cut_param: np.array, final_cut_value: float, peak: float, - eres: list[float, float], + eres: list[float, float] = None, dt_mask: np.array = None, cut_range=(-5, 5), n_samples=51, @@ -1267,9 +794,13 @@ def compton_sf_sweep( """ Determines survival fraction for compton continuum by basic counting """ + if not isinstance(energy, np.ndarray): + energy = np.array(energy) + if not isinstance(cut_param, np.ndarray): + cut_param = np.array(cut_param) cut_vals = np.linspace(cut_range[0], cut_range[1], n_samples) - out_df = pd.DataFrame(columns=["cut_val", "sf", "sf_err"]) + out_df = pd.DataFrame() for cut_val in cut_vals: ct_dict = compton_sf(cut_param, cut_val, mode=mode, dt_mask=dt_mask) @@ -1290,26 +821,25 @@ def compton_sf_sweep( return out_df, sf_dict["sf"], sf_dict["sf_err"] -class cal_aoe: +class CalAoE: def __init__( self, - cal_dicts: dict = {}, + cal_dicts: dict = None, cal_energy_param: str = "cuspEmax_ctc_cal", eres_func: callable = lambda x: 1, - pdf=standard_aoe, - selection_string: str = "", + pdf=aoe_peak, + selection_string: str = "index==index", dt_corr: bool = False, - dep_acc: float = 0.9, dep_correct: bool = False, dt_cut: dict = None, dt_param: str = "dt_eff", high_cut_val: int = 3, - mean_func: Callable = pol1, - sigma_func: Callable = sigma_fit, - comptBands_width: int = 20, - plot_options: dict = {}, + mean_func: Callable = Pol1, + sigma_func: Callable = SigmaFit, + compt_bands_width: int = 20, + debug_mode: bool = False, ): - self.cal_dicts = cal_dicts + self.cal_dicts = cal_dicts if cal_dicts is not None else {} self.cal_energy_param = cal_energy_param self.eres_func = eres_func self.pdf = pdf @@ -1318,7 +848,6 @@ def __init__( self.dt_param = "dt_eff" self.dep_correct = dep_correct self.dt_cut = dt_cut - self.dep_acc = dep_acc if self.dt_cut is not None: self.dt_cut_param = dt_cut["out_param"] self.fit_selection = f"{self.selection_string} & {self.dt_cut_param}" @@ -1330,11 +859,13 @@ def __init__( self.high_cut_val = high_cut_val self.mean_func = mean_func self.sigma_func = sigma_func - self.comptBands_width = comptBands_width - self.plot_options = plot_options + self.compt_bands_width = compt_bands_width + self.debug_mode = debug_mode def update_cal_dicts(self, update_dict): - if re.match(r"(\d{8})T(\d{6})Z", list(self.cal_dicts)[0]): + if len(self.cal_dicts) > 0 and re.match( + r"(\d{8})T(\d{6})Z", list(self.cal_dicts)[0] + ): for tstamp in self.cal_dicts: if tstamp in update_dict: self.cal_dicts[tstamp].update(update_dict[tstamp]) @@ -1343,19 +874,11 @@ def update_cal_dicts(self, update_dict): else: self.cal_dicts.update(update_dict) - def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): + def time_correction(self, df, aoe_param, output_name="AoE_Timecorr", display=0): log.info("Starting A/E time correction") - self.timecorr_df = pd.DataFrame( - columns=["run_timestamp", "mean", "mean_err", "res", "res_err"] - ) + self.timecorr_df = pd.DataFrame() try: if "run_timestamp" in df: - tstamps = sorted(np.unique(df["run_timestamp"])) - means = [] - errors = [] - reses = [] - res_errs = [] - final_tstamps = [] for tstamp, time_df in df.groupby("run_timestamp", sort=True): try: pars, errs, cov = unbinned_aoe_fit( @@ -1374,6 +897,8 @@ def aoe_timecorr(self, df, 
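# [Illustrative note, not part of the patch] Worked example for the
# counting-based survival fraction in compton_sf above: with surv_n = 900
# surviving events and ct_n = 100 cut events,
#
#     pc_n   = 1000,  pc_err = sqrt(1000) ~ 31.6,  surv_err = sqrt(900) = 30
#     sf     = 100 * 900 / 1000 = 90.0 %
#     sf_err = 90.0 * sqrt((31.6/1000)**2 + (30/900)**2) ~ 4.1 %
#
# i.e. the uncertainty now propagates Poisson errors on both counts rather
# than using the previous sf*sqrt(1/N_total + 1/N_pass) expression.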
aoe_param, output_name="AoE_Timecorr", display=0): "run_timestamp": tstamp, "mean": pars["mu"], "mean_err": errs["mu"], + "sigma": pars["sigma"], + "sigma_err": errs["sigma"], "res": pars["sigma"] / pars["mu"], "res_err": (pars["sigma"] / pars["mu"]) * np.sqrt( @@ -1385,7 +910,11 @@ def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): ), ] ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) self.timecorr_df = pd.concat( [ self.timecorr_df, @@ -1395,6 +924,8 @@ def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): "run_timestamp": tstamp, "mean": np.nan, "mean_err": np.nan, + "sigma": np.nan, + "sigma_err": np.nan, "res": np.nan, "res_err": np.nan, } @@ -1403,26 +934,41 @@ def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): ] ) self.timecorr_df.set_index("run_timestamp", inplace=True) - time_dict = fit_time_means( - np.array(self.timecorr_df.index), - np.array(self.timecorr_df["mean"]), - np.array(self.timecorr_df["res"]), - ) + if len(self.timecorr_df) > 1: + time_dict = fit_time_means( + np.array(self.timecorr_df.index), + np.array(self.timecorr_df["mean"]), + np.array(self.timecorr_df["sigma"]), + ) - df[output_name] = df[aoe_param] / np.array( - [time_dict[tstamp] for tstamp in df["run_timestamp"]] - ) - self.update_cal_dicts( - { - tstamp: { + df[output_name] = df[aoe_param] / np.array( + [time_dict[tstamp] for tstamp in df["run_timestamp"]] + ) + self.update_cal_dicts( + { + tstamp: { + output_name: { + "expression": f"{aoe_param}/a", + "parameters": {"a": t_dict}, + } + } + for tstamp, t_dict in time_dict.items() + } + ) + else: + df[output_name] = ( + df[aoe_param] / np.array(self.timecorr_df["mean"])[0] + ) + self.update_cal_dicts( + { output_name: { "expression": f"{aoe_param}/a", - "parameters": {"a": t_dict}, + "parameters": { + "a": np.array(self.timecorr_df["mean"])[0] + }, } } - for tstamp, t_dict in time_dict.items() - } - ) + ) log.info("A/E time correction finished") else: try: @@ -1439,8 +985,11 @@ def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): pd.DataFrame( [ { + "run_timestamp": np.nan, "mean": pars["mu"], "mean_err": errs["mu"], + "sigma": pars["sigma"], + "sigma_err": errs["sigma"], "res": pars["sigma"] / pars["mu"], "res_err": (pars["sigma"] / pars["mu"]) * np.sqrt( @@ -1452,15 +1001,23 @@ def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): ), ] ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + self.timecorr_df = pd.concat( [ self.timecorr_df, pd.DataFrame( [ { + "run_timestamp": np.nan, "mean": np.nan, "mean_err": np.nan, + "sigma": np.nan, + "sigma_err": np.nan, "res": np.nan, "res_err": np.nan, } @@ -1477,9 +1034,14 @@ def aoe_timecorr(self, df, aoe_param, output_name="AoE_Timecorr", display=0): } } ) - log.info("A/E time correction finished") - except: + log.info("Finished A/E time correction") + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("A/E time correction failed") + df[output_name] = df[aoe_param] / np.nan self.update_cal_dicts( { output_name: { @@ -1493,6 +1055,7 @@ def drift_time_correction( self, data: pd.DataFrame, aoe_param, + out_param="AoE_DTcorr", display: int = 0, ): """ @@ -1518,9 +1081,9 @@ def drift_time_correction( np.nanpercentile(dep_events[self.dt_param], 99), ] - self.dt_res_dict[ - "final_selection" - ] = 
f"{aoe_param}>{aoe_range[0]}&{aoe_param}<{aoe_range[1]}&{self.dt_param}>{dt_range[0]}&{self.dt_param}<{dt_range[1]}&{self.dt_param}=={self.dt_param}" + self.dt_res_dict["final_selection"] = ( + f"{aoe_param}>{aoe_range[0]}&{aoe_param}<{aoe_range[1]}&{self.dt_param}>{dt_range[0]}&{self.dt_param}<{dt_range[1]}&{self.dt_param}=={self.dt_param}" + ) final_df = dep_events.query(self.dt_res_dict["final_selection"]) @@ -1533,68 +1096,97 @@ def drift_time_correction( ), ) - gpars = self.dt_res_dict["dt_guess"] = drift_time_distribution.guess( - hist, bins, var - ) - cost_func = cost.ExtendedUnbinnedNLL( - final_df[self.dt_param], drift_time_distribution.extended_pdf + bcs = pgh.get_bin_centers(bins) + mus = pgc.get_i_local_maxima(hist / (np.sqrt(var) + 10**-99), 2) + pk_pars, pk_covs = pgc.hpge_fit_energy_peak_tops( + hist, + bins, + var=var, + peak_locs=mus, + n_to_fit=5, ) - m = Minuit(cost_func, **gpars) - m.limits = drift_time_distribution.bounds(gpars) - m.fixed = drift_time_distribution.fixed() - m.simplex().migrad() - m.hesse() - self.dt_res_dict["dt_fit"] = { - "pars": m.values, - "errs": m.errors, - "object": m, - } - aoe_grp1 = self.dt_res_dict[ - "aoe_grp1" - ] = f'{self.dt_param}>{m.values["mu1"] - 2 * m.values["sigma1"]} & {self.dt_param}<{m.values["mu1"] + 2 * m.values["sigma1"]}' - aoe_grp2 = self.dt_res_dict[ - "aoe_grp2" - ] = f'{self.dt_param}>{m.values["mu2"] - 2 * m.values["sigma2"]} & {self.dt_param}<{m.values["mu2"] + 2 * m.values["sigma2"]}' - - aoe_pars, aoe_errs, _ = unbinned_aoe_fit( - final_df.query(aoe_grp1)[aoe_param], pdf=self.pdf, display=display - ) + mus = pk_pars[:, 0] + sigmas = pk_pars[:, 1] + amps = pk_pars[:, 2] - self.dt_res_dict["aoe_fit1"] = {"pars": aoe_pars, "errs": aoe_errs} + if len(mus) > 2: + ids = np.array( + sorted([np.argmax(amps), np.argmax(amps[amps != np.argmax(amps)])]) + ) + else: + ids = np.full(len(mus), True, dtype=bool) + mus = [bcs[int(mu)] for mu in mus[ids]] + sigmas = sigmas[ids] + amps = amps[ids] - aoe_pars2, aoe_errs2, _ = unbinned_aoe_fit( - final_df.query(aoe_grp2)[aoe_param], pdf=self.pdf, display=display - ) + self.dt_res_dict["dt_fit"] = {"mus": mus, "sigmas": sigmas, "amps": amps} - self.dt_res_dict["aoe_fit2"] = {"pars": aoe_pars2, "errs": aoe_errs2} + if len(mus) < 2: + log.info("Only 1 drift time peak found, no correction needed") + self.alpha = 0 - try: - self.alpha = (aoe_pars["mu"] - aoe_pars2["mu"]) / ( - (m.values["mu2"] * aoe_pars2["mu"]) - - (m.values["mu1"] * aoe_pars["mu"]) + else: + aoe_grp1 = self.dt_res_dict["aoe_grp1"] = ( + f"{self.dt_param}>{mus[0] - 2 * sigmas[0]} & {self.dt_param}<{mus[0] + 2 * sigmas[0]}" ) - except ZeroDivisionError: - self.alpha = 0 - self.dt_res_dict["alpha"] = self.alpha - log.info(f"dtcorr successful alpha:{self.alpha}") - data["AoE_DTcorr"] = data[aoe_param] * ( - 1 + self.alpha * data[self.dt_param] - ) - except: + aoe_grp2 = self.dt_res_dict["aoe_grp2"] = ( + f"{self.dt_param}>{mus[1] - 2 * sigmas[1]} & {self.dt_param}<{mus[1] + 2 * sigmas[1]}" + ) + + aoe_pars, aoe_errs, _ = unbinned_aoe_fit( + final_df.query(aoe_grp1)[aoe_param], pdf=self.pdf, display=display + ) + + self.dt_res_dict["aoe_fit1"] = { + "pars": aoe_pars.to_dict(), + "errs": aoe_errs.to_dict(), + } + + aoe_pars2, aoe_errs2, _ = unbinned_aoe_fit( + final_df.query(aoe_grp2)[aoe_param], pdf=self.pdf, display=display + ) + + self.dt_res_dict["aoe_fit2"] = { + "pars": aoe_pars2.to_dict(), + "errs": aoe_errs2.to_dict(), + } + + try: + self.alpha = (aoe_pars["mu"] - aoe_pars2["mu"]) / ( + (mus[0] * aoe_pars2["mu"]) - 
(mus[1] * aoe_pars["mu"]) + ) + except ZeroDivisionError: + self.alpha = 0 + self.dt_res_dict["alpha"] = self.alpha + log.info(f"dtcorr successful alpha:{self.alpha}") + + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("Drift time correction failed") - self.alpha = np.nan + self.alpha = 0 + data[out_param] = data[aoe_param] * (1 + self.alpha * data[self.dt_param]) self.update_cal_dicts( { - "AoE_DTcorr": { + out_param: { "expression": f"{aoe_param}*(1+a*{self.dt_param})", "parameters": {"a": self.alpha}, } } ) - def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): + def energy_correction( + self, + data: pd.DataFrame, + aoe_param: str, + corrected_param="AoE_Corrected", + classifier_param="AoE_Classifier", + display: int = 0, + ): """ Calculates the corrections needed for the energy dependence of the A/E. Does this by fitting the compton continuum in slices and then applies fits to the centroid and variance. @@ -1603,20 +1195,20 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): log.info("Starting A/E energy correction") self.energy_corr_res_dict = {} - comptBands = np.arange(900, 2350, self.comptBands_width) + compt_bands = np.arange(900, 2350, self.compt_bands_width) peaks = np.array( [1080, 1094, 1459, 1512, 1552, 1592, 1620, 1650, 1670, 1830, 2105] ) allowed = np.array([], dtype=bool) - for i, band in enumerate(comptBands): + for band in compt_bands: allow = True for peak in peaks: - if (peak - 5) > band and (peak - 5) < (band + self.comptBands_width): + if (peak - 5) > band and (peak - 5) < (band + self.compt_bands_width): allow = False - elif (peak + 5 > band) and (peak + 5) < (band + self.comptBands_width): + elif (peak + 5 > band) and (peak + 5) < (band + self.compt_bands_width): allow = False allowed = np.append(allowed, allow) - comptBands = comptBands[allowed] + compt_bands = compt_bands[allowed] self.energy_corr_fits = pd.DataFrame( columns=[ @@ -1634,18 +1226,20 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): select_df = data.query(f"{self.fit_selection} & {aoe_param}>0") # Fit each compton band - for band in comptBands: + for band in compt_bands: try: pars, errs, cov = unbinned_aoe_fit( select_df.query( - f"{self.cal_energy_param}>{band}&{self.cal_energy_param}< {self.comptBands_width+band}" + f"{self.cal_energy_param}>{band}&{self.cal_energy_param}< {self.compt_bands_width+band}" )[aoe_param], pdf=self.pdf, display=display, ) - mean, mean_err = self.pdf.centroid(pars, errs, cov) - sigma, sigma_err = self.pdf.width(pars, errs, cov) + mean, mean_err = self.pdf.get_mu(pars, cov) + sigma, sigma_err = self.pdf.get_fwhm(pars, cov) + sigma = sigma / 2.355 + sigma_err = sigma_err / 2.355 self.energy_corr_fits = pd.concat( [ @@ -1653,7 +1247,8 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): pd.DataFrame( [ { - "compt_bands": band + self.comptBands_width / 2, + "compt_bands": band + + self.compt_bands_width / 2, "mean": mean, "mean_err": mean_err, "sigma": sigma, @@ -1670,7 +1265,11 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): ] ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) self.energy_corr_fits = pd.concat( [ self.energy_corr_fits, @@ -1724,7 +1323,7 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): ) / valid_fits["mean_err"] ) - dof_mu = 
len(valid_fits["mean"]) - len(pars) + dof_mu = len(valid_fits["mean"]) - len(mu_pars) p_val_mu = chi2.sf(csqr_mu, dof_mu) self.mean_fit_obj = m_mu @@ -1760,7 +1359,7 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): dof_sig = len(valid_fits["sigma"]) - len(sig_pars) p_val_sig = chi2.sf(csqr_sig, dof_sig) - self.sigma_fit_obj = m_sig + self.SigmaFit_obj = m_sig # Get DEP fit n_sigma = 4 @@ -1776,25 +1375,37 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): pdf=self.pdf, display=display, ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + dep_pars, dep_err, _ = return_nans(self.pdf) - data["AoE_Corrected"] = data[aoe_param] / self.mean_func.func( + data[corrected_param] = data[aoe_param] / self.mean_func.func( data[self.cal_energy_param], *mu_pars ) - data["AoE_Classifier"] = (data["AoE_Corrected"] - 1) / self.sigma_func.func( + data[classifier_param] = (data[corrected_param] - 1) / self.sigma_func.func( data[self.cal_energy_param], *sig_pars ) log.info("Finished A/E energy successful") log.info(f"mean pars are {mu_pars.to_dict()}") log.info(f"sigma pars are {sig_pars.to_dict()}") - except: + + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("A/E energy correction failed") mu_pars, mu_errs, mu_cov = return_nans(self.mean_func.func) csqr_mu, dof_mu, p_val_mu = (np.nan, np.nan, np.nan) csqr_sig, dof_sig, p_val_sig = (np.nan, np.nan, np.nan) sig_pars, sig_errs, sig_cov = return_nans(self.sigma_func.func) dep_pars, dep_err, dep_cov = return_nans(self.pdf) + data[corrected_param] = data[aoe_param] * np.nan + data[classifier_param] = data[aoe_param] * np.nan self.energy_corr_res_dict["mean_fits"] = { "func": self.mean_func.__name__, @@ -1806,7 +1417,7 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): "csqr_mu": (csqr_mu, dof_mu), } - self.energy_corr_res_dict["sigma_fits"] = { + self.energy_corr_res_dict["SigmaFits"] = { "func": self.sigma_func.__name__, "module": self.sigma_func.__module__, "expression": self.sigma_func.string_func("x"), @@ -1817,20 +1428,19 @@ def AoEcorrection(self, data: pd.DataFrame, aoe_param: str, display: int = 0): } self.energy_corr_res_dict["dep_fit"] = { - "func": self.pdf.__name__, - "module": self.pdf.__module__, + "func": self.pdf.name, "pars": dep_pars.to_dict(), "errs": dep_err.to_dict(), } self.update_cal_dicts( { - "AoE_Corrected": { + corrected_param: { "expression": f"{aoe_param}/({self.mean_func.string_func(self.cal_energy_param)})", "parameters": mu_pars.to_dict(), }, - "AoE_Classifier": { - "expression": f"(AoE_Corrected-1)/({self.sigma_func.string_func(self.cal_energy_param)})", + classifier_param: { + "expression": f"({corrected_param}-1)/({self.sigma_func.string_func(self.cal_energy_param)})", "parameters": sig_pars.to_dict(), }, } @@ -1843,6 +1453,7 @@ def get_aoe_cut_fit( peak: float, ranges: tuple, dep_acc: float, + output_cut_param: str = "AoE_Low_Cut", display: int = 1, ): """ @@ -1852,47 +1463,33 @@ def get_aoe_cut_fit( log.info("Starting A/E low cut determination") self.low_cut_res_dict = {} - self.cut_fits = pd.DataFrame(columns=["cut_val", "sf", "sf_err"]) + self.cut_fits = pd.DataFrame() min_range, max_range = ranges - + erange = (peak - min_range, peak + max_range) try: select_df = data.query( - f"{self.fit_selection}&({self.cal_energy_param} > {peak - min_range}) & ({self.cal_energy_param} < {peak + max_range})" + 
f"{self.fit_selection}&({self.cal_energy_param} > {erange[0]}) & ({self.cal_energy_param} < {erange[1]})" ) # if dep_correct is True: # peak_aoe = (select_df[aoe_param] / dep_mu(select_df[self.cal_energy_param])) - 1 # peak_aoe = select_df[aoe_param] / sig_func(select_df[self.cal_energy_param]) - cut_vals = np.arange(-8, 0, 0.2) - sfs = [] - sf_errs = [] - for cut_val in cut_vals: - sf, err, cut_pars, surv_pars = get_survival_fraction( - select_df[self.cal_energy_param].to_numpy(), - select_df[aoe_param].to_numpy(), - cut_val, - peak, - self.eres_func(peak), - guess_pars_cut=None, - guess_pars_surv=None, - ) - self.cut_fits = pd.concat( - [ - self.cut_fits, - pd.DataFrame( - [ - { - "cut_val": cut_val, - "sf": sf, - "sf_err": err, - } - ] - ), - ] - ) - self.cut_fits.set_index("cut_val", inplace=True) + self.cut_fits, _, _ = get_sf_sweep( + select_df[self.cal_energy_param], + select_df[aoe_param], + None, + peak, + self.eres_func(peak), + fit_range=erange, + dt_mask=None, + cut_range=(-8, 0), + n_samples=40, + mode="greater", + debug_mode=self.debug_mode, + ) + valid_fits = self.cut_fits.query( f'sf_err<{(1.5 * np.nanpercentile(self.cut_fits["sf_err"],85))}&sf_err==sf_err' ) @@ -1901,12 +1498,12 @@ def get_aoe_cut_fit( valid_fits.index, valid_fits["sf"], valid_fits["sf_err"], - sigmoid_fit.func, + SigmoidFit.func, ) c.loss = "soft_l1" m1 = Minuit( c, - *sigmoid_fit.guess( + *SigmoidFit.guess( valid_fits.index, valid_fits["sf"], valid_fits["sf_err"] ), ) @@ -1914,28 +1511,32 @@ def get_aoe_cut_fit( xs = np.arange( np.nanmin(valid_fits.index), np.nanmax(valid_fits.index), 0.01 ) - p = sigmoid_fit.func(xs, *m1.values) + p = SigmoidFit.func(xs, *m1.values) self.cut_fit = { - "function": sigmoid_fit.__name__, + "function": SigmoidFit.__name__, "pars": m1.values.to_dict(), "errs": m1.errors.to_dict(), } - self.low_cut_val = round(xs[np.argmin(np.abs(p - (100 * self.dep_acc)))], 3) + self.low_cut_val = round(xs[np.argmin(np.abs(p - (100 * dep_acc)))], 3) log.info(f"Cut found at {self.low_cut_val}") - data["AoE_Low_Cut"] = data[aoe_param] > self.low_cut_val + data[output_cut_param] = data[aoe_param] > self.low_cut_val if self.dt_cut_param is not None: - data["AoE_Low_Cut"] = data["AoE_Low_Cut"] & (data[self.dt_cut_param]) - data["AoE_Double_Sided_Cut"] = data["AoE_Low_Cut"] & ( - data[aoe_param] < self.high_cut_val - ) - except: + data[output_cut_param] = data[output_cut_param] & ( + data[self.dt_cut_param] + ) + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("A/E cut determination failed") self.low_cut_val = np.nan - if self.dt_cut_param is not None and self.dt_cut_hard == True: + data[output_cut_param] = False + if self.dt_cut_param is not None and self.dt_cut_hard is True: self.update_cal_dicts( { - "AoE_Low_Cut": { + output_cut_param: { "expression": f"({aoe_param}>a) & ({self.dt_cut_param})", "parameters": {"a": self.low_cut_val}, } @@ -1944,67 +1545,29 @@ def get_aoe_cut_fit( else: self.update_cal_dicts( { - "AoE_Low_Cut": { + output_cut_param: { "expression": f"({aoe_param}>a)", "parameters": {"a": self.low_cut_val}, } } ) - self.update_cal_dicts( - { - "AoE_Double_Sided_Cut": { - "expression": f"(a>{aoe_param}) & (AoE_Low_Cut)", - "parameters": {"a": self.high_cut_val}, - } - } - ) - - def get_results_dict(self): - return { - "cal_energy_param": self.cal_energy_param, - "dt_param": self.dt_param, - "rt_correction": self.dt_corr, - "pdf": self.pdf.__name__, - "1000-1300keV": self.timecorr_df.to_dict("index"), - 
"correction_fit_results": self.energy_corr_res_dict, - "low_cut": self.low_cut_val, - "high_cut": self.high_cut_val, - "low_side_sfs": self.low_side_sf.to_dict("index"), - "2_side_sfs": self.two_side_sf.to_dict("index"), - } - def fill_plot_dict(self, data, plot_dict={}): - for key, item in self.plot_options.items(): - if item["options"] is not None: - plot_dict[key] = item["function"](self, data, **item["options"]) - else: - plot_dict[key] = item["function"](self, data) - return plot_dict - - def calibrate(self, df, initial_aoe_param): - self.aoe_timecorr(df, initial_aoe_param) - log.info("Finished A/E time correction") - - if self.dt_corr == True: - aoe_param = "AoE_DTcorr" - self.drift_time_correction(df, "AoE_Timecorr") - else: - aoe_param = "AoE_Timecorr" - - self.AoEcorrection(df, aoe_param) - - self.get_aoe_cut_fit(df, "AoE_Classifier", 1592, (40, 20), 0.9) - - aoe_param = "AoE_Classifier" - log.info(" Compute low side survival fractions: ") - self.low_side_sf = pd.DataFrame(columns=["peak", "sf", "sf_err"]) - peaks_of_interest = [1592.5, 1620.5, 2039, 2103.53, 2614.50] - fit_widths = [(40, 25), (25, 40), (0, 0), (25, 40), (50, 50)] - self.low_side_peak_dfs = {} + def calculate_survival_fractions_sweep( + self, + data, + aoe_param, + peaks, + fit_widths, + n_samples=26, + cut_range=(-5, 5), + mode="greater", + ): + sfs = pd.DataFrame() + peak_dfs = {} - for i, peak in enumerate(peaks_of_interest): + for i, peak in enumerate(peaks): try: - select_df = df.query( + select_df = data.query( f"{self.selection_string}&{aoe_param}=={aoe_param}" ) fwhm = self.eres_func(peak) @@ -2021,19 +1584,25 @@ def calibrate(self, df, initial_aoe_param): self.low_cut_val, peak, fwhm, - dt_mask=peak_df[self.dt_cut_param].to_numpy() - if self.dt_cut_param is not None - else None, + cut_range=cut_range, + n_samples=n_samples, + mode=mode, + dt_mask=( + peak_df[self.dt_cut_param].to_numpy() + if self.dt_cut_param is not None + else None + ), ) - self.low_side_sf = pd.concat( + sfs = pd.concat( [ - self.low_side_sf, + sfs, pd.DataFrame([{"peak": peak, "sf": sf, "sf_err": sf_err}]), ] ) - self.low_side_peak_dfs[peak] = cut_df + peak_dfs[peak] = cut_df else: emin, emax = fit_widths[i] + fit_range = (peak - emin, peak + emax) peak_df = select_df.query( f"({self.cal_energy_param}>{peak-emin})&({self.cal_energy_param}<{peak+emax})" ) @@ -2043,39 +1612,58 @@ def calibrate(self, df, initial_aoe_param): self.low_cut_val, peak, fwhm, - dt_mask=peak_df[self.dt_cut_param].to_numpy() - if self.dt_cut_param is not None - else None, + fit_range=fit_range, + cut_range=cut_range, + n_samples=n_samples, + mode=mode, + dt_mask=( + peak_df[self.dt_cut_param].to_numpy() + if self.dt_cut_param is not None + else None + ), + debug_mode=self.debug_mode, + ) + + cut_df = cut_df.query( + f'sf_err<5*{np.nanpercentile(cut_df["sf_err"], 50)}& sf_err==sf_err & sf<=100' ) - self.low_side_sf = pd.concat( + + sfs = pd.concat( [ - self.low_side_sf, + sfs, pd.DataFrame([{"peak": peak, "sf": sf, "sf_err": sf_err}]), ] ) - self.low_side_peak_dfs[peak] = cut_df + peak_dfs[peak] = cut_df log.info(f"{peak}keV: {sf:2.1f} +/- {sf_err:2.1f} %") - except: - self.low_side_sf = pd.concat( + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + sfs = pd.concat( [ - self.low_side_sf, + sfs, pd.DataFrame([{"peak": peak, "sf": np.nan, "sf_err": np.nan}]), ] ) log.error( - f"A/E Low side Survival fraction determination failed for {peak} peak" + f"A/E Survival fraction sweep determination failed for {peak} 
peak" ) - self.low_side_sf.set_index("peak", inplace=True) + sfs.set_index("peak", inplace=True) + return sfs, peak_dfs - self.two_side_sf = pd.DataFrame(columns=["peak", "sf", "sf_err"]) - log.info("Calculating 2 sided cut sfs") - for i, peak in enumerate(peaks_of_interest): + def calculate_survival_fractions( + self, data, aoe_param, peaks, fit_widths, mode="greater" + ): + sfs = pd.DataFrame() + for i, peak in enumerate(peaks): fwhm = self.eres_func(peak) try: if peak == 2039: emin = 2 * fwhm emax = 2 * fwhm - peak_df = select_df.query( + peak_df = data.query( f"({self.cal_energy_param}>{peak-emin})&({self.cal_energy_param}<{peak+emax})" ) @@ -2083,21 +1671,25 @@ def calibrate(self, df, initial_aoe_param): peak_df[aoe_param].to_numpy(), self.low_cut_val, self.high_cut_val, - dt_mask=peak_df[self.dt_cut_param].to_numpy() - if self.dt_cut_param is not None - else None, + mode=mode, + dt_mask=( + peak_df[self.dt_cut_param].to_numpy() + if self.dt_cut_param is not None + else None + ), ) sf = sf_dict["sf"] sf_err = sf_dict["sf_err"] - self.two_side_sf = pd.concat( + sfs = pd.concat( [ - self.two_side_sf, + sfs, pd.DataFrame([{"peak": peak, "sf": sf, "sf_err": sf_err}]), ] ) else: emin, emax = fit_widths[i] - peak_df = select_df.query( + fit_range = (peak - emin, peak + emax) + peak_df = data.query( f"({self.cal_energy_param}>{peak-emin})&({self.cal_energy_param}<{peak+emax})" ) sf, sf_err, _, _ = get_survival_fraction( @@ -2106,34 +1698,156 @@ def calibrate(self, df, initial_aoe_param): self.low_cut_val, peak, fwhm, + fit_range=fit_range, + mode=mode, high_cut=self.high_cut_val, - dt_mask=peak_df[self.dt_cut_param].to_numpy() - if self.dt_cut_param is not None - else None, + dt_mask=( + peak_df[self.dt_cut_param].to_numpy() + if self.dt_cut_param is not None + else None + ), ) - self.two_side_sf = pd.concat( + sfs = pd.concat( [ - self.two_side_sf, + sfs, pd.DataFrame([{"peak": peak, "sf": sf, "sf_err": sf_err}]), ] ) log.info(f"{peak}keV: {sf:2.1f} +/- {sf_err:2.1f} %") - except: - self.two_side_sf = pd.concat( + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + sfs = pd.concat( [ - self.two_side_sf, + sfs, pd.DataFrame([{"peak": peak, "sf": np.nan, "sf_err": np.nan}]), ] ) - log.error( - f"A/E two side Survival fraction determination failed for {peak} peak" + log.error(f"A/E survival fraction determination failed for {peak} peak") + sfs.set_index("peak", inplace=True) + return sfs + + def calibrate( + self, + df, + initial_aoe_param, + peaks_of_interest=None, + fit_widths=None, + cut_peak_idx=0, + dep_acc=0.9, + sf_nsamples=11, + sf_cut_range=(-5, 5), + ): + if peaks_of_interest is None: + peaks_of_interest = [1592.5, 1620.5, 2039, 2103.53, 2614.50] + if fit_widths is None: + fit_widths = [(40, 25), (25, 40), (0, 0), (25, 40), (50, 50)] + + self.time_correction(df, initial_aoe_param, output_name="AoE_Timecorr") + + if self.dt_corr is True: + aoe_param = "AoE_DTcorr" + self.drift_time_correction(df, "AoE_Timecorr", out_param=aoe_param) + else: + aoe_param = "AoE_Timecorr" + + self.energy_correction( + df, + aoe_param, + corrected_param="AoE_Corrected", + classifier_param="AoE_Classifier", + ) + + self.get_aoe_cut_fit( + df, + "AoE_Classifier", + peaks_of_interest[cut_peak_idx], + fit_widths[cut_peak_idx], + dep_acc, + output_cut_param="AoE_Low_Cut", + ) + + df["AoE_Double_Sided_Cut"] = df["AoE_Low_Cut"] & ( + df["AoE_Classifier"] < self.high_cut_val + ) + + if self.dt_cut_param is not None and self.dt_cut_hard is True: + 
self.update_cal_dicts( + { + "AoE_High_Side_Cut": { + "expression": f"(a>AoE_Classifier)& ({self.dt_cut_param})", + "parameters": {"a": self.high_cut_val}, + } + } + ) + else: + self.update_cal_dicts( + { + "AoE_High_Side_Cut": { + "expression": "(a>AoE_Classifier)", + "parameters": {"a": self.high_cut_val}, + } + } + ) + + self.update_cal_dicts( + { + "AoE_Double_Sided_Cut": { + "expression": "(AoE_High_Side_Cut) & (AoE_Low_Cut)", + "parameters": {}, + } + } + ) + + log.info("Compute low side survival fractions: ") + ( + self.low_side_sfs, + self.low_side_peak_dfs, + ) = self.calculate_survival_fractions_sweep( + df, + "AoE_Classifier", + peaks_of_interest, + fit_widths, + n_samples=sf_nsamples, + cut_range=sf_cut_range, + mode="greater", + ) + + log.info("Compute 2 side survival fractions: ") + self.two_side_sfs = self.calculate_survival_fractions( + df, "AoE_Classifier", peaks_of_interest, fit_widths, mode="greater" + ) + + if re.match(r"(\d{8})T(\d{6})Z", list(self.cal_dicts)[0]): + self.low_side_sfs_by_run = {} + self.two_side_sfs_by_run = {} + for tstamp in self.cal_dicts: + log.info(f"Compute survival fractions for {tstamp}: ") + self.low_side_sfs_by_run[tstamp] = self.calculate_survival_fractions( + df.query(f"run_timestamp == '{tstamp}'"), + "AoE_Classifier", + peaks_of_interest, + fit_widths, + mode="greater", ) - self.two_side_sf.set_index("peak", inplace=True) + + self.two_side_sfs_by_run[tstamp] = self.calculate_survival_fractions( + df.query(f"run_timestamp == '{tstamp}'"), + "AoE_Classifier", + peaks_of_interest, + fit_widths, + mode="greater", + ) + else: + self.low_side_sfs_by_run = None + self.two_side_sfs_by_run = None def plot_aoe_mean_time( - aoe_class, data, time_param="AoE_Timecorr", figsize=[12, 8], fontsize=12 + aoe_class, data, time_param="AoE_Timecorr", figsize=(12, 8), fontsize=12 ): plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize @@ -2181,18 +1895,18 @@ def plot_aoe_mean_time( color="yellow", alpha=0.2, ) - except: + except Exception: pass ax.set_xlabel("time") ax.set_ylabel("A/E mean") - myFmt = mdates.DateFormatter("%b %d") - ax.xaxis.set_major_formatter(myFmt) + myfmt = mdates.DateFormatter("%b %d") + ax.xaxis.set_major_formatter(myfmt) plt.close() return fig def plot_aoe_res_time( - aoe_class, data, time_param="AoE_Timecorr", figsize=[12, 8], fontsize=12 + aoe_class, data, time_param="AoE_Timecorr", figsize=(12, 8), fontsize=12 ): plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize @@ -2207,12 +1921,12 @@ def plot_aoe_res_time( yerr=aoe_class.timecorr_df["res_err"], linestyle=" ", ) - except: + except Exception: pass ax.set_xlabel("time") ax.set_ylabel("A/E res") - myFmt = mdates.DateFormatter("%b %d") - ax.xaxis.set_major_formatter(myFmt) + myfmt = mdates.DateFormatter("%b %d") + ax.xaxis.set_major_formatter(myfmt) plt.close() return fig @@ -2222,7 +1936,7 @@ def drifttime_corr_plot( data, aoe_param="AoE_Timecorr", aoe_param_corr="AoE_DTcorr", - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ): plt.rcParams["figure.figsize"] = figsize @@ -2239,18 +1953,20 @@ def drifttime_corr_plot( plt.subplot(2, 2, 1) aoe_pars = aoe_class.dt_res_dict["aoe_fit1"]["pars"] - xs = np.linspace(aoe_pars["lower_range"], aoe_pars["upper_range"], 100) + xs = np.linspace(aoe_pars["x_lo"], aoe_pars["x_hi"], 100) counts, aoe_bins, bars = plt.hist( final_df.query( - f'{aoe_class.dt_res_dict["aoe_grp1"]}&({aoe_param}<{aoe_pars["upper_range"]})&({aoe_param}>{aoe_pars["lower_range"]})' + 
f'{aoe_class.dt_res_dict["aoe_grp1"]}&({aoe_param}<{aoe_pars["x_hi"]})&({aoe_param}>{aoe_pars["x_lo"]})' )[aoe_param], bins=400, histtype="step", label="data", ) dx = np.diff(aoe_bins) - plt.plot(xs, aoe_class.pdf.pdf(xs, *aoe_pars) * dx[0], label="full fit") - sig, bkg = aoe_class.pdf.pdf(xs, *aoe_pars[:-1], True) + plt.plot(xs, aoe_class.pdf.get_pdf(xs, *aoe_pars) * dx[0], label="full fit") + aoe_class.pdf.components = True + sig, bkg = aoe_class.pdf.get_pdf(xs, *aoe_pars) + aoe_class.pdf.components = False plt.plot(xs, sig * dx[0], label="peak fit") plt.plot(xs, bkg * dx[0], label="bkg fit") plt.legend(loc="upper left") @@ -2259,18 +1975,20 @@ def drifttime_corr_plot( aoe_pars2 = aoe_class.dt_res_dict["aoe_fit2"]["pars"] plt.subplot(2, 2, 2) - xs = np.linspace(aoe_pars2["lower_range"], aoe_pars2["upper_range"], 100) + xs = np.linspace(aoe_pars2["x_lo"], aoe_pars2["x_hi"], 100) counts, aoe_bins2, bars = plt.hist( final_df.query( - f'{aoe_class.dt_res_dict["aoe_grp2"]}&({aoe_param}<{aoe_pars2["upper_range"]})&({aoe_param}>{aoe_pars2["lower_range"]})' + f'{aoe_class.dt_res_dict["aoe_grp2"]}&({aoe_param}<{aoe_pars2["x_hi"]})&({aoe_param}>{aoe_pars2["x_lo"]})' )[aoe_param], bins=400, histtype="step", label="Data", ) dx = np.diff(aoe_bins2) - plt.plot(xs, aoe_class.pdf.pdf(xs, *aoe_pars2) * dx[0], label="full fit") - sig, bkg = aoe_class.pdf.pdf(xs, *aoe_pars2[:-1], True) + plt.plot(xs, aoe_class.pdf.get_pdf(xs, *aoe_pars2) * dx[0], label="full fit") + aoe_class.pdf.components = True + sig, bkg = aoe_class.pdf.get_pdf(xs, *aoe_pars2) + aoe_class.pdf.components = False plt.plot(xs, sig * dx[0], label="peak fit") plt.plot(xs, bkg * dx[0], label="bkg fit") plt.legend(loc="upper left") @@ -2288,25 +2006,18 @@ def drifttime_corr_plot( plt.subplot(2, 2, 3) plt.step(pgh.get_bin_centers(bins), hist, label="data") - plt.plot( - pgh.get_bin_centers(bins), - drift_time_distribution.pdf( - pgh.get_bin_centers(bins), **aoe_class.dt_res_dict["dt_guess"] - ) - * np.diff(bins)[0], - label="Guess", - ) - plt.plot( - pgh.get_bin_centers(bins), - drift_time_distribution.pdf( - pgh.get_bin_centers(bins), *aoe_class.dt_res_dict["dt_fit"]["pars"] + + mus = aoe_class.dt_res_dict["dt_fit"]["mus"] + sigmas = aoe_class.dt_res_dict["dt_fit"]["sigmas"] + amps = aoe_class.dt_res_dict["dt_fit"]["amps"] + + for mu, sigma, amp in zip(mus, sigmas, amps): + plt.plot( + pgh.get_bin_centers(bins), + gaussian.get_pdf(pgh.get_bin_centers(bins), mu, sigma) * amp, ) - * np.diff(bins)[0], - label="fit", - ) plt.xlabel("drift time (ns)") plt.ylabel("Counts") - plt.legend(loc="upper left") plt.subplot(2, 2, 4) bins = np.linspace( @@ -2323,7 +2034,7 @@ def drifttime_corr_plot( plt.legend(loc="upper left") plt.tight_layout() plt.xlim(bins[0], bins[-1]) - except: + except Exception: pass plt.close() return fig @@ -2338,7 +2049,7 @@ def plot_compt_bands_overlayed( title="Compton Bands", density=True, n_bins=50, - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ) -> None: """ @@ -2368,7 +2079,7 @@ def plot_compt_bands_overlayed( label=f"{erange[0]}-{erange[1]}", density=density, ) - except: + except Exception: pass plt.ylabel("counts") plt.xlabel(aoe_param) @@ -2384,9 +2095,9 @@ def plot_dt_dep( eranges: list[tuple], titles: list = None, aoe_param="AoE_Timecorr", - bins=[200, 100], + bins=(200, 100), dt_max=2000, - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ) -> None: """ @@ -2423,14 +2134,14 @@ def plot_dt_dep( plt.title(f"{erange[0]}-{erange[1]}") else: plt.title(titles[i]) - except: + except Exception: pass plt.tight_layout() 
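# --- editorial aside, not part of the patch -------------------------------
# drifttime_corr_plot above now toggles ``pdf.components`` around get_pdf()
# calls to split the fit into its signal and background pieces. A minimal
# sketch of how that toggle could be wrapped so the flag is always restored,
# assuming only that the distribution object exposes a mutable ``components``
# attribute and a ``get_pdf`` method, as used above:

from contextlib import contextmanager

@contextmanager
def pdf_components(pdf):
    previous = pdf.components
    pdf.components = True          # ask the distribution for per-component pdfs
    try:
        yield pdf
    finally:
        pdf.components = previous  # restore the caller's state even on error

# usage sketch:
#     with pdf_components(aoe_class.pdf) as p:
#         sig, bkg = p.get_pdf(xs, *aoe_pars)
# ---------------------------------------------------------------------------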
plt.close() return fig -def plot_mean_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: +def plot_mean_fit(aoe_class, data, figsize=(12, 8), fontsize=12) -> plt.figure: plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True) @@ -2439,7 +2150,7 @@ def plot_mean_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: aoe_class.energy_corr_fits.index, aoe_class.energy_corr_fits["mean"], yerr=aoe_class.energy_corr_fits["mean_err"], - xerr=aoe_class.comptBands_width / 2, + xerr=aoe_class.compt_bands_width / 2, label="data", linestyle=" ", ) @@ -2496,7 +2207,7 @@ def plot_mean_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: lw=1, c="g", ) - except: + except Exception: pass ax2.set_ylabel("residuals %", ha="right", y=1) ax2.set_xlabel("energy (keV)", ha="right", x=1) @@ -2505,7 +2216,7 @@ def plot_mean_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: return fig -def plot_sigma_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: +def plot_sigma_fit(aoe_class, data, figsize=(12, 8), fontsize=12) -> plt.figure: plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize @@ -2515,15 +2226,13 @@ def plot_sigma_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: aoe_class.energy_corr_fits.index, aoe_class.energy_corr_fits["sigma"], yerr=aoe_class.energy_corr_fits["sigma_err"], - xerr=aoe_class.comptBands_width / 2, + xerr=aoe_class.compt_bands_width / 2, label="data", linestyle=" ", ) - sig_pars = aoe_class.energy_corr_res_dict["sigma_fits"]["pars"] - if aoe_class.sigma_func == sigma_fit: + sig_pars = aoe_class.energy_corr_res_dict["SigmaFits"]["pars"] + if aoe_class.sigma_func == SigmaFit: label = f'sqrt model: \nsqrt({sig_pars["a"]:1.4f}+({sig_pars["b"]:1.1f}/E)^{sig_pars["c"]:1.1f})' - elif aoe_class.sigma_func == sigma_fit_quadratic: - label = f'quad model: \n({sig_pars["a"]:1.4f}+({sig_pars["b"]:1.6f}*E)+\n({sig_pars["c"]:1.6f}*E)^2)' else: raise ValueError("unknown sigma function") ax1.plot( @@ -2567,7 +2276,7 @@ def plot_sigma_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: lw=1, c="g", ) - except: + except Exception: pass ax2.set_ylabel("residuals", ha="right", y=1) ax2.set_xlabel("energy (keV)", ha="right", x=1) @@ -2576,7 +2285,9 @@ def plot_sigma_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: return fig -def plot_cut_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: +def plot_cut_fit( + aoe_class, data, dep_acc=0.9, figsize=(12, 8), fontsize=12 +) -> plt.figure: plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize fig = plt.figure() @@ -2590,12 +2301,12 @@ def plot_cut_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: plt.plot( aoe_class.cut_fits.index.to_numpy(), - sigmoid_fit.func( + SigmoidFit.func( aoe_class.cut_fits.index.to_numpy(), **aoe_class.cut_fit["pars"] ), ) plt.hlines( - (100 * aoe_class.dep_acc), + (100 * dep_acc), -8.1, aoe_class.low_cut_val, color="red", @@ -2604,7 +2315,7 @@ def plot_cut_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: plt.vlines( aoe_class.low_cut_val, np.nanmin(aoe_class.cut_fits["sf"]) * 0.9, - (100 * aoe_class.dep_acc), + (100 * dep_acc), color="red", linestyle="--", ) @@ -2612,7 +2323,7 @@ def plot_cut_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: vals, labels = plt.yticks() plt.yticks(vals, [f"{x:,.0f} %" for x in vals]) 
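# --- editorial aside, not part of the patch -------------------------------
# The "sqrt model" label in plot_sigma_fit above spells out the functional
# form sigma(E) = sqrt(a + (b/E)**c). A standalone sketch of that curve for
# reference; the parameter names follow the plotted label, and the actual
# SigmaFit class may implement the model differently:

import numpy as np

def sigma_sqrt_model(energy_kev, a, b, c):
    # constant A/E width plus a power-law term that grows at low energy
    return np.sqrt(a + (b / energy_kev) ** c)
# ---------------------------------------------------------------------------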
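# --- editorial aside, not part of the patch -------------------------------
# plot_cut_fit visualises the choice made in get_aoe_cut_fit: the sigmoid
# fitted to survival fraction (in %) versus cut value is evaluated on a fine
# grid, and the low cut is the grid point whose prediction is closest to
# 100 * dep_acc. A self-contained sketch of that lookup; ``sigmoid`` here is
# a stand-in for SigmoidFit.func, not an exact reproduction of it:

import numpy as np

def find_low_cut(sigmoid, pars, cut_lo, cut_hi, dep_acc=0.9, step=0.01):
    xs = np.arange(cut_lo, cut_hi, step)
    sf = sigmoid(xs, *pars)              # predicted survival fraction in %
    return round(xs[np.argmin(np.abs(sf - 100 * dep_acc))], 3)
# ---------------------------------------------------------------------------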
plt.ylim([np.nanmin(aoe_class.cut_fits["sf"]) * 0.9, 102]) - except: + except Exception: pass plt.xlabel("cut value") plt.ylabel("survival percentage") @@ -2621,7 +2332,7 @@ def plot_cut_fit(aoe_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: def plot_survival_fraction_curves( - aoe_class, data, figsize=[12, 8], fontsize=12 + aoe_class, data, figsize=(12, 8), fontsize=12 ) -> plt.figure: plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize @@ -2642,11 +2353,11 @@ def plot_survival_fraction_curves( survival_df.index, survival_df["sf"], yerr=survival_df["sf_err"], - label=f'{get_peak_label(peak)} {peak} keV: {aoe_class.low_side_sf.loc[peak]["sf"]:2.1f} +/- {aoe_class.low_side_sf.loc[peak]["sf_err"]:2.1f} %', + label=f'{get_peak_label(peak)} {peak} keV: {aoe_class.low_side_sfs.loc[peak]["sf"]:2.1f} +/- {aoe_class.low_side_sfs.loc[peak]["sf_err"]:2.1f} %', ) - except: + except Exception: pass - except: + except Exception: pass vals, labels = plt.yticks() plt.yticks(vals, [f"{x:,.0f} %" for x in vals]) @@ -2665,7 +2376,7 @@ def plot_spectra( n_bins=2101, xrange_inset=(1580, 1640), n_bins_inset=200, - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ) -> plt.figure: plt.rcParams["figure.figsize"] = figsize @@ -2736,7 +2447,7 @@ def plot_spectra( bins=bins, histtype="step", ) - except: + except Exception: pass ax.set_xlim(xrange) ax.set_yscale("log") @@ -2748,7 +2459,7 @@ def plot_spectra( def plot_sf_vs_energy( - aoe_class, data, xrange=(900, 3000), n_bins=701, figsize=[12, 8], fontsize=12 + aoe_class, data, xrange=(900, 3000), n_bins=701, figsize=(12, 8), fontsize=12 ) -> plt.figure: plt.rcParams["figure.figsize"] = figsize plt.rcParams["font.size"] = fontsize @@ -2769,7 +2480,7 @@ def plot_sf_vs_energy( survival_fracs = counts_pass / (counts + 10**-99) plt.step(pgh.get_bin_centers(bins_pass), 100 * survival_fracs) - except: + except Exception: pass plt.ylim([0, 100]) vals, labels = plt.yticks() @@ -2788,7 +2499,7 @@ def plot_classifier( yrange=(-50, 10), xn_bins=700, yn_bins=500, - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ) -> plt.figure: plt.rcParams["figure.figsize"] = figsize @@ -2805,7 +2516,7 @@ def plot_classifier( ], norm=LogNorm(), ) - except: + except Exception: pass plt.xlabel("energy (keV)") plt.ylabel(aoe_param) diff --git a/src/pygama/pargen/cuts.py b/src/pygama/pargen/cuts.py deleted file mode 100644 index 638199f64..000000000 --- a/src/pygama/pargen/cuts.py +++ /dev/null @@ -1,412 +0,0 @@ -""" -This module provides routines for calculating and applying quality cuts -""" - -from __future__ import annotations - -import glob -import json -import logging -import os - -import lgdo.lh5 as lh5 -import numpy as np -import pandas as pd -from lgdo.types import Table -from scipy import stats - -import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -import pygama.pargen.energy_cal as pgc - -log = logging.getLogger(__name__) - - -def get_keys(in_data, parameters): - out_params = [] - if isinstance(in_data, dict): - possible_keys = in_data.keys() - elif isinstance(in_data, list): - possible_keys = in_data - for param in parameters: - for key in possible_keys: - if key in param: - out_params.append(key) - return np.unique(out_params).tolist() - - -def generate_cuts( - data: dict[str, np.ndarray], parameters: dict[str, int], rounding: int = 4 -) -> dict: - """ - Finds double sided cut boundaries for a file for the parameters specified - - Parameters - ---------- - data : lh5 table or dictionary of arrays - data to calculate cuts 
on - parameters : dict - dictionary with the parameter to be cut and the number of sigmas to cut at - """ - - output_dict = {} - if isinstance(data, pd.DataFrame): - pass - elif isinstance(data, Table): - data = {entry: data[entry].nda for entry in get_keys(data, parameters)} - data = pd.DataFrame.from_dict(data) - elif isinstance(data, dict): - data = pd.DataFrame.from_dict(data) - for par in parameters.keys(): - if isinstance(parameters[par], dict): - if "Lower Boundary" in list(parameters[par]) or "Upper Boundary" in list( - parameters[par] - ): - output_dict[par] = parameters[par].copy() - if "Lower Boundary" not in parameters[par]: - output_dict[par]["Lower Boundary"] = -np.inf - if "Upper Boundary" not in parameters[par]: - output_dict[par]["Upper Boundary"] = np.inf - continue - num_sigmas = parameters[par] - try: - all_par_array = data[par].to_numpy() - except KeyError: - all_par_array = data.eval(par).to_numpy() - idxs = (all_par_array > np.nanpercentile(all_par_array, 1)) & ( - all_par_array < np.nanpercentile(all_par_array, 99) - ) - par_array = all_par_array[idxs] - bin_width = ( - np.nanpercentile(par_array, 70) - np.nanpercentile(par_array, 50) - ) / 5 - - counts, start_bins, var = pgh.get_hist( - par_array, range=(np.nanmin(par_array), np.nanmax(par_array)), dx=bin_width - ) - max_idx = np.argmax(counts) - mu = start_bins[max_idx] - try: - fwhm = pgh.get_fwhm(counts, start_bins)[0] - guess_sig = fwhm / 2.355 - - lower_bound = mu - 10 * guess_sig - - upper_bound = mu + 10 * guess_sig - - except: - bin_range = 1000 - - if max_idx < bin_range: - lower_bound_idx = 0 - else: - lower_bound_idx = max_idx - bin_range - lower_bound = start_bins[lower_bound_idx] - - if max_idx > len(start_bins) - bin_range: - upper_bound_idx = -1 - else: - upper_bound_idx = max_idx + bin_range - - upper_bound = start_bins[upper_bound_idx] - - if (lower_bound < np.nanmin(par_array)) or (lower_bound > np.nanmax(par_array)): - lower_bound = np.nanmin(par_array) - if (upper_bound > np.nanmax(par_array)) or (upper_bound < np.nanmin(par_array)): - upper_bound = np.nanmax(par_array) - - try: - counts, bins, var = pgh.get_hist( - par_array, bins=200, range=(lower_bound, upper_bound) - ) - - bin_centres = pgh.get_bin_centers(bins) - - fwhm = pgh.get_fwhm(counts, bins)[0] - mean = float(bin_centres[np.argmax(counts)]) - pars, cov = pgf.gauss_mode_width_max( - counts, - bins, - mode_guess=mean, - n_bins=20, - cost_func="Least Squares", - inflate_errors=False, - gof_method="var", - ) - mean = pars[0] - std = fwhm / 2.355 - - if mean < np.nanmin(bins) or mean > np.nanmax(bins): - raise IndexError - except IndexError: - bin_range = 5000 - - if max_idx < bin_range: - lower_bound_idx = 0 - else: - lower_bound_idx = max_idx - bin_range - lower_bound = start_bins[lower_bound_idx] - - if max_idx > len(start_bins) - bin_range: - upper_bound_idx = -1 - else: - upper_bound_idx = max_idx + bin_range - upper_bound = start_bins[upper_bound_idx] - counts, bins, var = pgh.get_hist( - par_array, bins=200, range=(lower_bound, upper_bound) - ) - - bin_centres = pgh.get_bin_centers(bins) - - fwhm = pgh.get_fwhm(counts, bins)[0] - mean = float(bin_centres[np.argmax(counts)]) - std = fwhm / 2.355 - - if isinstance(num_sigmas, (int, float)): - num_sigmas_left = num_sigmas - num_sigmas_right = num_sigmas - elif isinstance(num_sigmas, dict): - if "left" in num_sigmas: - num_sigmas_left = num_sigmas["left"] - else: - num_sigmas["left"] = np.inf - num_sigmas_left = np.inf - if "right" in num_sigmas: - num_sigmas_right = 
num_sigmas["right"] - else: - num_sigmas["right"] = np.inf - num_sigmas_right = np.inf - upper = float((num_sigmas_right * std) + mean) - lower = float((-num_sigmas_left * std) + mean) - output_dict[par] = { - "Mean Value": round(mean, rounding), - "Sigmas Cut": num_sigmas, - "Upper Boundary": round(upper, rounding), - "Lower Boundary": round(lower, rounding), - } - return output_dict - - -def get_cut_indexes( - all_data: dict[str, np.ndarray], cut_dict: dict, energy_param: str = "trapTmax" -) -> list[int]: - """ - Returns a mask of the data, for a single file, that passes cuts based on dictionary of cuts - in form of cut boundaries above - Parameters - ---------- - File : dict or lh5_table - dictionary of parameters + array such as load_nda or lh5 table of params - Cut_dict : string - Dictionary file with cuts - """ - - indexes = None - keys = cut_dict.keys() - if isinstance(all_data, pd.DataFrame): - pass - elif isinstance(all_data, Table): - cut_keys = list(cut_dict) - cut_keys.append(energy_param) - all_data = { - entry: all_data[entry].nda for entry in get_keys(all_data, cut_keys) - } - all_data = pd.DataFrame.from_dict(all_data) - elif isinstance(all_data, dict): - all_data = pd.DataFrame.from_dict(all_data) - for cut in keys: - try: - data = all_data[cut] - except KeyError: - data = all_data.eval(cut).to_numpy() - upper = cut_dict[cut]["Upper Boundary"] - lower = cut_dict[cut]["Lower Boundary"] - idxs = (data < upper) & (data > lower) & (~np.isnan(data)) - percent = 100 * len(np.where(idxs)[0]) / len(idxs) - log.info(f"{percent:.2f}% passed {cut} cut") - - # Combine masks - if indexes is not None: - indexes = indexes & idxs - - else: - indexes = idxs - log.debug(f"{cut} loaded") - percent = 100 * len(np.where(indexes)[0]) / len(indexes) - log.info(f"{percent:.2f}% passed all cuts") - return indexes - - -def cut_dict_to_hit_dict(cut_dict, final_cut_field="is_valid_cal"): - out_dict = {} - symbols = "/-+*" - replacewith = "_" - for i, param in enumerate(cut_dict): - out_dict[ - f"{''.join(replacewith if c in symbols else c for c in param).replace('(','').replace(')','')}_cut" - ] = { - "expression": f"(a<({param}))&(({param}) 8000: - hist, bins, var = pgh.get_hist( - df[energy], dx=1, range=(1000, np.nanmax(df[energy])) - ) - allowed_err = 200 - else: - hist, bins, var = pgh.get_hist( - df[energy], dx=0.2, range=(500, np.nanmax(df[energy])) - ) - allowed_err = 50 - if np.any(var == 0): - var[np.where(var == 0)] = 1 - imaxes = pgc.get_i_local_maxima(hist / np.sqrt(var), 3) - peak_energies = pgh.get_bin_centers(bins)[imaxes] - pt_pars, pt_covs = pgc.hpge_fit_E_peak_tops( - hist, bins, var, peak_energies, n_to_fit=10 - ) - peak_e_err = pt_pars[:, 1] * 4 - - allowed_mask = np.ones(len(peak_energies), dtype=bool) - for i, e in enumerate(peak_energies[1:-1]): - i += 1 - if peak_e_err[i] > allowed_err: - continue - if i == 1: - if ( - e - peak_e_err[i] < peak_energies[i - 1] + peak_e_err[i - 1] - and peak_e_err[i - 1] < allowed_err - ): - overlap = ( - peak_energies[i - 1] - + peak_e_err[i - 1] - - (peak_energies[i] - peak_e_err[i]) - ) - peak_e_err[i] -= overlap * ( - peak_e_err[i] / (peak_e_err[i] + peak_e_err[i - 1]) - ) - peak_e_err[i - 1] -= overlap * ( - peak_e_err[i - 1] / (peak_e_err[i] + peak_e_err[i - 1]) - ) - - if ( - e + peak_e_err[i] > peak_energies[i + 1] - peak_e_err[i + 1] - and peak_e_err[i + 1] < allowed_err - ): - overlap = (e + peak_e_err[i]) - (peak_energies[i + 1] - peak_e_err[i + 1]) - total = peak_e_err[i] + peak_e_err[i + 1] - peak_e_err[i] -= (overlap) * 
(peak_e_err[i] / total) - peak_e_err[i + 1] -= (overlap) * (peak_e_err[i + 1] / total) - - out_pulsers = [] - for i, e in enumerate(peak_energies[allowed_mask]): - if peak_e_err[i] > allowed_err: - continue - - try: - e_cut = (df[energy] > e - peak_e_err[i]) & (df[energy] < e + peak_e_err[i]) - df_peak = df[e_cut] - - time_since_last = ( - df_peak.timestamp.values[1:] - df_peak.timestamp.values[:-1] - ) - - tsl = time_since_last[ - (time_since_last >= 0) - & (time_since_last < np.percentile(time_since_last, 99.9)) - ] - - bins = np.arange(0.1, 5, 0.001) - bcs = pgh.get_bin_centers(bins) - hist, bins, var = pgh.get_hist(tsl, bins=bins) - - maxs = pgc.get_i_local_maxima(hist, 45) - maxs = maxs[maxs > 20] - - super_max = pgc.get_i_local_maxima(hist, 500) - super_max = super_max[super_max > 20] - if len(maxs) < 2: - continue - else: - max_locs = np.array([0.0]) - max_locs = np.append(max_locs, bcs[np.array(maxs)]) - if ( - len(np.where(np.abs(np.diff(np.diff(max_locs))) <= 0.001)[0]) > 1 - or (np.abs(np.diff(np.diff(max_locs))) <= 0.001).all() - or len(super_max) > 0 - ): - pulser_e = e - period = stats.mode(tsl).mode[0] - if period > 0.1: - out_pulsers.append((pulser_e, peak_e_err[i], period, energy)) - - else: - continue - except: - continue - return out_pulsers - - -def tag_pulsers(df, chan_info, window=0.01): - df["isPulser"] = 0 - - if isinstance(chan_info, tuple): - chan_info = [chan_info] - final_mask = None - for chan_i in chan_info: - pulser_energy, peak_e_err, period, energy_name = chan_i - - e_cut = (df[energy_name] < pulser_energy + peak_e_err) & ( - df[energy_name] > pulser_energy - peak_e_err - ) - df_pulser = df[e_cut] - - time_since_last = np.zeros(len(df_pulser)) - time_since_last[1:] = ( - df_pulser.timestamp.values[1:] - df_pulser.timestamp.values[:-1] - ) - - mode_idxs = (time_since_last > period - window) & ( - time_since_last < period + window - ) - - pulser_events = np.count_nonzero(mode_idxs) - # print(f"pulser events: {pulser_events}") - if pulser_events < 3: - return df - df_pulser = df_pulser[mode_idxs] - - ts = df_pulser.timestamp.values - diff_zero = np.zeros(len(ts)) - diff_zero[1:] = np.around(np.divide(np.subtract(ts[1:], ts[:-1]), period)) - diff_cum = np.cumsum(diff_zero) - z = np.polyfit(diff_cum, ts, 1) - p = np.poly1d(z) - - period = z[0] - phase = z[1] - pulser_mod = np.abs(df_pulser.timestamp - phase) % period - mod = np.abs(df.timestamp - phase) % period - - period_cut = (mod < 0.1) | ((period - mod) < 0.1) # 0.1) - - if final_mask is None: - final_mask = e_cut & period_cut - else: - final_mask = final_mask | (e_cut & period_cut) - - df.loc[final_mask, "isPulser"] = 1 - - return df diff --git a/src/pygama/pargen/data_cleaning.py b/src/pygama/pargen/data_cleaning.py index 4a1ceb93e..91e4eb4ad 100644 --- a/src/pygama/pargen/data_cleaning.py +++ b/src/pygama/pargen/data_cleaning.py @@ -1,207 +1,1024 @@ """ -mainly pulser tagging -- gaussian_cut (fits data to a gaussian, returns mean +/- cut_sigma values) -- xtalball_cut (fits data to a crystalball, returns mean +/- cut_sigma values) -- find_pulser_properties (find pulser by looking for which peak has a constant time between events) -- tag_pulsers +This module provides routines for calculating and applying quality cuts """ -import matplotlib.gridspec as gs + +from __future__ import annotations + +import logging +import re + +import lgdo.lh5 as lh5 +import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np +import pandas as pd +from lgdo.types import Table from scipy import stats +from 
scipy.stats import chi2, skewnorm + +import pygama.math.binned_fitting as pgf +import pygama.math.histogram as pgh +import pygama.pargen.energy_cal as pgc +from pygama.math.binned_fitting import goodness_of_fit +from pygama.math.distributions import exgauss, gaussian +from pygama.math.functions.sum_dists import sum_dists +from pygama.math.unbinned_fitting import fit_unbinned + +(x_lo, x_hi, n_sig, mu, sigma, n_bkg, tau) = range(7) +par_array = [(gaussian, [mu, sigma]), (exgauss, [mu, sigma, tau])] +gauss_on_exgauss_areas = sum_dists( + par_array, + [n_sig, n_bkg], + "areas", + parameter_names=["x_lo", "x_hi", "n_sig", "mu", "sigma", "n_bkg", "tau"], + name="gauss_on_exgauss_areas", +) + +(x_lo, x_hi, n_sig, mu, sigma, tau1, n_bkg, tau2) = range(8) +par_array = [(exgauss, [mu, sigma, tau1]), (exgauss, [mu, sigma, tau2])] +double_exgauss = sum_dists( + par_array, + [n_sig, n_bkg], + "areas", + parameter_names=["x_lo", "x_hi", "n_sig", "mu", "sigma", "tau1", "n_bkg", "tau2"], + name="double_exgauss", +) + + +def skewed_fit(x, n_sig, mu, sigma, alpha): + return n_sig, n_sig * skewnorm.pdf(x, alpha, mu, sigma) -from pygama.math.peak_fitting import * +def skewed_pdf(x, n_sig, mu, sigma, alpha): + return n_sig * skewnorm.pdf(x, alpha, mu, sigma) -def gaussian_cut(data, cut_sigma=3, plotAxis=None): + +log = logging.getLogger(__name__) +sto = lh5.LH5Store() +mpl.use("agg") + + +def get_keys(in_data, cut_dict): """ - fits data to a gaussian, returns mean +/- cut_sigma values for a cut + Get the keys of the data that are used in the cut dictionary """ + parameters = [] + for _, entry in cut_dict.items(): + if "cut_parameter" in entry: + parameters.append(entry["cut_parameter"]) + else: + parameters.append(entry["expression"]) - nbins = 100 + out_params = [] + if isinstance(in_data, dict): + possible_keys = in_data.keys() + elif isinstance(in_data, list): + possible_keys = in_data + for param in parameters: + for key in possible_keys: + if key in param: + out_params.append(key) + return np.unique(out_params).tolist() - median = np.median(data) - width = np.percentile(data, 80) - np.percentile(data, 20) - good_data = data[(data > (median - 4 * width)) & (data < (median + 4 * width))] +def get_mode_stdev(par_array): + idxs = (par_array > np.nanpercentile(par_array, 1)) & ( + par_array < np.nanpercentile(par_array, 99) + ) + par_array = par_array[idxs] + bin_width = np.nanpercentile(par_array, 55) - np.nanpercentile(par_array, 50) + + counts, start_bins, var = pgh.get_hist( + par_array, + range=(np.nanmin(par_array), np.nanmax(par_array)), + dx=bin_width, + ) + max_idx = np.argmax(counts) + mu = start_bins[max_idx] + try: + fwhm = pgh.get_fwhm(counts, start_bins)[0] + guess_sig = fwhm / 2.355 - hist, bins = np.histogram(good_data, bins=101) # np.linspace(1,5,101) - bin_centers = bins[:-1] + (bins[1] - bins[0]) / 2 + lower_bound = mu - 10 * guess_sig - # fit gaussians to that - # result = fit_unbinned(gauss, hist, [median, width/2] ) - # print("unbinned: {}".format(result)) + upper_bound = mu + 10 * guess_sig + + except Exception: + lower_bound = np.nanpercentile(par_array, 5) + upper_bound = np.nanpercentile(par_array, 95) + + if (lower_bound < np.nanmin(par_array)) or (lower_bound > np.nanmax(par_array)): + lower_bound = np.nanmin(par_array) + if (upper_bound > np.nanmax(par_array)) or (upper_bound < np.nanmin(par_array)): + upper_bound = np.nanmax(par_array) + + try: + counts, bins, var = pgh.get_hist( + par_array, + dx=(np.nanpercentile(par_array, 52) - np.nanpercentile(par_array, 50)), + 
range=(lower_bound, upper_bound), + ) + + bin_centres = pgh.get_bin_centers(bins) + + fwhm = pgh.get_fwhm(counts, bins)[0] + mean = float(bin_centres[np.argmax(counts)]) + pars, cov = pgf.gauss_mode_width_max( + counts, + bins, + mode_guess=mean, + n_bins=20, + cost_func="Least Squares", + inflate_errors=False, + gof_method="var", + ) + mean = pars[0] + std = fwhm / 2.355 + + if ( + mean < np.nanmin(bins) + or mean > np.nanmax(bins) + or (mean + std) < mu + or (mean - std) > mu + ): + raise IndexError + except IndexError: + try: + fwhm = pgh.get_fwhm(counts, bins)[0] + mean = float(bin_centres[np.argmax(counts)]) + std = fwhm / 2.355 + except Exception: + lower_bound = np.nanpercentile(par_array, 5) + upper_bound = np.nanpercentile(par_array, 95) + + counts, bins, var = pgh.get_hist( + par_array, + dx=np.nanpercentile(par_array, 52) - np.nanpercentile(par_array, 50), + range=(lower_bound, upper_bound), + ) + + bin_centres = pgh.get_bin_centers(bins) + + try: + fwhm = pgh.get_fwhm(counts, bins)[0] + mean = float(bin_centres[np.argmax(counts)]) + std = fwhm / 2.355 + except Exception: + mean = float(bin_centres[np.argmax(counts)]) + std = np.nanstd(par_array) + return mean, std + + +def fit_distributions(x_lo, x_hi, norm_par_array, display=0): + peak_par_array = norm_par_array[(norm_par_array > x_lo) & (norm_par_array < x_hi)] + + hist, bins, var = pgh.get_hist(peak_par_array, dx=0.1, range=(x_lo, x_hi)) + var = np.where(var == 0, 1, var) + + exgauss_pars, _, _ = fit_unbinned( + exgauss.pdf_ext, + peak_par_array, + [x_lo, x_hi, len(peak_par_array), 0, 1, -0.1], + simplex=True, + bounds=[ + (None, None), + (None, None), + (0, None), + (-0.8, 0.8), + (0.8, 1.2), + (None, None), + ], + fixed=["x_lo", "x_hi"], + ) - result = fit_binned( - gauss, + gauss_pars, _, _ = fit_unbinned( + gaussian.pdf_ext, + peak_par_array, + [x_lo, x_hi, len(peak_par_array), 0, 1], + simplex=True, + bounds=[(None, None), (None, None), (0, None), (-0.5, 0.5), (0.5, 1.5)], + fixed=["x_lo", "x_hi"], + ) + + gauss_on_exgauss_pars, _, _ = fit_unbinned( + gauss_on_exgauss_areas.pdf_ext, + peak_par_array, + [x_lo, x_hi, len(peak_par_array) * 0.9, 0, 1, len(peak_par_array) * 0.1, -0.1], + simplex=True, + bounds=[ + (None, None), + (None, None), + (0, None), + (-0.5, 0.5), + (0, None), + (0, None), + (None, None), + ], + fixed=["x_lo", "x_hi"], + ) + + skewed_pars, _, _ = fit_unbinned( + skewed_fit, + peak_par_array, + [len(peak_par_array), 0, 1, 0.1], + simplex=True, + bounds=[(0, None), (None, None), (0, None), (None, None)], + ) + + double_exgauss_pars, _, _ = fit_unbinned( + double_exgauss.pdf_ext, + peak_par_array, + [ + x_lo, + x_hi, + len(peak_par_array) * 0.5, + 0, + 1, + -0.1, + len(peak_par_array) * 0.5, + 0.1, + ], + simplex=True, + bounds=[ + (None, None), + (None, None), + (0, None), + (-0.5, 0.5), + (0, None), + (None, 0), + (0, None), + (0, None), + ], + fixed=["x_lo", "x_hi"], + ) + + gauss_csqr = goodness_of_fit( + hist, + bins, + var, + lambda x, *args: gaussian.pdf_ext(x, *args)[1], + gauss_pars, + method="var", + scale_bins=True, + ) + + exgauss_csqr = goodness_of_fit( + hist, + bins, + var, + lambda x, *args: exgauss.pdf_ext(x, *args)[1], + exgauss_pars, + method="var", + scale_bins=True, + ) + + skewed_csqr = goodness_of_fit( + hist, + bins, + var, + lambda x, *args: skewed_fit(x, *args)[1], + skewed_pars, + method="var", + scale_bins=True, + ) + + gauss_on_exgauss_csqr = goodness_of_fit( hist, - bin_centers, - [median, width / 2, np.amax(hist) * (width / 2) * np.sqrt(2 * np.pi)], + bins, + var, + 
gauss_on_exgauss_areas.get_pdf, + gauss_on_exgauss_pars, + method="var", + scale_bins=True, ) - # print("binned: {}".format(result)) - cut_lo = result[0] - cut_sigma * result[1] - cut_hi = result[0] + cut_sigma * result[1] - if plotAxis is not None: - plotAxis.plot(bin_centers, hist, ls="steps-mid", color="k", label="data") - fit = gauss(bin_centers, *result) - plotAxis.plot(bin_centers, fit, label="gaussian fit") - plotAxis.axvline(result[0], color="g", label="fit mean") - plotAxis.axvline(cut_lo, color="r", label=f"+/- {cut_sigma} sigma") - plotAxis.axvline(cut_hi, color="r") - plotAxis.legend() - # plt.xlabel(params[i]) + double_exgauss_csqr = goodness_of_fit( + hist, + bins, + var, + double_exgauss.get_pdf, + double_exgauss_pars, + method="var", + scale_bins=True, + ) - return cut_lo, cut_hi, result[0], cut_sigma + if display > 0: + bcs = pgh.get_bin_centers(bins) + plt.figure() + plt.step(bcs, hist) + plt.plot( + bcs, double_exgauss.get_pdf(bcs, *double_exgauss_pars) * np.diff(bins)[0] + ) + plt.plot( + bcs, + gauss_on_exgauss_areas.pdf_ext(bcs, *gauss_on_exgauss_pars)[1] + * np.diff(bins)[0], + ) + plt.plot(bcs, skewed_fit(bcs, *skewed_pars)[1] * np.diff(bins)[0]) + plt.plot(bcs, gaussian.pdf_ext(bcs, *gauss_pars)[1] * np.diff(bins)[0]) + plt.plot(bcs, exgauss.pdf_ext(bcs, *exgauss_pars)[1] * np.diff(bins)[0]) + plt.show() + gauss_p_val = chi2.sf(gauss_csqr[0], gauss_csqr[1] + 2) + exgauss_p_val = chi2.sf(exgauss_csqr[0], exgauss_csqr[1] + 2) + skewed_p_val = chi2.sf(skewed_csqr[0], skewed_csqr[1]) + gauss_on_exgauss_p_val = chi2.sf( + gauss_on_exgauss_csqr[0], gauss_on_exgauss_csqr[1] + 2 + ) + double_exgauss_p_val = chi2.sf(double_exgauss_csqr[0], double_exgauss_csqr[1] + 2) -def xtalball_cut(data, cut_sigma=3, plotFigure=None): + funcs = [gaussian, exgauss, skewed_fit, gauss_on_exgauss_areas, double_exgauss] + pars = [ + gauss_pars, + exgauss_pars, + skewed_pars, + gauss_on_exgauss_pars, + double_exgauss_pars, + ] + pvals = np.array( + [ + gauss_p_val, + exgauss_p_val, + skewed_p_val, + gauss_on_exgauss_p_val, + double_exgauss_p_val, + ] + ) + csqrs = [ + gauss_csqr[0], + exgauss_csqr[0], + skewed_csqr[0], + gauss_on_exgauss_csqr[0], + double_exgauss_csqr[0], + ] + + if (pvals == 0).all(): + idx = np.nanargmin(csqrs) + else: + idx = np.nanargmax(pvals) + func = funcs[idx] + pars = pars[idx] + return func, pars + + +def generate_cuts( + data: dict[str, np.ndarray], + cut_dict: dict[str, int], + rounding: int = 4, + display: int = 0, +) -> dict: """ - fits data to a crystalball, returns mean +/- cut_sigma values for a cut + Finds double sided cut boundaries for a file for the parameters specified + + Parameters + ---------- + data : lh5 table, dictionary of arrays or pandas dataframe + data to calculate cuts on + parameters : dict + dictionary of the form: + { + "output_parameter_name": { + "cut_parameter": "parameter_to_cut_on", + "cut_level": number_of_sigmas, + "mode": "inclusive" or "exclusive" + } + } + number of sigmas can instead be a dictionary to specify different cut levels for low and high side + or to only have a one sided cut only specify one of the low or high side + e.g. + { + "output_parameter_name": { + "cut_parameter": "parameter_to_cut_on", + "cut_level": {"low_side": 3, "high_side": 2}, + "mode": "inclusive" or "exclusive" + } + } + alternatively can specify hit dict fields to just copy dict into output dict e.g. 
+ { + "is_valid_t0":{ + "expression":"(tp_0_est>a)&(tp_0_est (median - 4 * width)) & (data < (median + 4 * width))] + if isinstance(num_sigmas, (int, float)): + num_sigmas_left = num_sigmas + num_sigmas_right = num_sigmas + elif isinstance(num_sigmas, dict): + if "low_side" in num_sigmas: + num_sigmas_left = num_sigmas["low_side"] + else: + num_sigmas_left = None + if "high_side" in num_sigmas: + num_sigmas_right = num_sigmas["high_side"] + else: + num_sigmas_right = None + upper = round(float((num_sigmas_right * std) + mean), rounding) + lower = round(float((-num_sigmas_left * std) + mean), rounding) + if mode == "inclusive": + if upper is not None and lower is not None: + cut_string = f"({par}>a) & ({par}a" + par_dict = {"a": lower} + elif lower is None: + cut_string = f"{par}b)" + par_dict = {"a": lower, "b": upper} + elif upper is None: + cut_string = f"{par}a" + par_dict = {"a": upper} - hist, bins = np.histogram(good_data, bins=101) # np.linspace(1,5,101) - bin_centers = bins[:-1] + (bins[1] - bins[0]) / 2 + output_dict[out_par] = {"expression": cut_string, "parameters": par_dict} - # fit gaussians to that - # result = fit_unbinned(gauss, hist, [median, width/2] ) - # print("unbinned: {}".format(result)) - p0 = get_gaussian_guess(hist, bin_centers) - bounds = [ - (p0[0] * 0.5, p0[1] * 0.5, p0[2] * 0.2, 0, 1), - (p0[0] * 1.5, p0[1] * 1.5, p0[2] * 5, np.inf, np.inf), - ] - result = fit_binned( - xtalball, hist, bin_centers, [p0[0], p0[1], p0[2], 10, 1], bounds=bounds - ) - # print("binned: {}".format(result)) - cut_lo = result[0] - cut_sigma * result[1] - cut_hi = result[0] + cut_sigma * result[1] - - if plotFigure is not None: - plt.figure(plotFigure.number) - plt.plot(bin_centers, hist, ls="steps-mid", color="k", label="data") - fit = xtalball(bin_centers, *result) - plt.plot(bin_centers, fit, label="xtalball fit") - plt.axvline(result[0], color="g", label="fit mean") - plt.axvline(cut_lo, color="r", label=f"+/- {cut_sigma} sigma") - plt.axvline(cut_hi, color="r") - plt.legend() - # plt.xlabel(params[i]) - - return cut_lo, cut_hi - - -def find_pulser_properties(df, energy="trap_max"): - from .calibration import get_most_prominent_peaks - - # print (df[energy]) - # exit() - # find pulser by looking for which peak has a constant time between events - # df should already be grouped by channel - - peak_energies, peak_e_err = get_most_prominent_peaks(df[energy], max_num_peaks=10) - peak_e_err *= 3 - - for e in peak_energies: - e_cut = (df[energy] > e - peak_e_err) & (df[energy] < e + peak_e_err) - df_peak = df[e_cut] - # df_after_0 = df_peak.iloc[1:] - time_since_last = df_peak.timestamp.values[1:] - df_peak.timestamp.values[:-1] - - tsl = time_since_last[ - (time_since_last >= 0) - & (time_since_last < np.percentile(time_since_last, 99.9)) - ] - last_ten = np.percentile(tsl, 97) - np.percentile(tsl, 90) - first_ten = np.percentile(tsl, 10) - np.percentile(tsl, 3) - # print("{:e}, {:e}".format(last_ten,first_ten)) + if display > 0: + fig = plt.figure() + low_val = np.nanpercentile(all_par_array, 5) + up_val = np.nanpercentile(all_par_array, 95) + if upper is not None: + plt.axvline(upper) + if up_val < upper: + up_val = upper + if lower is not None: + plt.axvline(lower) + if low_val > lower: + low_val = lower - if last_ten > first_ten: - # print("...no pulser?") - continue + plt.hist( + all_par_array, + bins=np.linspace( + low_val, + up_val, + 100, + ), + histtype="step", + ) + + plt.ylabel("counts") + plt.xlabel(out_par) + plot_dict[out_par] = fig + plt.close() + if display > 0: + return 
output_dict, plot_dict + else: + return output_dict + + +def get_cut_indexes(data, cut_parameters): + """ + Get the indexes of the data that pass the cuts in + """ + cut_dict = generate_cuts(data, cut_dict=cut_parameters) + log.debug(f"Cuts are {cut_dict}") + + if isinstance(data, Table): + ct_mask = np.full(len(data), True, dtype=bool) + for outname, info in cut_dict.items(): + outcol = data.eval(info["expression"], info.get("parameters", None)) + data.add_column(outname, outcol) + log.debug("Applied Cuts") + + for cut in cut_dict: + ct_mask = data[cut].nda & ct_mask + elif isinstance(data, pd.DataFrame): + ct_mask = np.full(len(data), True, dtype=bool) + + for outname, info in cut_dict.items(): + # convert to pandas eval + exp = info["expression"] + for key in info.get("parameters", None): + exp = re.sub(f"(? dict: + """ + Finds double sided cut boundaries for a file for the parameters specified + + Parameters + ---------- + data : lh5 table, dictionary of arrays or pandas dataframe + data to calculate cuts on + parameters : dict + dictionary of the form: + { + "output_parameter_name": { + "cut_parameter": "parameter_to_cut_on", + "cut_level": number_of_sigmas, + "mode": "inclusive" or "exclusive" + } + } + number of sigmas can instead be a dictionary to specify different cut levels for low and high side + or to only have a one sided cut only specify one of the low or high side + e.g. + { + "output_parameter_name": { + "cut_parameter": "parameter_to_cut_on", + "cut_level": {"low_side": 3, "high_side": 2}, + "mode": "inclusive" or "exclusive" + } + } + alternatively can specify hit dict fields to just copy dict into output dict e.g. + { + "is_valid_t0":{ + "expression":"(tp_0_est>a)&(tp_0_est pulser_energy - peak_e_err - ) - df_pulser = df[e_cut] + if isinstance(percentile, (int, float)): + cut_left = xs[np.argmin(np.abs(cdf - (1 - (percentile / 100))))] + cut_right = xs[np.argmin(np.abs(cdf - (percentile / 100)))] - time_since_last = np.zeros(len(df_pulser)) - time_since_last[1:] = ( - df_pulser.timestamp.values[1:] - df_pulser.timestamp.values[:-1] - ) + elif isinstance(percentile, dict): + if "low_side" in percentile: + cut_left = xs[ + np.argmin(np.abs(cdf - (1 - (percentile / 100)))) + ] + else: + cut_left = None + if "high_side" in percentile: + cut_right = xs[np.argmin(np.abs(cdf - (percentile / 100)))] + else: + cut_right = None + + else: + if isinstance(percentile, (int, float)): + cut_left = np.nanpercentile(norm_par_array, 100 - percentile) + cut_right = np.nanpercentile(norm_par_array, percentile) + + elif isinstance(percentile, dict): + if "low_side" in percentile: + cut_left = np.nanpercentile(norm_par_array, percentile) + else: + cut_left = None + if "high_side" in percentile: + cut_right = np.nanpercentile(norm_par_array, percentile) + else: + cut_right = None + + if default is not None: + value = default["value"] + default_mode = default["mode"] + if isinstance(value, (int, float)): + default_cut_left = -value + default_cut_right = value + else: + if "low_side" in default: + default_cut_left = value["low_side"] + else: + default_cut_left = np.nan + if "high_side" in default: + default_cut_right = value["high_side"] + else: + default_cut_right = np.nan + + if default_mode == "higher_limit": + if cut_left is not None: + if cut_left < default_cut_left: + cut_left = default_cut_left + if cut_right is not None: + if cut_right > default_cut_right: + cut_right = default_cut_right + elif default_mode == "lower_limit": + if cut_left is not None: + if cut_left > default_cut_left: + 
cut_left = default_cut_left + if cut_right is not None: + if cut_right < default_cut_right: + cut_right = default_cut_right + else: + raise ValueError("unknown mode") - # plt.figure() - # plt.hist(time_since_last, bins=1000) - # plt.show() + if mode == "inclusive": + if cut_right is not None and cut_left is not None: + cut_string = f"({out_par}_classifier>a) & ({out_par}_classifiera" + par_dict = {"a": cut_left} + elif cut_left is None: + cut_string = f"{out_par}_classifierb)" + par_dict = {"a": cut_left, "b": cut_right} + elif cut_right is None: + cut_string = f"{out_par}_classifiera" + par_dict = {"a": cut_right} - mode_idxs = (time_since_last > period - window) & ( - time_since_last < period + window + output_dict[f"{out_par}_classifier"] = { + "expression": f"(({par})-a)/b", + "parameters": {"a": mean, "b": std}, + } + + output_dict[out_par] = {"expression": cut_string, "parameters": par_dict} + if display > 0: + fig = plt.figure() + low = -10 if cut_left is None or cut_left > -10 else cut_left + hi = 10 if cut_right is None or cut_right < 10 else cut_right + hist, _, _ = plt.hist( + norm_par_array, + bins=np.arange(low, hi, 0.1), + histtype="step", + ) + if percentile is not None and method == "fit": + xs = np.arange(low, hi, 0.1) + if func == skewed_fit: + pdf_values = func(xs, *pars)[1] * 0.1 + else: + pdf_values = func.pdf_ext(xs, *pars)[1] * 0.1 + plt.plot(xs, pdf_values) + if cut_left is not None: + plt.axvline(cut_left) + if cut_right is not None: + plt.axvline(cut_right) + + plt.ylabel("counts") + plt.xlabel(f"{out_par}_classifier") + plot_dict[out_par] = fig + plt.close() + if display > 0: + return output_dict, plot_dict + else: + return output_dict + + +def find_pulser_properties(df, energy="daqenergy"): + """ + Searches for pulser in the energy spectrum using time between events in peaks + """ + if np.nanmax(df[energy]) > 8000: + hist, bins, var = pgh.get_hist( + df[energy], dx=1, range=(1000, np.nanmax(df[energy])) + ) + allowed_err = 200 + else: + hist, bins, var = pgh.get_hist( + df[energy], dx=0.2, range=(500, np.nanmax(df[energy])) + ) + allowed_err = 50 + if np.any(var == 0): + var[np.where(var == 0)] = 1 + imaxes = pgc.get_i_local_maxima(hist / np.sqrt(var), 3) + peak_energies = pgh.get_bin_centers(bins)[imaxes] + pt_pars, pt_covs = pgc.hpge_fit_E_peak_tops( + hist, bins, var, peak_energies, n_to_fit=10 ) + peak_e_err = pt_pars[:, 1] * 4 + + allowed_mask = np.ones(len(peak_energies), dtype=bool) + for i, e in enumerate(peak_energies[1:-1]): + i += 1 + if peak_e_err[i] > allowed_err: + continue + if i == 1: + if ( + e - peak_e_err[i] < peak_energies[i - 1] + peak_e_err[i - 1] + and peak_e_err[i - 1] < allowed_err + ): + overlap = ( + peak_energies[i - 1] + + peak_e_err[i - 1] + - (peak_energies[i] - peak_e_err[i]) + ) + peak_e_err[i] -= overlap * ( + peak_e_err[i] / (peak_e_err[i] + peak_e_err[i - 1]) + ) + peak_e_err[i - 1] -= overlap * ( + peak_e_err[i - 1] / (peak_e_err[i] + peak_e_err[i - 1]) + ) + + if ( + e + peak_e_err[i] > peak_energies[i + 1] - peak_e_err[i + 1] + and peak_e_err[i + 1] < allowed_err + ): + overlap = (e + peak_e_err[i]) - (peak_energies[i + 1] - peak_e_err[i + 1]) + total = peak_e_err[i] + peak_e_err[i + 1] + peak_e_err[i] -= (overlap) * (peak_e_err[i] / total) + peak_e_err[i + 1] -= (overlap) * (peak_e_err[i + 1] / total) + + out_pulsers = [] + for i, e in enumerate(peak_energies[allowed_mask]): + if peak_e_err[i] > allowed_err: + continue + + try: + e_cut = (df[energy] > e - peak_e_err[i]) & (df[energy] < e + peak_e_err[i]) + df_peak = 
df[e_cut] + + time_since_last = ( + df_peak.timestamp.values[1:] - df_peak.timestamp.values[:-1] + ) + + tsl = time_since_last[ + (time_since_last >= 0) + & (time_since_last < np.percentile(time_since_last, 99.9)) + ] + + bins = np.arange(0.1, 5, 0.001) + bcs = pgh.get_bin_centers(bins) + hist, bins, var = pgh.get_hist(tsl, bins=bins) - pulser_events = np.count_nonzero(mode_idxs) - # print("pulser events: {}".format(pulser_events)) - if pulser_events < 3: - return df + maxs = pgh.get_i_local_maxima(hist, 45) + maxs = maxs[maxs > 20] - df_pulser = df_pulser[mode_idxs] + super_max = pgh.get_i_local_maxima(hist, 500) + super_max = super_max[super_max > 20] + if len(maxs) < 2: + continue + else: + max_locs = np.array([0.0]) + max_locs = np.append(max_locs, bcs[np.array(maxs)]) + if ( + len(np.where(np.abs(np.diff(np.diff(max_locs))) <= 0.001)[0]) > 1 + or (np.abs(np.diff(np.diff(max_locs))) <= 0.001).all() + or len(super_max) > 0 + ): + pulser_e = e + period = stats.mode(tsl).mode[0] + if period > 0.1: + out_pulsers.append((pulser_e, peak_e_err[i], period, energy)) - ts = df_pulser.timestamp.values - diff_zero = np.zeros(len(ts)) - diff_zero[1:] = np.around((ts[1:] - ts[:-1]) / period) - diff_cum = np.cumsum(diff_zero) - z = np.polyfit(diff_cum, ts, 1) - p = np.poly1d(z) + else: + continue + except Exception: + continue + return out_pulsers + + +def get_tcm_pulser_ids(tcm_file, channel, multiplicity_threshold): + if isinstance(channel, str): + if channel[:2] == "ch": + chan = int(channel[2:]) + else: + chan = int(channel) + else: + chan = channel + if isinstance(tcm_file, list): + mask = np.array([], dtype=bool) + for file in tcm_file: + _, file_mask = get_tcm_pulser_ids(file, chan, multiplicity_threshold) + mask = np.append(mask, file_mask) + ids = np.where(mask)[0] + else: + data = pd.DataFrame( + { + "array_id": sto.read("hardware_tcm_1/array_id", tcm_file)[0].view_as( + "np" + ), + "array_idx": sto.read("hardware_tcm_1/array_idx", tcm_file)[0].view_as( + "np" + ), + } + ) + cumulength = sto.read("hardware_tcm_1/cumulative_length", tcm_file)[0].view_as( + "np" + ) + cumulength = np.append(np.array([0]), cumulength) + n_channels = np.diff(cumulength) + evt_numbers = np.repeat(np.arange(0, len(cumulength) - 1), np.diff(cumulength)) + evt_mult = np.repeat(np.diff(cumulength), np.diff(cumulength)) + data["evt_number"] = evt_numbers + data["evt_mult"] = evt_mult + high_mult_events = np.where(n_channels > multiplicity_threshold)[ # noqa: F841 + 0 + ] + + ids = data.query(f"array_id=={channel} and evt_number in @high_mult_events")[ + "array_idx" + ].to_numpy() + mask = np.zeros(len(data.query(f"array_id=={channel}")), dtype="bool") + mask[ids] = True + return ids, mask - # plt.figure() - # xp = np.linspace(0, diff_cum[-1]) - # plt.plot(xp,p(xp)) - # plt.scatter(diff_cum,ts) - # plt.show() - period = z[0] - phase = z[1] +def tag_pulsers(df, chan_info, window=0.01): + df["isPulser"] = 0 + + if isinstance(chan_info, tuple): + chan_info = [chan_info] + final_mask = None + for chan_i in chan_info: + pulser_energy, peak_e_err, period, energy_name = chan_i - mod = np.abs(df.timestamp - phase) % period + e_cut = (df[energy_name] < pulser_energy + peak_e_err) & ( + df[energy_name] > pulser_energy - peak_e_err + ) + df_pulser = df[e_cut] - # pulser_mod =np.abs(df_pulser.timestamp - phase) %period - # pulser_mod[ pulser_mod > 10*window] = period - pulser_mod[ pulser_mod > 10*window] - # plt.hist(pulser_mod , bins="auto") - # plt.show() - period_cut = (mod < window) | ((period - mod) < window) + 
time_since_last = np.zeros(len(df_pulser)) + time_since_last[1:] = ( + df_pulser.timestamp.values[1:] - df_pulser.timestamp.values[:-1] + ) + + mode_idxs = (time_since_last > period - window) & ( + time_since_last < period + window + ) + + pulser_events = np.count_nonzero(mode_idxs) + # print(f"pulser events: {pulser_events}") + if pulser_events < 3: + return df + df_pulser = df_pulser[mode_idxs] + + ts = df_pulser.timestamp.values + diff_zero = np.zeros(len(ts)) + diff_zero[1:] = np.around(np.divide(np.subtract(ts[1:], ts[:-1]), period)) + diff_cum = np.cumsum(diff_zero) + z = np.polyfit(diff_cum, ts, 1) + + period = z[0] + phase = z[1] + mod = np.abs(df.timestamp - phase) % period + + period_cut = (mod < 0.1) | ((period - mod) < 0.1) # 0.1) + + if final_mask is None: + final_mask = e_cut & period_cut + else: + final_mask = final_mask | (e_cut & period_cut) - # print("pulser events: {}".format(np.count_nonzero(e_cut & period_cut))) - df.loc[e_cut & period_cut, "isPulser"] = 1 + df.loc[final_mask, "isPulser"] = 1 return df diff --git a/src/pygama/pargen/dplms_ge_dict.py b/src/pygama/pargen/dplms_ge_dict.py index 6a155d239..e869a32a0 100644 --- a/src/pygama/pargen/dplms_ge_dict.py +++ b/src/pygama/pargen/dplms_ge_dict.py @@ -5,41 +5,33 @@ from __future__ import annotations import itertools -import json import logging -import os import time import matplotlib.pyplot as plt import numpy as np -from lgdo import Array, Table, lh5 +from lgdo import Table, lh5 from scipy.signal import convolve, convolve2d +from scipy.stats import chi2 -from pygama.math.histogram import get_hist -from pygama.math.peak_fitting import ( - extended_gauss_step_pdf, - extended_radford_pdf, - gauss_step_pdf, - radford_pdf, -) -from pygama.pargen.cuts import generate_cuts, get_cut_indexes +from pygama.math.distributions import gauss_on_step +from pygama.pargen.data_cleaning import generate_cuts from pygama.pargen.dsp_optimize import run_one_dsp -from pygama.pargen.energy_optimisation import fom_FWHM_with_dt_corr_fit +from pygama.pargen.energy_optimisation import fom_fwhm_with_alpha_fit log = logging.getLogger(__name__) sto = lh5.LH5Store() def dplms_ge_dict( - lh5_path: str, raw_fft: Table, raw_cal: Table, dsp_config: dict, par_dsp: dict, - par_dsp_lh5: str, dplms_dict: dict, decay_const: float = 0, ene_par: str = "dplmsEmax", + p_val_lim: float = 10**-20, display: int = 0, ) -> dict: """ @@ -47,8 +39,6 @@ def dplms_ge_dict( Parameters ---------- - lh5_path - Name of channel to process, should be name of lh5 group in raw files fft_files table with fft data raw_cal @@ -57,8 +47,6 @@ def dplms_ge_dict( dsp config file par_dsp Dictionary with db parameters for dsp processing - par_dsp_lh5 - Path for saving dplms coefficients dplms_dict Dictionary with various parameters @@ -68,11 +56,20 @@ def dplms_ge_dict( """ t0 = time.time() - log.info(f"\nSelecting baselines") + log.info("Selecting baselines") + + dsp_fft = run_one_dsp(raw_fft, dsp_config, db_dict=par_dsp) + + cut_dict = generate_cuts(dsp_fft, cut_dict=dplms_dict["bls_cut_pars"]) + log.debug(f"Cuts are {cut_dict}") + idxs = np.full(len(dsp_fft), True, dtype=bool) + for outname, info in cut_dict.items(): + outcol = dsp_fft.eval(info["expression"], info.get("parameters", None)) + dsp_fft.add_column(outname, outcol) + for cut in cut_dict: + idxs = dsp_fft[cut].nda & idxs + log.debug("Applied Cuts") - dsp_fft = run_one_dsp(raw_fft, dsp_config, db_dict=par_dsp[lh5_path]) - cut_dict = generate_cuts(dsp_fft, parameters=dplms_dict["bls_cut_pars"]) - idxs = 
get_cut_indexes(dsp_fft, cut_dict) bl_field = dplms_dict["bl_field"] log.info(f"... {len(dsp_fft[bl_field].values.nda[idxs,:])} baselines after cuts") @@ -87,25 +84,20 @@ def dplms_ge_dict( ) log.info( - "\nCalculating noise matrix of length", - dplms_dict["length"], - "n. events", - bls.shape[0], - "size", - bls.shape[1], + f'Calculating noise matrix of length {dplms_dict["length"]} n. events: {bls.shape[0]}, size: {bls.shape[1]}' ) nmat = noise_matrix(bls, dplms_dict["length"]) t2 = time.time() log.info(f"Time to calculate noise matrix {(t2-t1):.2f} s") - log.info("\nSelecting signals") + log.info("Selecting signals") wsize = dplms_dict["wsize"] wf_field = dplms_dict["wf_field"] - peaks_keV = np.array(dplms_dict["peaks_keV"]) + peaks_kev = np.array(dplms_dict["peaks_kev"]) kev_widths = [tuple(kev_width) for kev_width in dplms_dict["kev_widths"]] log.info(f"Produce dsp data for {len(raw_cal)} events") - dsp_cal = run_one_dsp(raw_cal, dsp_config, db_dict=par_dsp[lh5_path]) + dsp_cal = run_one_dsp(raw_cal, dsp_config, db_dict=par_dsp) t3 = time.time() log.info(f"Time to run dsp production {(t3-t2):.2f} s") @@ -113,11 +105,10 @@ def dplms_ge_dict( # dictionary for peak fitting peak_dict = { - "peak": peaks_keV[-1], + "peak": peaks_kev[-1], "kev_width": kev_widths[-1], "parameter": ene_par, - "func": extended_gauss_step_pdf, - "gof_func": gauss_step_pdf, + "func": gauss_on_step, } if display > 0: @@ -138,13 +129,10 @@ def dplms_ge_dict( for i, values in enumerate(prod): coeff_values = dict(zip(coeff_keys, values)) + log_msg = f"Case {i} ->" + for key, value in coeff_values.items(): + log_msg += f" {key} = {value}" - log.info( - "\nCase", - i, - "->", - ", ".join(f"{key} = {value}" for key, value in coeff_values.items()), - ) grid_dict[i] = coeff_values sel_dict = signal_selection(dsp_cal, dplms_dict, coeff_values) @@ -166,22 +154,22 @@ def dplms_ge_dict( dplms_dict["length"], wsize, ) - par_dsp[lh5_path]["dplms"] = {"length": dplms_dict["length"], "coefficients": x} + par_dsp["dplms"] = {"length": dplms_dict["length"], "coefficients": x} log.info( - f"Filter synthesis in {time.time()-t_tmp:.1f} s, filter area", np.sum(x) + f"Filter synthesis in {time.time()-t_tmp:.1f} s, filter area {np.sum(x)}" ) t_tmp = time.time() - dsp_opt = run_one_dsp(raw_cal, dsp_config, db_dict=par_dsp[lh5_path]) + dsp_opt = run_one_dsp(raw_cal, dsp_config, db_dict=par_dsp) try: - res = fom_FWHM_with_dt_corr_fit( + res = fom_fwhm_with_alpha_fit( dsp_opt, peak_dict, - "QDrift", + "dt_eff", idxs=np.where(~np.isnan(dsp_opt["dt_eff"].nda))[0], ) - except: + except Exception: log.debug("FWHM not calculated") continue @@ -198,11 +186,12 @@ def dplms_ge_dict( grid_dict[i]["fwhm"] = fwhm grid_dict[i]["fwhm_err"] = fwhm_err grid_dict[i]["alpha"] = alpha - + p_val = chi2.sf(chisquare[0], chisquare[1]) if ( fwhm < dplms_dict["fwhm_limit"] and fwhm_err < dplms_dict["err_limit"] - and chisquare < dplms_dict["chi_limit"] + and p_val > p_val_lim + and ~np.isnan(fwhm) ): if fwhm < min_fom: min_idx, min_fom = i, fwhm @@ -242,6 +231,8 @@ def dplms_ge_dict( ft_coeff = dplms_dict["dp_def"]["ft"] rt_coeff = dplms_dict["dp_def"]["rt"] pt_coeff = dplms_dict["dp_def"]["pt"] + best_case_values = {} + alpha = 0 # filter synthesis sel_dict = signal_selection(dsp_cal, dplms_dict, best_case_values) @@ -260,18 +251,10 @@ def dplms_ge_dict( wsize, ) - sto.write( - Array(x), - name="dplms", - lh5_file=par_dsp_lh5, - wo_mode="overwrite", - group=lh5_path, - ) - out_dict = { "dplms": { "length": dplms_dict["length"], - "coefficients": 
f"loadlh5('{par_dsp_lh5}', '{lh5_path}/dplms')", + "coefficients": x, "dp_coeffs": { "nm": nm_coeff, "za": za_coeff, @@ -292,8 +275,7 @@ def dplms_ge_dict( log.info(f"Time to complete DPLMS filter synthesis {time.time()-t0:.1f}") if display > 0: - plot_dict["dplms"]["ref"] = ref - plot_dict["dplms"]["coefficients"] = x + plot_dict = {"ref": ref, "coefficients": x} bl_idxs = np.random.choice(len(bls), dplms_dict["n_plot"]) bls = bls[bl_idxs] @@ -303,12 +285,20 @@ def dplms_ge_dict( ax.plot(wf, label=f"mean = {wf.mean():.1f}") else: ax.plot(wf) - ax.legend(title=f"{lh5_path}", loc="upper right") - plot_dict["dplms"]["bls"] = fig + ax.legend(loc="upper right") + plot_dict["bls"] = fig fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(16, 9), facecolor="white") for ii, par in enumerate(bls_cut_pars): - mean = cut_dict[par]["Mean Value"] - llo, lup = cut_dict[par]["Lower Boundary"], cut_dict[par]["Upper Boundary"] + if "parameters" in cut_dict[par]: + if "a" in cut_dict[par]["parameters"]: + llo = cut_dict[par]["parameters"]["a"] + else: + llo = np.nan + if "b" in cut_dict[par]["parameters"]: + lup = cut_dict[par]["parameters"]["b"] + else: + lup = np.nan + mean = (lup + llo) / 2 plo, pup = mean - 2 * (mean - llo), mean + 2 * (lup - mean) hh, bb = np.histogram(bls_par[par], bins=np.linspace(plo, pup, 200)) ax.flat[ii].plot(bb[1:], hh, ds="steps", label=f"cut on {par}") @@ -316,39 +306,37 @@ def dplms_ge_dict( ax.flat[ii].axvline(llo, color="k", linestyle=":") ax.flat[ii].set_xlabel(par) ax.flat[ii].set_yscale("log") - ax.flat[ii].legend(title=f"{lh5_path}", loc="upper right") - plot_dict["dplms"]["bl_sel"] = fig + ax.flat[ii].legend(loc="upper right") + plot_dict["bl_sel"] = fig wf_idxs = np.random.choice(len(wfs), dplms_dict["n_plot"]) wfs = wfs[wf_idxs] - peak_pos = dsp_cal["peak_pos"].nda - peak_pos_neg = dsp_cal["peak_pos_neg"].nda centroid = dsp_cal["centroid"].nda - risetime = dsp_cal["tp_90"].nda - dsp_cal["tp_10"].nda - rt_low = dplms_dict["rt_low"] - rt_high = dplms_dict["rt_high"] - peak_lim = dplms_dict["peak_lim"] - cal_par = {} - wfs_cut_pars = [par for par in dplms_dict["wfs_cut_pars"].keys()] - for par in wfs_cut_pars: - cal_par[par] = dsp_cal[par].nda + fig, ax = plt.subplots(figsize=(12, 6.75), facecolor="white") for ii, wf in enumerate(wfs): if ii < 10: ax.plot(wf, label=f"centr = {centroid[ii]}") else: ax.plot(wf) - ax.legend(title=f"{lh5_path}", loc="upper right") + ax.legend(loc="upper right") axin = ax.inset_axes([0.1, 0.15, 0.35, 0.5]) for wf in wfs: axin.plot(wf) axin.set_xlim(wsize / 2 - dplms_dict["zoom"], wsize / 2 + dplms_dict["zoom"]) axin.set_yticklabels("") - plot_dict["dplms"]["wfs"] = fig + plot_dict["wfs"] = fig + + peak_pos = dsp_cal["peak_pos"].nda + risetime = dsp_cal["tp_90"].nda - dsp_cal["tp_10"].nda + rt_low = dplms_dict["rt_low"] + rt_high = dplms_dict["rt_high"] + peak_lim = dplms_dict["peak_lim"] + cal_par = {} + wfs_cut_pars = ["centroid", "peak_pos", "risetime"] + fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(16, 9), facecolor="white") - wfs_cut_pars.append("centroid") - wfs_cut_pars.append("peak_pos") - wfs_cut_pars.append("risetime") + for ii, par in enumerate(wfs_cut_pars): pspace = np.linspace( wsize / 2 - peak_lim, wsize / 2 + peak_lim, 2 * peak_lim @@ -375,7 +363,7 @@ def dplms_ge_dict( ax.flat[ii + 1].axvline(lup, color="k", linestyle=":") ax.flat[ii + 1].set_xlabel(par) ax.flat[ii + 1].set_yscale("log") - ax.flat[ii + 1].legend(title=f"{lh5_path}", loc="upper right") + ax.flat[ii + 1].legend(loc="upper right") roughenergy = 
dsp_cal["trapTmax"].nda roughenergy_sel = roughenergy[idxs] ell, ehh = roughenergy.min(), roughenergy.max() @@ -385,13 +373,13 @@ def dplms_ge_dict( ax.flat[0].plot(be[1:], hs, c="r", ds="steps", label="selected") ax.flat[0].set_xlabel("rough energy (ADC)") ax.flat[0].set_yscale("log") - ax.flat[0].legend(loc="upper right", title=f"{lh5_path}") - plot_dict["dplms"]["wf_sel"] = fig + ax.flat[0].legend(loc="upper right") + plot_dict["wf_sel"] = fig fig, ax = plt.subplots(figsize=(12, 6.75), facecolor="white") - ax.plot(x, "r-", label=f"filter") + ax.plot(x, "r-", label="filter") ax.axhline(0, color="black", linestyle=":") - ax.legend(loc="upper right", title=f"{lh5_path}") + ax.legend(loc="upper right") axin = ax.inset_axes([0.6, 0.1, 0.35, 0.33]) axin.plot(x, "r-") axin.set_xlim( @@ -456,7 +444,6 @@ def signal_selection(dsp_cal, dplms_dict, coeff_values): risetime = dsp_cal["tp_90"].nda - dsp_cal["tp_10"].nda rt_low = dplms_dict["rt_low"] - rt_high = dplms_dict["rt_high"] peak_lim = dplms_dict["peak_lim"] wsize = dplms_dict["wsize"] bsize = dplms_dict["bsize"] diff --git a/src/pygama/pargen/dsp_optimize.py b/src/pygama/pargen/dsp_optimize.py index e006a8379..36d91dd60 100644 --- a/src/pygama/pargen/dsp_optimize.py +++ b/src/pygama/pargen/dsp_optimize.py @@ -1,11 +1,19 @@ import logging -import multiprocessing as mp +import sys from collections import namedtuple -from multiprocessing import get_context -from pprint import pprint +import matplotlib.pyplot as plt import numpy as np +import pandas as pd +import pint from dspeed import build_processing_chain +from dspeed.units import unit_registry as ureg +from matplotlib.colors import LogNorm +from scipy.optimize import minimize +from scipy.stats import norm +from sklearn.exceptions import ConvergenceWarning +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.utils._testing import ignore_warnings log = logging.getLogger(__name__) @@ -114,11 +122,11 @@ def iterate_indices(self, indices): the order appearin in dims (first dimension is first for loop, etc): Return False when the grid runs out of indices. Otherwise returns True. """ - for iD in reversed(range(self.get_n_dimensions())): - indices[iD] += 1 - if indices[iD] < self.get_n_points_of_dim(iD): + for dim in reversed(range(self.get_n_dimensions())): + indices[dim] += 1 + if indices[dim] < self.get_n_points_of_dim(dim): return True - indices[iD] = 0 + indices[dim] = 0 return False # def check_indices(self, indices): @@ -198,7 +206,7 @@ def run_grid( while True: db_dict = grid.set_dsp_pars(db_dict, iii) if verbosity > 1: - pprint(dsp_config) + log.debug(dsp_config) log.debug(grid.print_data(iii)) grid_values[tuple(iii)] = run_one_dsp( tb_data, @@ -277,102 +285,525 @@ def get_grid_points(grid): return out -def run_grid_multiprocess_parallel( +OptimiserDimension = namedtuple( + "OptimiserDimension", "name parameter min_val max_val round unit" +) + + +class BayesianOptimizer: + """ + Bayesian optimiser uses Gaussian Process Regressor from sklearn to fit kernel + to data, takes in a series of init samples for this fit and then calculates + the next point using the acquisition function specified. 
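+
+    A minimal usage sketch; ``tb_data``, ``dsp_config``, ``fom_function``, the
+    initial sample arrays and the ``etrap``/``rise`` dimension are illustrative
+    placeholders rather than part of the API::
+
+        bopt = BayesianOptimizer(acq_func="ei", batch_size=20, sampling_rate="16*ns")
+        bopt.add_dimension("etrap", "rise", 1, 12, round_to_samples=True, unit="us")
+        bopt.add_initial_values(x_init, y_init, yerr_init)
+        best_pars, best_results = run_bayesian_optimisation(
+            tb_data, dsp_config, fom_function, [bopt], db_dict={}, n_iter=10
+        )
+
+    The figure of merit returned by ``fom_function`` should be a dict providing
+    the keys named by ``fom_value`` and ``fom_error`` ("y_val" and "y_val_err"
+    by default).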
+ """ + + np.random.seed(55) + lambda_param = 0.01 + eta_param = 0 + + def __init__( + self, + acq_func, + batch_size, + kernel=None, + sampling_rate=None, + fom_value="y_val", + fom_error="y_val_err", + ): + self.dims = [] + self.current_iter = 0 + + self.batch_size = batch_size + self.iters = 0 + + if isinstance(sampling_rate, str): + self.sampling_rate = ureg.Quantity(sampling_rate) + elif isinstance(sampling_rate, pint.Quantity): + self.sampling_rate = sampling_rate + else: + if sampling_rate is not None: + raise TypeError("Unknown type for sampling rate") + + self.gauss_pr = GaussianProcessRegressor(kernel=kernel) + self.best_samples_ = pd.DataFrame(columns=["x", "y", "ei"]) + self.distances_ = [] + + if acq_func == "ei": + self.acq_function = self._get_expected_improvement + elif acq_func == "ucb": + self.acq_function = self._get_ucb + elif acq_func == "lcb": + self.acq_function = self._get_lcb + + self.fom_value = fom_value + self.fom_error = fom_error + + def add_dimension( + self, name, parameter, min_val, max_val, round_to_samples=False, unit=None + ): + if round_to_samples is True and self.sampling_rate is None: + raise ValueError("Must provide sampling rate to round to samples") + if unit is not None: + unit = ureg.Quantity(unit) + self.dims.append( + OptimiserDimension( + name, parameter, min_val, max_val, round_to_samples, unit + ) + ) + + def get_n_dimensions(self): + return len(self.dims) + + def add_initial_values(self, x_init, y_init, yerr_init): + self.x_init = x_init + self.y_init = y_init + self.yerr_init = yerr_init + + def _get_expected_improvement(self, x_new): + mean_y_new, sigma_y_new = self.gauss_pr.predict( + np.array([x_new]), return_std=True + ) + + mean_y = self.gauss_pr.predict(self.x_init) + min_mean_y = np.min(mean_y) + z = (mean_y_new[0] - min_mean_y - 1) / (sigma_y_new[0] + 1e-9) + exp_imp = (mean_y_new[0] - min_mean_y - 1) * norm.cdf(z) + sigma_y_new[ + 0 + ] * norm.pdf(z) + return exp_imp + + def _get_ucb(self, x_new): + mean_y_new, sigma_y_new = self.gauss_pr.predict( + np.array([x_new]), return_std=True + ) + return mean_y_new[0] + self.lambda_param * sigma_y_new[0] + + def _get_lcb(self, x_new): + mean_y_new, sigma_y_new = self.gauss_pr.predict( + np.array([x_new]), return_std=True + ) + return mean_y_new[0] - self.lambda_param * sigma_y_new[0] + + def _get_next_probable_point(self): + min_ei = float(sys.maxsize) + x_optimal = None + # Trial with an array of random data points + rands = np.random.uniform( + np.array([dim.min_val for dim in self.dims]), + np.array([dim.max_val for dim in self.dims]), + (self.batch_size, self.get_n_dimensions()), + ) + for x_start in rands: + response = minimize( + fun=self.acq_function, + x0=x_start, + bounds=[(dim.min_val, dim.max_val) for dim in self.dims], + method="L-BFGS-B", + ) + if response.fun < min_ei: + min_ei = response.fun + x_optimal = [] + for y, dim in zip(response.x, self.dims): + if dim.round is True and dim.unit is not None: + # round so samples is integer + + x_optimal.append( + float( + round( + (y * (dim.unit / self.sampling_rate)).to( + "dimensionless" + ), + 0, + ) + * (self.sampling_rate / dim.unit) + ) + ) + else: + x_optimal.append(y) + if x_optimal in self.x_init: + perturb = np.random.uniform( + -np.array([(dim.max_val - dim.min_val) / 10 for dim in self.dims]), + np.array([(dim.max_val - dim.min_val) / 10 for dim in self.dims]), + (1, len(self.dims)), + ) + x_optimal += perturb + new_x_optimal = [] + for y, dim in zip(x_optimal[0], self.dims): + if dim.round is True and dim.unit is not 
None: + # round so samples is integer + new_x_optimal.append( + float( + round( + (y * (dim.unit / self.sampling_rate)).to( + "dimensionless" + ), + 0, + ) + * (self.sampling_rate / dim.unit) + ) + ) + else: + new_x_optimal.append(y) + x_optimal = new_x_optimal + for i, y in enumerate(x_optimal): + if y > self.dims[i].max_val: + x_optimal[i] = self.dims[i].max_val + elif y < self.dims[i].min_val: + x_optimal[i] = self.dims[i].min_val + return x_optimal, min_ei + + def _extend_prior_with_posterior_data(self, x, y, yerr): + self.x_init = np.append(self.x_init, np.array([x]), axis=0) + self.y_init = np.append(self.y_init, np.array(y), axis=0) + self.yerr_init = np.append(self.yerr_init, np.array(yerr), axis=0) + + def get_first_point(self): + y_min_ind = np.nanargmin(self.y_init) + self.y_min = self.y_init[y_min_ind] + self.optimal_x = self.x_init[y_min_ind] + self.optimal_ei = None + return self.optimal_x, self.optimal_ei + + @ignore_warnings(category=ConvergenceWarning) + def iterate_values(self): + nan_idxs = np.isnan(self.y_init) + self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) + x_next, ei = self._get_next_probable_point() + return x_next, ei + + def update_db_dict(self, db_dict): + if self.current_iter == 0: + x_new, ei = self.get_first_point() + x_new, ei = self.iterate_values() + self.current_x = x_new + self.current_ei = ei + for i, val in enumerate(x_new): + name, parameter, min_val, max_val, rounding, unit = self.dims[i] + if unit is not None: + value_str = f"{val}*{unit.units:~}" + if "µ" in value_str: + value_str = value_str.replace("µ", "u") + else: + value_str = f"{val}" + if name not in db_dict.keys(): + db_dict[name] = {parameter: value_str} + else: + db_dict[name][parameter] = value_str + self.current_iter += 1 + return db_dict + + def update(self, results): + y_val = results[self.fom_value] + y_err = results[self.fom_error] + self._extend_prior_with_posterior_data( + self.current_x, np.array([y_val]), np.array([y_err]) + ) + + if np.isnan(y_val) | np.isnan(y_err): + pass + else: + if y_val < self.y_min: + self.y_min = y_val + self.optimal_x = self.current_x + self.optimal_ei = self.current_ei + self.optimal_results = results + + if self.current_iter == 1: + self.prev_x = self.current_x + else: + self.distances_.append( + np.linalg.norm(np.array(self.prev_x) - np.array(self.current_x)) + ) + self.prev_x = self.current_x + + self.best_samples_ = pd.concat( + [ + self.best_samples_, + pd.DataFrame( + {"x": self.optimal_x, "y": self.y_min, "ei": self.optimal_ei} + ), + ], + ignore_index=True, + ) + + def get_best_vals(self): + out_dict = {} + for i, val in enumerate(self.optimal_x): + name, parameter, min_val, max_val, rounding, unit = self.dims[i] + if unit is not None: + value_str = f"{val}*{unit.units:~}" + if "µ" in value_str: + value_str = value_str.replace("µ", "u") + else: + value_str = f"{val}" + if name not in out_dict.keys(): + out_dict[name] = {parameter: value_str} + else: + out_dict[name][parameter] = value_str + return out_dict + + @ignore_warnings(category=ConvergenceWarning) + def plot(self, init_samples=None): + nan_idxs = np.isnan(self.y_init) + fail_idxs = np.isnan(self.yerr_init) + self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) + if (len(self.dims) != 2) and (len(self.dims) != 1): + raise Exception("Acquisition Function Plotting not implemented for dim!=2") + elif len(self.dims) == 1: + points = np.arange(self.dims[0].min_val, self.dims[0].max_val, 0.1) + ys = np.zeros_like(points) + ys_err = 
np.zeros_like(points) + for i, point in enumerate(points): + ys[i], ys_err[i] = self.gauss_pr.predict( + np.array([point]).reshape(1, -1), return_std=True + ) + fig = plt.figure() + + plt.scatter(np.array(self.x_init), np.array(self.y_init), label="Samples") + plt.scatter( + np.array(self.x_init)[fail_idxs], + np.array(self.y_init)[fail_idxs], + color="green", + label="Failed samples", + ) + plt.fill_between(points, ys - ys_err, ys + ys_err, alpha=0.1) + if init_samples is not None: + init_ys = np.array( + [ + np.where(init_sample == self.x_init)[0][0] + for init_sample in init_samples + ] + ) + plt.scatter( + np.array(init_samples)[:, 0], + np.array(self.y_init)[init_ys], + color="red", + label="Init Samples", + ) + plt.scatter(self.optimal_x[0], self.y_min, color="orange", label="Optimal") + + plt.xlabel( + f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" + ) + plt.ylabel("Kernel Value") + plt.legend() + elif len(self.dims) == 2: + x, y = np.mgrid[ + self.dims[0].min_val : self.dims[0].max_val : 0.1, + self.dims[1].min_val : self.dims[1].max_val : 0.1, + ] + points = np.vstack((x.flatten(), y.flatten())).T + out_grid = np.zeros( + ( + int((self.dims[0].max_val - self.dims[0].min_val) * 10), + int((self.dims[1].max_val - self.dims[1].min_val) * 10), + ) + ) + + j = 0 + for i, _ in np.ndenumerate(out_grid): + out_grid[i] = self.gauss_pr.predict( + points[j].reshape(1, -1), return_std=False + ) + j += 1 + + fig = plt.figure() + plt.imshow( + out_grid, + norm=LogNorm(), + origin="lower", + aspect="auto", + extent=(0, out_grid.shape[1], 0, out_grid.shape[0]), + ) + plt.scatter( + np.array(self.x_init - self.dims[1].min_val)[:, 1] * 10, + np.array(self.x_init - self.dims[0].min_val)[:, 0] * 10, + ) + if init_samples is not None: + plt.scatter( + (init_samples[:, 1] - self.dims[1].min_val) * 10, + (init_samples[:, 0] - self.dims[0].min_val) * 10, + color="red", + ) + plt.scatter( + (self.optimal_x[1] - self.dims[1].min_val) * 10, + (self.optimal_x[0] - self.dims[0].min_val) * 10, + color="orange", + ) + ticks, labels = plt.xticks() + labels = np.linspace(self.dims[1].min_val, self.dims[1].max_val, 5) + ticks = np.linspace(0, out_grid.shape[1], 5) + plt.xticks(ticks=ticks, labels=labels, rotation=45) + ticks, labels = plt.yticks() + labels = np.linspace(self.dims[0].min_val, self.dims[0].max_val, 5) + ticks = np.linspace(0, out_grid.shape[0], 5) + plt.yticks(ticks=ticks, labels=labels, rotation=45) + plt.xlabel( + f"{self.dims[1].name}-{self.dims[1].parameter}({self.dims[1].unit})" + ) + plt.ylabel( + f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" + ) + plt.title(f"{self.dims[0].name} Kernel Prediction") + plt.tight_layout() + plt.close() + return fig + + @ignore_warnings(category=ConvergenceWarning) + def plot_acq(self, init_samples=None): + nan_idxs = np.isnan(self.y_init) + self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) + if (len(self.dims) != 2) and (len(self.dims) != 1): + raise Exception("Acquisition Function Plotting not implemented for dim!=2") + elif len(self.dims) == 1: + points = np.arange(self.dims[0].min_val, self.dims[0].max_val, 0.1) + ys = np.zeros_like(points) + for i, point in enumerate(points): + ys[i] = self.acq_function(np.array([point]).reshape(1, -1)[0]) + fig = plt.figure() + plt.plot(points, ys) + plt.scatter(np.array(self.x_init), np.array(self.y_init), label="Samples") + if init_samples is not None: + init_ys = np.array( + [ + np.where(init_sample == self.x_init)[0][0] + for init_sample in 
init_samples + ] + ) + plt.scatter( + np.array(init_samples)[:, 0], + np.array(self.y_init)[init_ys], + color="red", + label="Init Samples", + ) + plt.scatter(self.optimal_x[0], self.y_min, color="orange", label="Optimal") + + plt.xlabel( + f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" + ) + plt.ylabel("Acquisition Function Value") + plt.legend() + + elif len(self.dims) == 2: + x, y = np.mgrid[ + self.dims[0].min_val : self.dims[0].max_val : 0.1, + self.dims[1].min_val : self.dims[1].max_val : 0.1, + ] + points = np.vstack((x.flatten(), y.flatten())).T + out_grid = np.zeros( + ( + int((self.dims[0].max_val - self.dims[0].min_val) * 10), + int((self.dims[1].max_val - self.dims[1].min_val) * 10), + ) + ) + + j = 0 + for i, _ in np.ndenumerate(out_grid): + out_grid[i] = self.acq_function(points[j]) + j += 1 + + fig = plt.figure() + plt.imshow( + out_grid, + norm=LogNorm(), + origin="lower", + aspect="auto", + extent=(0, out_grid.shape[1], 0, out_grid.shape[0]), + ) + plt.scatter( + np.array(self.x_init - self.dims[1].min_val)[:, 1] * 10, + np.array(self.x_init - self.dims[0].min_val)[:, 0] * 10, + ) + if init_samples is not None: + plt.scatter( + (init_samples[:, 1] - self.dims[1].min_val) * 10, + (init_samples[:, 0] - self.dims[0].min_val) * 10, + color="red", + ) + plt.scatter( + (self.optimal_x[1] - self.dims[1].min_val) * 10, + (self.optimal_x[0] - self.dims[0].min_val) * 10, + color="orange", + ) + ticks, labels = plt.xticks() + labels = np.linspace(self.dims[1].min_val, self.dims[1].max_val, 5) + ticks = np.linspace(0, out_grid.shape[1], 5) + plt.xticks(ticks=ticks, labels=labels, rotation=45) + ticks, labels = plt.yticks() + labels = np.linspace(self.dims[0].min_val, self.dims[0].max_val, 5) + ticks = np.linspace(0, out_grid.shape[0], 5) + plt.yticks(ticks=ticks, labels=labels, rotation=45) + plt.xlabel( + f"{self.dims[1].name}-{self.dims[1].parameter}({self.dims[1].unit})" + ) + plt.ylabel( + f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" + ) + plt.title(f"{self.dims[0].name} Acquisition Space") + plt.tight_layout() + plt.close() + return fig + + +def run_bayesian_optimisation( tb_data, dsp_config, - grid, fom_function, - db_dict=None, - verbosity=1, - processes=5, + optimisers, fom_kwargs=None, + db_dict=None, + nan_val=10, + n_iter=10, ): - """ - run one iteration of DSP on tb_data with multiprocessing, can handle - multiple grids if they are the same dimensions + if not isinstance(optimisers, list): + optimisers = [optimisers] + if not isinstance(fom_kwargs, list): + fom_kwargs = [fom_kwargs] + if not isinstance(fom_function, list): + fom_function = [fom_function] - Optionally returns a value for optimization + for j in range(n_iter): + for optimiser in optimisers: + db_dict = optimiser.update_db_dict(db_dict) - Parameters - ---------- - tb_data : lh5 Table - An input table of lh5 data. Typically a selection is made prior to - sending tb_data to this function: optimization typically doesn't have to - run over all data - dsp_config : dict - Specifies the DSP to be performed for this iteration (see - build_processing_chain()) and the list of output variables to appear in - the output table - grid : pargrid, list of pargrids - Grids to run optimization on - db_dict : dict (optional) - DSP parameters database. 
See build_processing_chain for formatting info - fom_function : function or None (optional) - When given the output lh5 table of this DSP iteration, the - fom_function must return a scalar figure-of-merit value upon which the - optimization will be based. Should accept verbosity as a second argument. - If multiple grids provided can either pass one fom to have it run for each grid - or a list of fom to run different fom on each grid. - verbosity : int (optional) - verbosity for the processing chain and fom_function calls - processes : int - DOCME - fom_kwargs - any keyword arguments to pass to the fom, - if multiple grids given will need to be a list of the fom_kwargs for each grid + log.info(f"Iteration number: {j+1}") + log.info(f"Processing with {db_dict}") - Returns - ------- - figure_of_merit : float - If fom_function is not None, returns figure-of-merit value for the DSP iteration - tb_out : lh5 Table - If fom_function is None, returns the output lh5 table for the DSP iteration - """ + tb_out = run_one_dsp(tb_data, dsp_config, db_dict=db_dict) - if not isinstance(grid, list): - grid = [grid] - if not isinstance(fom_function, list) and fom_function is not None: - fom_function = [fom_function] - if not isinstance(fom_kwargs, list): - fom_kwargs = [fom_kwargs for gri in grid] - grid_values = [] - shapes = [gri.get_shape() for gri in grid] - if fom_function is not None: - for i in range(len(grid)): - grid_values.append(np.ndarray(shape=shapes[i], dtype="O")) - else: - grid_lengths = np.array([gri.get_n_grid_points() for gri in grid]) - grid_values.append(np.ndarray(shape=shapes[np.argmax(grid_lengths)], dtype="O")) - grid_list = get_grid_points(grid) - pool = mp.Pool(processes=processes) - results = [ - pool.apply_async( - run_grid_point, - args=( - tb_data, - dsp_config, - grid, - fom_function, - np.asarray(gl), - db_dict, - verbosity, - fom_kwargs, - ), - ) - for gl in grid_list - ] - - for result in results: - res = result.get() - indexes = res["indexes"] - if fom_function is not None: - for i in range(len(grid)): - index = indexes[i] - if grid_values[i][index] is None: - grid_values[i][index] = res["results"][i] - else: - grid_values[0][indexes[0]] = {f"{indexes[0]}": res["results"]} + res = np.ndarray(shape=len(optimisers), dtype="O") - pool.close() - pool.join() - return grid_values + for i in range(len(optimisers)): + if fom_kwargs[i] is not None: + if len(fom_function) > 1: + res[i] = fom_function[i](tb_out, fom_kwargs[i]) + else: + res[i] = fom_function[0](tb_out, fom_kwargs[i]) + else: + if len(fom_function) > 1: + res[i] = fom_function[i](tb_out) + else: + res[i] = fom_function[0](tb_out) + + log.info(f"Results of iteration {j+1} are {res}") + + for i, optimiser in enumerate(optimisers): + if np.isnan(res[i][optimiser.fom_value]): + if isinstance(nan_val, list): + res[i][optimiser.fom_value] = nan_val[i] + else: + res[i][optimiser.fom_value] = nan_val + + optimiser.update(res[i]) + + out_param_dict = {} + out_results_list = [] + for optimiser in optimisers: + param_dict = optimiser.get_best_vals() + out_param_dict.update(param_dict) + results_dict = optimiser.optimal_results + if np.isnan(results_dict[optimiser.fom_value]): + log.error(f"Energy optimisation failed for {optimiser.dims[0][0]}") + out_results_list.append(results_dict) + + return out_param_dict, out_results_list diff --git a/src/pygama/pargen/ecal_th.py b/src/pygama/pargen/ecal_th.py deleted file mode 100644 index e3526c63a..000000000 --- a/src/pygama/pargen/ecal_th.py +++ /dev/null @@ -1,1507 +0,0 @@ -""" -This 
module provides a routine for running the energy calibration on Th data -""" - -from __future__ import annotations - -import json -import logging -import math -import os -import pathlib -from datetime import datetime - -import matplotlib as mpl -from scipy.stats import binned_statistic - -mpl.use("agg") -import lgdo.lh5 as lh5 -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scipy.stats -from iminuit import Minuit, cost -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib.colors import LogNorm -from scipy.optimize import curve_fit - -import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -import pygama.pargen.cuts as cts -import pygama.pargen.energy_cal as cal -from pygama.pargen.utils import load_data, return_nans - -log = logging.getLogger(__name__) - - -def fwhm_slope(x: np.array, m0: float, m1: float, m2: float = None) -> np.array: - """ - Fit the energy resolution curve - """ - if m2 is None: - return np.sqrt(m0 + m1 * x) - else: - return np.sqrt(m0 + m1 * x + m2 * x**2) - - -def apply_cuts( - data: pd.DataFrame, - hit_dict, - cut_parameters=None, - final_cut_field: str = "is_valid_cal", - pulser_field="is_pulser", -): - if cut_parameters is not None: - cut_dict = cts.generate_cuts(data.query(f"(~{pulser_field})"), cut_parameters) - hit_dict.update( - cts.cut_dict_to_hit_dict(cut_dict, final_cut_field=final_cut_field) - ) - mask = cts.get_cut_indexes(data, cut_dict) - - data[final_cut_field] = mask - - else: - data[final_cut_field] = np.ones(len(data), dtype=bool) - - events_pqc = len(data.query(f"{final_cut_field}&(~{pulser_field})")) - log.debug(f"{events_pqc} events valid for calibration") - - return data, hit_dict - - -def gen_pars_dict(pars, deg, energy_param): - if deg == 1: - out_dict = { - "expression": f"a*{energy_param}+b", - "parameters": {"a": pars[0], "b": pars[1]}, - } - elif deg == 0: - out_dict = { - "expression": f"a*{energy_param}", - "parameters": {"a": pars[0]}, - } - elif deg == 2: - out_dict = { - "expression": f"a*{energy_param}**2 +b*{energy_param}+c", - "parameters": {"a": pars[0], "b": pars[1], "c": pars[2]}, - } - else: - out_dict = {} - log.error(f"hit_dict not implemented for deg = {deg}") - - return out_dict - - -class fwhm_linear: - def func(x, a, b): - return np.sqrt(a + b * x) - - def string_func(input_param): - return f"(a+b*{input_param})**(0.5)" - - def guess(xs, ys, y_errs): - return [np.nanmin(ys), 10**-3] - - def bounds(): - return [(0, None), (0, None)] - - -class fwhm_quadratic: - def func(x, a, b, c): - return np.sqrt(a + b * x + c * x**2) - - def string_func(input_param): - return f"(a+b*{input_param}+c*{input_param}**2)**(0.5)" - - def guess(xs, ys, y_errs): - return [np.nanmin(ys), 10**-3, 10**-5] - - def bounds(): - return [(0, None), (0, None), (0, None)] - - -class calibrate_parameter: - glines = [ - # 238.632, - 583.191, - 727.330, - 860.564, - 1592.53, - 1620.50, - 2103.53, - 2614.50, - ] # gamma lines used for calibration - range_keV = [ - # (8, 8), - (20, 20), - (30, 30), - (30, 30), - (40, 20), - (20, 40), - (40, 40), - (60, 60), - ] # side bands width - funcs = [ - # pgf.extended_gauss_step_pdf, - pgf.extended_radford_pdf, - pgf.extended_radford_pdf, - pgf.extended_radford_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_radford_pdf, - ] - gof_funcs = [ - # pgf.gauss_step_pdf, - pgf.radford_pdf, - pgf.radford_pdf, - pgf.radford_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, 
- pgf.radford_pdf, - ] - - def __init__( - self, - energy_param, - selection_string="", - plot_options: dict = None, - guess_keV: float | None = None, - threshold: int = 0, - p_val: float = 0, - n_events: int = None, - simplex: bool = True, - deg: int = 1, - cal_energy_param: str = None, - tail_weight=100, - ): - self.energy_param = energy_param - if cal_energy_param is None: - self.cal_energy_param = f"{self.energy_param}_cal" - else: - self.cal_energy_param = cal_energy_param - self.selection_string = selection_string - self.guess_keV = guess_keV - self.threshold = threshold - self.p_val = p_val - self.n_events = n_events - self.deg = deg - self.plot_options = plot_options - self.simplex = simplex - self.tail_weight = tail_weight - - def fit_energy_res(self): - fitted_peaks = self.results["fitted_keV"] - fwhms = self.results["pk_fwhms"][:, 0] - dfwhms = self.results["pk_fwhms"][:, 1] - - ##### - # Remove the Tl SEP and DEP from calibration if found - fwhm_peaks = np.array([], dtype=np.float32) - all_peaks = np.array([], dtype=np.float32) - indexes = [] - for i, peak in enumerate(fitted_peaks): - all_peaks = np.append(all_peaks, peak) - if peak == 2103.53: - log.info(f"Tl SEP found at index {i}") - indexes.append(i) - continue - elif peak == 1592.53: - log.info(f"Tl DEP found at index {i}") - indexes.append(i) - continue - elif peak == 511.0: - log.info(f"e annihilation found at index {i}") - indexes.append(i) - continue - elif np.isnan(dfwhms[i]): - log.info(f"{peak} failed") - indexes.append(i) - continue - else: - fwhm_peaks = np.append(fwhm_peaks, peak) - fit_fwhms = np.delete(fwhms, [indexes]) - fit_dfwhms = np.delete(dfwhms, [indexes]) - ##### - for i, peak in enumerate(fwhm_peaks): - log.info( - f"FWHM of {peak} keV peak is: {fit_fwhms[i]:1.2f} +- {fit_dfwhms[i]:1.2f} keV" - ) - try: - if 2614.50 not in fwhm_peaks: - raise RuntimeError - - c_lin = cost.LeastSquares( - fwhm_peaks, fit_fwhms, fit_dfwhms, fwhm_linear.func - ) - c_lin.loss = "soft_l1" - m_lin = Minuit(c_lin, *fwhm_linear.guess(fwhm_peaks, fit_fwhms, fit_dfwhms)) - m_lin.limits = fwhm_linear.bounds() - m_lin.simplex() - m_lin.migrad() - m_lin.hesse() - - rng = np.random.default_rng(1) - pars_b = rng.multivariate_normal(m_lin.values, m_lin.covariance, size=1000) - fits = np.array([fwhm_linear.func(fwhm_peaks, *par_b) for par_b in pars_b]) - qbb_vals = np.array([fwhm_linear.func(2039.0, *par_b) for par_b in pars_b]) - qbb_err = np.nanstd(qbb_vals) - predicted_fwhms = fwhm_linear.func(fwhm_peaks, *m_lin.values) - fit_qbb = fwhm_linear.func(2039.0, *m_lin.values) - - p_val = scipy.stats.chi2.sf(m_lin.fval, len(fwhm_peaks) - len(m_lin.values)) - - self.fwhm_fit_linear = { - "function": fwhm_linear.__name__, - "module": fwhm_linear.__module__, - "expression": fwhm_linear.string_func("x"), - "Qbb_fwhm_in_keV": fit_qbb, - "Qbb_fwhm_err_in_keV": qbb_err, - "parameters": m_lin.values, - "uncertainties": m_lin.errors, - "cov": m_lin.covariance, - "csqr": (m_lin.fval, len(fwhm_peaks) - len(m_lin.values)), - "p_val": p_val, - } - - log.info(f'FWHM linear fit: {self.fwhm_fit_linear["parameters"].to_dict()}') - log.info(f"FWHM fit values:") - log.info(f"\t Energy | FWHM (keV) | Predicted (keV)") - for i, (peak, fwhm, fwhme) in enumerate( - zip(fwhm_peaks, fit_fwhms, fit_dfwhms) - ): - log.info( - f"\t{i}".ljust(4) - + str(peak).ljust(9) - + f"| {fwhm:.2f}+-{fwhme:.2f} ".ljust(5) - + f"| {fwhm_linear.func(peak, *self.fwhm_fit_linear['parameters']):.2f}".ljust( - 5 - ) - ) - - log.info( - f"FWHM energy resolution at Qbb (linear fit): 
{fit_qbb:1.2f} +- {qbb_err:1.2f} keV" - ) - except RuntimeError: - log.error(f"FWHM linear fit failed for {self.energy_param}") - pars, errs, cov = return_nans(fwhm_linear.func) - self.fwhm_fit_linear = { - "function": fwhm_linear.__name__, - "module": fwhm_linear.__module__, - "expression": fwhm_linear.string_func("x"), - "Qbb_fwhm_in_keV": np.nan, - "Qbb_fwhm_err_in_keV": np.nan, - "parameters": pars, - "uncertainties": errs, - "cov": cov, - "csqr": (np.nan, np.nan), - "p_val": 0, - } - log.error("FWHM linear fit failed to converge") - try: - if 2614.50 not in fwhm_peaks: - raise RuntimeError - c_quad = cost.LeastSquares( - fwhm_peaks, fit_fwhms, fit_dfwhms, fwhm_quadratic.func - ) - c_quad.loss = "soft_l1" - m_quad = Minuit( - c_quad, *fwhm_quadratic.guess(fwhm_peaks, fit_fwhms, fit_dfwhms) - ) - m_quad.limits = fwhm_quadratic.bounds() - m_quad.simplex() - m_quad.migrad() - m_quad.hesse() - - rng = np.random.default_rng(1) - pars_b = rng.multivariate_normal( - m_quad.values, m_quad.covariance, size=1000 - ) - fits = np.array( - [fwhm_quadratic.func(fwhm_peaks, *par_b) for par_b in pars_b] - ) - qbb_vals = np.array( - [fwhm_quadratic.func(2039.0, *par_b) for par_b in pars_b] - ) - qbb_err = np.nanstd(qbb_vals) - predicted_fwhms = fwhm_quadratic.func(fwhm_peaks, *m_quad.values) - fit_qbb = fwhm_quadratic.func(2039.0, *m_quad.values) - - p_val = scipy.stats.chi2.sf( - m_quad.fval, len(fwhm_peaks) - len(m_quad.values) - ) - - self.fwhm_fit_quadratic = { - "function": fwhm_quadratic.__name__, - "module": fwhm_quadratic.__module__, - "expression": fwhm_quadratic.string_func("x"), - "Qbb_fwhm_in_keV": fit_qbb, - "Qbb_fwhm_err_in_keV": qbb_err, - "parameters": m_quad.values, - "uncertainties": m_quad.errors, - "cov": m_quad.covariance, - "csqr": (m_quad.fval, len(fwhm_peaks) - len(m_quad.values)), - "p_val": p_val, - } - log.info( - f'FWHM quadratic fit: {self.fwhm_fit_quadratic["parameters"].to_dict()}' - ) - log.info( - f"FWHM energy resolution at Qbb (quadratic fit): {fit_qbb:1.2f} +- {qbb_err:1.2f} keV" - ) - except RuntimeError: - log.error(f"FWHM quadratic fit failed for {self.energy_param}") - pars, errs, cov = return_nans(fwhm_quadratic.func) - self.fwhm_fit_quadratic = { - "function": fwhm_quadratic.__name__, - "module": fwhm_quadratic.__module__, - "expression": fwhm_quadratic.string_func("x"), - "Qbb_fwhm_in_keV": np.nan, - "Qbb_fwhm_err_in_keV": np.nan, - "parameters": pars, - "uncertainties": errs, - "cov": cov, - "csqr": (np.nan, np.nan), - "p_val": 0, - } - log.error("FWHM quadratic fit failed to converge") - - def gen_pars_dict(self): - if self.deg == 1: - out_dict = { - "expression": f"a*{self.energy_param}+b", - "parameters": {"a": self.pars[0], "b": self.pars[1]}, - } - elif self.deg == 0: - out_dict = { - "expression": f"a*{self.energy_param}", - "parameters": {"a": self.pars[0]}, - } - elif self.deg == 2: - out_dict = { - "expression": f"a*{self.energy_param}**2 +b*{self.energy_param}+c", - "parameters": {"a": self.pars[0], "b": self.pars[1], "c": self.pars[2]}, - } - else: - out_dict = {} - log.warning(f"hit_dict not implemented for deg = {self.deg}") - - return out_dict - - def get_results_dict(self, data): - if np.isnan(self.pars).all(): - return {} - else: - fwhm_linear = self.fwhm_fit_linear.copy() - fwhm_linear["parameters"] = fwhm_linear["parameters"].to_dict() - fwhm_linear["uncertainties"] = fwhm_linear["uncertainties"].to_dict() - fwhm_linear["cov"] = fwhm_linear["cov"].tolist() - fwhm_quad = self.fwhm_fit_quadratic.copy() - fwhm_quad["parameters"] = 
fwhm_quad["parameters"].to_dict() - fwhm_quad["uncertainties"] = fwhm_quad["uncertainties"].to_dict() - fwhm_quad["cov"] = fwhm_quad["cov"].tolist() - - pk_dict = { - Ei: { - "function": func_i.__name__, - "module": func_i.__module__, - "parameters_in_ADC": parsi.to_dict(), - "uncertainties_in_ADC": errorsi.to_dict(), - "p_val": pvali, - "fwhm_in_keV": list(fwhmi), - } - for i, (Ei, parsi, errorsi, pvali, fwhmi, func_i) in enumerate( - zip( - self.results["fitted_keV"], - self.results["pk_pars"][self.results["pk_validities"]], - self.results["pk_errors"][self.results["pk_validities"]], - self.results["pk_pvals"][self.results["pk_validities"]], - self.results["pk_fwhms"], - self.funcs, - ) - ) - } - - return { - "total_fep": len( - data.query( - f"{self.cal_energy_param}>2604&{self.cal_energy_param}<2624" - ) - ), - "total_dep": len( - data.query( - f"{self.cal_energy_param}>1587&{self.cal_energy_param}<1597" - ) - ), - "pass_fep": len( - data.query( - f"{self.cal_energy_param}>2604&{self.cal_energy_param}<2624&{self.selection_string}" - ) - ), - "pass_dep": len( - data.query( - f"{self.cal_energy_param}>1587&{self.cal_energy_param}<1597&{self.selection_string}" - ) - ), - "eres_linear": fwhm_linear, - "eres_quadratic": fwhm_quad, - "fitted_peaks": self.results["fitted_keV"].tolist(), - "pk_fits": pk_dict, - } - - def calibrate_parameter(self, data): - kev_ranges = self.range_keV.copy() - if self.guess_keV is None: - self.guess_keV = 2620 / np.nanpercentile( - data.query( - f"{self.selection_string} & {self.energy_param}>{self.threshold}" - )[self.energy_param], - 99, - ) - - log.debug(f"Find peaks and compute calibration curve for {self.energy_param}") - log.debug(f"Guess is {self.guess_keV:.3f}") - - try: - self.pars, self.cov, self.results = cal.hpge_E_calibration( - data.query(self.selection_string)[self.energy_param], - self.glines, - self.guess_keV, - deg=self.deg, - range_keV=kev_ranges, - funcs=self.funcs, - gof_funcs=self.gof_funcs, - n_events=self.n_events, - allowed_p_val=self.p_val, - simplex=self.simplex, - tail_weight=self.tail_weight, - verbose=False, - ) - pk_pars = self.results["pk_pars"] - found_peaks = self.results["got_peaks_locs"] - fitted_peaks = self.results["fitted_keV"] - fitted_funcs = self.results["pk_funcs"] - if self.pars is None: - raise ValueError - - for i, peak in enumerate(self.results["got_peaks_keV"]): - idx = np.where(peak == self.glines)[0][0] - self.funcs[idx] = fitted_funcs[i] - if fitted_funcs[i] == pgf.extended_radford_pdf: - self.gof_funcs[idx] = pgf.radford_pdf - else: - self.gof_funcs[idx] = pgf.gauss_step_pdf - except: - found_peaks = np.array([]) - fitted_peaks = np.array([]) - fitted_funcs = np.array([]) - if len(fitted_peaks) != len(self.glines): - for i, peak in enumerate(self.glines): - if peak not in fitted_peaks: - kev_ranges[i] = (kev_ranges[i][0] - 5, kev_ranges[i][1] - 5) - for i, peak in enumerate(fitted_peaks): - try: - if ( - self.results["pk_fwhms"][:, 1][i] - / self.results["pk_fwhms"][:, 0][i] - > 0.05 - ): - index = np.where(self.glines == peak)[0][0] - kev_ranges[index] = ( - kev_ranges[index][0] - 5, - kev_ranges[index][1] - 5, - ) - except: - pass - try: - self.pars, self.cov, self.results = cal.hpge_E_calibration( - data.query(self.selection_string)[self.energy_param], - self.glines, - self.guess_keV, - deg=self.deg, - range_keV=kev_ranges, - funcs=self.funcs, - gof_funcs=self.gof_funcs, - n_events=self.n_events, - allowed_p_val=self.p_val, - simplex=self.simplex, - tail_weight=self.tail_weight, - verbose=False, - ) - 
fitted_peaks = self.results["fitted_keV"] - fitted_funcs = self.results["pk_funcs"] - - log.debug("Calibrated found") - log.info(f"Calibration pars are {self.pars}") - - for i, peak in enumerate(self.results["got_peaks_keV"]): - idx = np.where(peak == self.glines)[0][0] - self.funcs[idx] = fitted_funcs[i] - if fitted_funcs[i] == pgf.extended_radford_pdf: - self.gof_funcs[idx] = pgf.radford_pdf - else: - self.gof_funcs[idx] = pgf.gauss_step_pdf - if self.pars is None: - raise ValueError - - except: - self.pars = np.full(self.deg + 1, np.nan) - self.results = None - - log.error(f"Calibration failed completely for {self.energy_param}") - else: - log.debug("Calibrated found") - log.info(f"Calibration pars are {self.pars}") - if ~np.isnan(self.pars).all(): - self.fit_energy_res() - self.hit_dict = {self.cal_energy_param: self.gen_pars_dict()} - data[self.cal_energy_param] = pgf.poly(data[self.energy_param], self.pars) - - def fill_plot_dict(self, data, plot_dict={}): - for key, item in self.plot_options.items(): - if item["options"] is not None: - plot_dict[key] = item["function"](self, data, **item["options"]) - else: - plot_dict[key] = item["function"](self, data) - return plot_dict - - -class high_stats_fitting(calibrate_parameter): - glines = [ - 238.632, - 511, - 583.191, - 727.330, - 763, - 785, - 860.564, - 893, - 1079, - 1513, - 1592.53, - 1620.50, - 2103.53, - 2614.50, - 3125, - 3198, - 3474, - ] # gamma lines used for calibration - range_keV = [ - (10, 10), - (30, 30), - (30, 30), - (30, 30), - (30, 15), - (15, 30), - (30, 25), - (25, 30), - (30, 30), - (30, 30), - (30, 20), - (20, 30), - (30, 30), - (30, 30), - (30, 30), - (30, 30), - (30, 30), - ] # side bands width - binning = [ - 0.02, - 0.02, - 0.02, - 0.02, - 0.2, - 0.2, - 0.02, - 0.2, - 0.2, - 0.2, - 0.1, - 0.1, - 0.1, - 0.02, - 0.2, - 0.2, - 0.2, - ] - funcs = [ - pgf.extended_gauss_step_pdf, # probably should be gauss on exp - pgf.extended_gauss_step_pdf, - pgf.extended_radford_pdf, - pgf.extended_radford_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_radford_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_radford_pdf, - pgf.extended_radford_pdf, - pgf.extended_radford_pdf, - pgf.extended_radford_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - pgf.extended_gauss_step_pdf, - ] - gof_funcs = [ - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - pgf.radford_pdf, - pgf.radford_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - pgf.radford_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - pgf.radford_pdf, - pgf.radford_pdf, - pgf.radford_pdf, - pgf.radford_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - pgf.gauss_step_pdf, - ] - - def __init__( - self, - energy_param, - selection_string, - threshold, - p_val, - plot_options={}, - simplex=False, - tail_weight=20, - cal_energy_param=None, - deg=2, - fixed=None, - ): - self.energy_param = energy_param - if cal_energy_param is None: - self.cal_energy_param = energy_param - else: - self.cal_energy_param = cal_energy_param - self.selection_string = selection_string - self.threshold = threshold - self.p_val = p_val - self.plot_options = plot_options - self.simplex = simplex - self.results = {} - self.plot_dict = {} - self.n_events = None - self.output_dict = {} - self.pars = [1, 0] - self.tail_weight = tail_weight - self.fixed = fixed - self.deg = deg - - def get_results_dict(self, data): - if self.results: - fwhm_linear = 
self.fwhm_fit_linear.copy() - fwhm_linear["parameters"] = fwhm_linear["parameters"].to_dict() - fwhm_linear["uncertainties"] = fwhm_linear["uncertainties"].to_dict() - fwhm_linear["cov"] = fwhm_linear["cov"].tolist() - fwhm_quad = self.fwhm_fit_quadratic.copy() - fwhm_quad["parameters"] = fwhm_quad["parameters"].to_dict() - fwhm_quad["uncertainties"] = fwhm_quad["uncertainties"].to_dict() - fwhm_quad["cov"] = fwhm_quad["cov"].tolist() - - pk_dict = { - Ei: { - "function": func_i.__name__, - "module": func_i.__module__, - "parameters_in_keV": parsi.to_dict(), - "uncertainties_in_keV": errorsi.to_dict(), - "p_val": pvali, - "fwhm_in_keV": list(fwhmi), - } - for i, (Ei, parsi, errorsi, pvali, fwhmi, func_i) in enumerate( - zip( - self.results["fitted_keV"], - self.results["pk_pars"][self.results["pk_validities"]], - self.results["pk_errors"][self.results["pk_validities"]], - self.results["pk_pvals"][self.results["pk_validities"]], - self.results["pk_fwhms"], - self.funcs, - ) - ) - } - - return { - "eres_linear": fwhm_linear, - "eres_quadratic": fwhm_quad, - "fitted_peaks": self.results["fitted_keV"].tolist(), - "pk_fits": pk_dict, - } - else: - return {} - - def run_fit(self, data): - hist, bins, var = pgh.get_hist( - data.query(self.selection_string)[self.energy_param], - range=(np.amin(self.glines) * 0.8, np.amax(self.glines) * 1.1), - dx=0.5, - ) - (got_peak_locations, got_peak_energies, roughpars) = cal.hpge_get_E_peaks( - hist, bins, var, np.array([1, 0]), n_sigma=3, peaks_keV=self.glines - ) - - found_mask = np.in1d(self.glines, got_peak_energies) - self.results["got_peaks_locs"] = got_peak_locations - self.results["got_peaks_keV"] = got_peak_energies - - log.info(f"{len(got_peak_energies)} peaks obtained:") - log.info(f"\t Energy | Position ") - for i, (Li, Ei) in enumerate(zip(got_peak_locations, got_peak_energies)): - log.info(f"\t{i}".ljust(4) + str(Ei).ljust(9) + f"| {Li:g}".ljust(5)) - - self.glines = np.array(self.glines)[found_mask].tolist() - self.range_keV = np.array(self.range_keV)[found_mask].tolist() - self.binning = np.array(self.binning)[found_mask].tolist() - self.funcs = np.array(self.funcs)[found_mask].tolist() - self.gof_funcs = np.array(self.gof_funcs)[found_mask].tolist() - - n_bins = [ - int((self.range_keV[i][1] + self.range_keV[i][0]) / self.binning[i]) - for i in range(len(self.glines)) - ] - ( - pk_pars, - pk_errors, - pk_covs, - pk_binws, - pk_ranges, - pk_pvals, - valid_pks, - pk_funcs, - ) = cal.hpge_fit_E_peaks( - data.query(self.selection_string)[self.energy_param], - self.glines, - self.range_keV, - n_bins=n_bins, - funcs=self.funcs, - method="unbinned", - gof_funcs=self.gof_funcs, - n_events=None, - allowed_p_val=self.p_val, - tail_weight=20, - ) - for idx, peak in enumerate(self.glines): - self.funcs[idx] = pk_funcs[idx] - if pk_funcs[idx] == pgf.extended_radford_pdf: - self.gof_funcs[idx] = pgf.radford_pdf - else: - self.gof_funcs[idx] = pgf.gauss_step_pdf - - self.results["got_peaks_keV"] = self.glines - self.results["pk_pars"] = pk_pars - self.results["pk_errors"] = pk_errors - self.results["pk_covs"] = pk_covs - self.results["pk_binws"] = pk_binws - self.results["pk_ranges"] = pk_ranges - self.results["pk_pvals"] = pk_pvals - - for i, pk in enumerate(self.results["got_peaks_keV"]): - try: - if self.results["pk_pars"][i]["n_sig"] < 10: - valid_pks[i] = False - elif ( - 2 * self.results["pk_errors"][i]["n_sig"] - > self.results["pk_pars"][i]["n_sig"] - ): - valid_pks[i] = False - except: - pass - - self.results["pk_validities"] = valid_pks - - # Drop 
failed fits - self.results["fitted_keV"] = np.asarray(self.glines)[valid_pks] - - def fit_peaks(self, data): - log.debug(f"Fitting {self.energy_param}") - try: - self.run_fit(data) - - valid_pks = self.results["pk_validities"] - fitted_peaks_keV = self.results["fitted_keV"] - pk_pars = np.asarray(self.results["pk_pars"], dtype=object)[ - valid_pks - ] # ragged - pk_errors = np.asarray(self.results["pk_errors"], dtype=object)[valid_pks] - pk_covs = np.asarray(self.results["pk_covs"], dtype=object)[valid_pks] - pk_funcs = np.asarray(self.funcs)[valid_pks] - - log.info(f"{len(np.where(valid_pks)[0])} peaks fitted:") - for i, (Ei, parsi, errorsi, covsi, func_i) in enumerate( - zip(fitted_peaks_keV, pk_pars, pk_errors, pk_covs, pk_funcs) - ): - varnames = func_i.__code__.co_varnames[1 : len(pk_pars[-1]) + 1] - parsi = np.asarray(parsi, dtype=float) - errorsi = np.asarray(errorsi, dtype=float) - covsi = np.asarray(covsi, dtype=float) - - log.info(f"\tEnergy: {str(Ei)}") - log.info(f"\t\tParameter | Value +/- Sigma ") - for vari, pari, errorsi in zip(varnames, parsi, errorsi): - log.info(f"\t\t{str(vari):<12} | {pari: 8.2f} +/- {errorsi:.2f}") - - cal_fwhms = [ - pgf.get_fwhm_func(func_i, pars_i, cov=covs_i) - for func_i, pars_i, covs_i in zip(pk_funcs, pk_pars, pk_covs) - ] - - cal_fwhms, cal_fwhms_errs = zip(*cal_fwhms) - cal_fwhms = np.asarray(cal_fwhms) - cal_fwhms_errs = np.asarray(cal_fwhms_errs) - self.results["pk_fwhms"] = np.asarray( - [(u, e) for u, e in zip(cal_fwhms, cal_fwhms_errs)] - ) - - log.info(f"{len(cal_fwhms)} FWHMs found:") - log.info(f"\t{'Energy':>10}{'| FWHM':>9}") - for i, (Ei, fwhm, fwhme) in enumerate( - zip(fitted_peaks_keV, cal_fwhms, cal_fwhms_errs) - ): - log.info( - f"\t{str(i):<4}{str(Ei):<9}| {f'{fwhm:.2f}+-{fwhme:.2f}':<10} keV" - ) - self.fit_energy_res() - log.debug(f"high stats fitting successful") - except: - self.results = {} - log.debug(f"high stats fitting failed") - - def update_calibration(self, data): - log.debug(f"Calibrating {self.energy_param}") - self.run_fit(data) - - valid_pks = self.results["pk_validities"] - fitted_peaks_keV = self.results["fitted_keV"] - pk_pars = np.asarray(self.results["pk_pars"], dtype=object)[valid_pks] # ragged - pk_errors = np.asarray(self.results["pk_errors"], dtype=object)[valid_pks] - pk_covs = np.asarray(self.results["pk_covs"], dtype=object)[valid_pks] - pk_funcs = np.asarray(self.funcs)[valid_pks] - - log.info(f"{len(np.where(valid_pks)[0])} peaks fitted:") - for i, (Ei, parsi, errorsi, covsi, func_i) in enumerate( - zip(fitted_peaks_keV, pk_pars, pk_errors, pk_covs, pk_funcs) - ): - varnames = func_i.__code__.co_varnames[1 : len(pk_pars[-1]) + 1] - parsi = np.asarray(parsi, dtype=float) - errorsi = np.asarray(errorsi, dtype=float) - covsi = np.asarray(covsi, dtype=float) - # parsigsi = np.sqrt(covsi.diagonal()) - log.info(f"\tEnergy: {str(Ei)}") - log.info(f"\t\tParameter | Value +/- Sigma ") - for vari, pari, errorsi in zip(varnames, parsi, errorsi): - log.info(f"\t\t{str(vari):<12} | {pari: 8.2f} +/- {errorsi:.2f}") - # Drop failed fits - - mus = [ - pgf.get_mu_func(func_i, pars_i, errors=errors_i) - for func_i, pars_i, errors_i in zip(pk_funcs, pk_pars, pk_errors) - ] - mus, mu_vars = zip(*mus) - mus = np.asarray(mus) - mu_errs = np.asarray(mu_vars) - mu_vars = np.asarray(mu_vars) ** 2 - - try: - pars, errs, cov = cal.hpge_fit_E_scale( - mus, mu_vars, fitted_peaks_keV, deg=self.deg, fixed=self.fixed - ) - except ValueError: - log.error("Failed to fit enough peaks to get accurate calibration") - return None, 
None, None, results - - # Invert the E scale fit to get a calibration function - self.pars, self.errs, self.cov = cal.hpge_fit_E_cal_func( - mus, mu_vars, fitted_peaks_keV, pars, deg=self.deg, fixed=self.fixed - ) - - uncal_fwhms = [ - pgf.get_fwhm_func(func_i, pars_i, cov=covs_i) - for func_i, pars_i, covs_i in zip(pk_funcs, pk_pars, pk_covs) - ] - uncal_fwhms, uncal_fwhm_errs = zip(*uncal_fwhms) - uncal_fwhms = np.asarray(uncal_fwhms) - uncal_fwhm_errs = np.asarray(uncal_fwhm_errs) - derco = np.polyder(np.poly1d(pars)).coefficients - der = [pgf.poly(Ei, derco) for Ei in fitted_peaks_keV] - - cal_fwhms = uncal_fwhms * der - cal_fwhms_errs = uncal_fwhm_errs * der - self.results["pk_fwhms"] = np.asarray( - [(u * d, e * d) for u, e, d in zip(uncal_fwhms, uncal_fwhm_errs, der)] - ) - - log.info(f"{len(cal_fwhms)} FWHMs found:") - log.info(f"\t{'Energy':>10}{'| FWHM':>9}") - for i, (Ei, fwhm, fwhme) in enumerate( - zip(fitted_peaks_keV, cal_fwhms, cal_fwhms_errs) - ): - log.info(f"\t{str(i):<4}{str(Ei):<9}| {f'{fwhm:.2f}+-{fwhme:.2f}':<10} keV") - self.fit_energy_res() - if self.cal_energy_param == self.energy_param: - log.info( - "Warning dataframe energy will be overwritten as cal energy and input energy have same name" - ) - self.hit_dict = {self.cal_energy_param: self.gen_pars_dict()} - data[self.cal_energy_param] = pgf.poly(data[self.energy_param], self.pars) - log.debug(f"high stats calibration successful") - - -def get_peak_labels( - labels: list[str], pars: list[float] -) -> tuple(list[float], list[float]): - out = [] - out_labels = [] - for i, label in enumerate(labels): - if i % 2 == 1: - continue - else: - out.append(f"{pgf.poly(label, pars):.1f}") - out_labels.append(label) - return out_labels, out - - -def get_peak_label(peak: float) -> str: - if peak == 583.191: - return "Tl 583" - elif peak == 727.33: - return "Bi 727" - elif peak == 860.564: - return "Tl 860" - elif peak == 1592.53: - return "Tl DEP" - elif peak == 1620.5: - return "Bi FEP" - elif peak == 2103.53: - return "Tl SEP" - elif peak == 2614.5: - return "Tl FEP" - else: - return "" - - -def plot_fits( - ecal_class, data, figsize=[12, 8], fontsize=12, ncols=3, nrows=3, binning_keV=5 -): - plt.rcParams["figure.figsize"] = figsize - plt.rcParams["font.size"] = fontsize - - fitted_peaks = ecal_class.results["got_peaks_keV"] - pk_pars = ecal_class.results["pk_pars"] - pk_ranges = ecal_class.results["pk_ranges"] - p_vals = ecal_class.results["pk_pvals"] - - fitted_gof_funcs = [] - for i, peak in enumerate(ecal_class.glines): - if peak in fitted_peaks: - fitted_gof_funcs.append(ecal_class.gof_funcs[i]) - - mus = [ - pgf.get_mu_func(func_i, pars_i) if pars_i is not None else np.nan - for func_i, pars_i in zip(fitted_gof_funcs, pk_pars) - ] - - fig = plt.figure() - derco = np.polyder(np.poly1d(ecal_class.pars)).coefficients - der = [pgf.poly(5, derco) for Ei in fitted_peaks] - for i, peak in enumerate(mus): - range_adu = 5 / der[i] - plt.subplot(nrows, ncols, i + 1) - try: - binning = np.arange(pk_ranges[i][0], pk_ranges[i][1], 0.1 / der[i]) - bin_cs = (binning[1:] + binning[:-1]) / 2 - energies = data.query( - f"{ecal_class.energy_param}>{pk_ranges[i][0]}&{ecal_class.energy_param}<{pk_ranges[i][1]}&{ecal_class.selection_string}" - )[ecal_class.energy_param] - energies = energies.iloc[: ecal_class.n_events] - - counts, bs, bars = plt.hist(energies, bins=binning, histtype="step") - if pk_pars[i] is not None: - fit_vals = ( - fitted_gof_funcs[i](bin_cs, *pk_pars[i][:-1], 0) * np.diff(bs)[0] - ) - plt.plot(bin_cs, fit_vals) - 
plt.step( - bin_cs, - [ - (fval - count) / count if count != 0 else (fval - count) - for count, fval in zip(counts, fit_vals) - ], - where="mid", - ) - - plt.annotate( - get_peak_label(fitted_peaks[i]), - (0.02, 0.9), - xycoords="axes fraction", - ) - plt.annotate( - f"{fitted_peaks[i]:.1f} keV", (0.02, 0.8), xycoords="axes fraction" - ) - plt.annotate( - f"p-value : {p_vals[i]:.4f}", (0.02, 0.7), xycoords="axes fraction" - ) - plt.xlabel("Energy (keV)") - plt.ylabel("Counts") - plt.legend(loc="upper left", frameon=False) - plt.xlim([peak - range_adu, peak + range_adu]) - locs, labels = plt.xticks() - new_locs, new_labels = get_peak_labels(locs, ecal_class.pars) - plt.xticks(ticks=new_locs, labels=new_labels) - except: - pass - - plt.tight_layout() - plt.close() - return fig - - -def plot_2614_timemap( - ecal_class, - data, - figsize=[12, 8], - fontsize=12, - erange=[2580, 2630], - dx=1, - time_dx=180, -): - plt.rcParams["figure.figsize"] = figsize - plt.rcParams["font.size"] = fontsize - - selection = data.query( - f"{ecal_class.cal_energy_param}>2560&{ecal_class.cal_energy_param}<2660&{ecal_class.selection_string}" - ) - - fig = plt.figure() - if len(selection) == 0: - pass - else: - time_bins = np.arange( - (np.amin(data["timestamp"]) // time_dx) * time_dx, - ((np.amax(data["timestamp"]) // time_dx) + 2) * time_dx, - time_dx, - ) - - plt.hist2d( - selection["timestamp"], - selection[ecal_class.cal_energy_param], - bins=[time_bins, np.arange(erange[0], erange[1] + dx, dx)], - norm=LogNorm(), - ) - - ticks, labels = plt.xticks() - plt.xlabel( - f"Time starting : {datetime.utcfromtimestamp(ticks[0]).strftime('%d/%m/%y %H:%M')}" - ) - plt.ylabel("Energy(keV)") - plt.ylim([erange[0], erange[1]]) - - plt.xticks( - ticks, - [datetime.utcfromtimestamp(tick).strftime("%H:%M") for tick in ticks], - ) - plt.close() - return fig - - -def plot_pulser_timemap( - ecal_class, - data, - pulser_field="is_pulser", - figsize=[12, 8], - fontsize=12, - dx=0.2, - time_dx=180, - n_spread=3, -): - plt.rcParams["figure.figsize"] = figsize - plt.rcParams["font.size"] = fontsize - - time_bins = np.arange( - (np.amin(data["timestamp"]) // time_dx) * time_dx, - ((np.amax(data["timestamp"]) // time_dx) + 2) * time_dx, - time_dx, - ) - - selection = data.query(pulser_field) - fig = plt.figure() - if len(selection) == 0: - pass - - else: - mean = np.nanpercentile(selection[ecal_class.cal_energy_param], 50) - spread = mean - np.nanpercentile(selection[ecal_class.cal_energy_param], 10) - - plt.hist2d( - selection["timestamp"], - selection[ecal_class.cal_energy_param], - bins=[ - time_bins, - np.arange(mean - n_spread * spread, mean + n_spread * spread + dx, dx), - ], - norm=LogNorm(), - ) - plt.ylim([mean - n_spread * spread, mean + n_spread * spread]) - ticks, labels = plt.xticks() - plt.xlabel( - f"Time starting : {datetime.utcfromtimestamp(ticks[0]).strftime('%d/%m/%y %H:%M')}" - ) - plt.ylabel("Energy(keV)") - - plt.xticks( - ticks, - [datetime.utcfromtimestamp(tick).strftime("%H:%M") for tick in ticks], - ) - plt.close() - return fig - - -def bin_pulser_stability(ecal_class, data, pulser_field="is_pulser", time_slice=180): - selection = data.query(pulser_field) - - utime_array = data["timestamp"] - select_energies = selection[ecal_class.cal_energy_param].to_numpy() - - time_bins = np.arange( - (np.amin(utime_array) // time_slice) * time_slice, - ((np.amax(utime_array) // time_slice) + 2) * time_slice, - time_slice, - ) - # bin time values - times_average = (time_bins[:-1] + time_bins[1:]) / 2 - - if len(selection) 
== 0: - return { - "time": times_average, - "energy": np.full_like(times_average, np.nan), - "spread": np.full_like(times_average, np.nan), - } - - nanmedian = ( - lambda x: np.nanpercentile(x, 50) if len(x[~np.isnan(x)]) >= 10 else np.nan - ) - error = ( - lambda x: np.nanvar(x) / np.sqrt(len(x)) - if len(x[~np.isnan(x)]) >= 10 - else np.nan - ) - - par_average, _, _ = binned_statistic( - selection["timestamp"], select_energies, statistic=nanmedian, bins=time_bins - ) - par_error, _, _ = binned_statistic( - selection["timestamp"], select_energies, statistic=error, bins=time_bins - ) - - return {"time": times_average, "energy": par_average, "spread": par_error} - - -def bin_stability(ecal_class, data, time_slice=180, energy_range=[2585, 2660]): - selection = data.query( - f"{ecal_class.cal_energy_param}>{energy_range[0]}&{ecal_class.cal_energy_param}<{energy_range[1]}&{ecal_class.selection_string}" - ) - - utime_array = data["timestamp"] - select_energies = selection[ecal_class.cal_energy_param].to_numpy() - - time_bins = np.arange( - (np.amin(utime_array) // time_slice) * time_slice, - ((np.amax(utime_array) // time_slice) + 2) * time_slice, - time_slice, - ) - # bin time values - times_average = (time_bins[:-1] + time_bins[1:]) / 2 - - if len(selection) == 0: - return { - "time": times_average, - "energy": np.full_like(times_average, np.nan), - "spread": np.full_like(times_average, np.nan), - } - - nanmedian = ( - lambda x: np.nanpercentile(x, 50) if len(x[~np.isnan(x)]) >= 10 else np.nan - ) - error = ( - lambda x: np.nanvar(x) / np.sqrt(len(x)) - if len(x[~np.isnan(x)]) >= 10 - else np.nan - ) - - par_average, _, _ = binned_statistic( - selection["timestamp"], select_energies, statistic=nanmedian, bins=time_bins - ) - par_error, _, _ = binned_statistic( - selection["timestamp"], select_energies, statistic=error, bins=time_bins - ) - - return {"time": times_average, "energy": par_average, "spread": par_error} - - -def plot_cal_fit(ecal_class, data, figsize=[12, 8], fontsize=12, erange=[200, 2700]): - valid_fits = ecal_class.results["pk_validities"] - pk_pars = ecal_class.results["pk_pars"][valid_fits] - pk_errs = ecal_class.results["pk_errors"][valid_fits] - fitted_peaks = ecal_class.results["got_peaks_keV"] - - fitted_gof_funcs = [] - for i, peak in enumerate(ecal_class.glines): - if peak in fitted_peaks: - fitted_gof_funcs.append(ecal_class.gof_funcs[i]) - - fitted_gof_funcs = np.array(fitted_gof_funcs)[valid_fits] - fitted_peaks = np.array(fitted_peaks)[valid_fits] - - mus = [ - pgf.get_mu_func(func_i, pars_i) if pars_i is not None else np.nan - for func_i, pars_i in zip(fitted_gof_funcs, pk_pars) - ] - - mu_errs = [ - pgf.get_mu_func(func_i, pars_i) if pars_i is not None else np.nan - for func_i, pars_i in zip(fitted_gof_funcs, pk_errs) - ] - - plt.rcParams["figure.figsize"] = figsize - plt.rcParams["font.size"] = fontsize - - fig, (ax1, ax2) = plt.subplots( - 2, 1, sharex=True, gridspec_kw={"height_ratios": [3, 1]} - ) - - cal_bins = np.linspace(0, np.nanmax(mus) * 1.1, 20) - - ax1.scatter(fitted_peaks, mus, marker="x", c="b") - - ax1.plot(pgf.poly(cal_bins, ecal_class.pars), cal_bins, lw=1, c="g") - - ax1.grid() - ax1.set_xlim([erange[0], erange[1]]) - ax1.set_ylabel("Energy (ADC)") - ax2.errorbar( - fitted_peaks, - pgf.poly(np.array(mus), ecal_class.pars) - fitted_peaks, - yerr=pgf.poly(np.array(mus) + np.array(mu_errs), ecal_class.pars) - - pgf.poly(np.array(mus), ecal_class.pars), - linestyle=" ", - marker="x", - c="b", - ) - ax2.grid() - ax2.set_xlabel("Energy (keV)") - 
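
# --- illustrative sketch, not part of this patch -----------------------------
# The stability helpers above reduce the calibrated energy of a reference line
# to a median and spread per time slice with scipy.stats.binned_statistic.
# Stand-alone analogue on fake data; the timestamps, energies and the 180 s
# slice width below are invented examples.
import numpy as np
from scipy.stats import binned_statistic

rng = np.random.default_rng(0)
timestamps = np.sort(rng.uniform(0, 3600, 5000))      # one hour of fake events
energies = rng.normal(2614.5, 1.2, timestamps.size)   # fake 2614.5 keV line

time_slice = 180
time_bins = np.arange(
    (timestamps.min() // time_slice) * time_slice,
    ((timestamps.max() // time_slice) + 2) * time_slice,
    time_slice,
)

def robust_median(x):
    # require a minimum number of valid entries per bin, as the helpers above do
    return np.nanpercentile(x, 50) if np.count_nonzero(~np.isnan(x)) >= 10 else np.nan

med, _, _ = binned_statistic(timestamps, energies, statistic=robust_median, bins=time_bins)
print(med[:5])   # median line position per 3-minute slice
# ------------------------------------------------------------------------------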
ax2.set_ylabel("Residuals (keV)") - plt.close() - return fig - - -def plot_eres_fit(ecal_class, data, erange=[200, 2700], figsize=[12, 8], fontsize=12): - plt.rcParams["figure.figsize"] = figsize - plt.rcParams["font.size"] = fontsize - - fwhms = ecal_class.results["pk_fwhms"][:, 0] - dfwhms = ecal_class.results["pk_fwhms"][:, 1] - fitted_peaks = ecal_class.results["fitted_keV"] - - ##### - # Remove the Tl SEP and DEP from calibration if found - fwhm_peaks = np.array([], dtype=np.float32) - indexes = [] - for i, peak in enumerate(fitted_peaks): - if peak == 2103.53: - log.info(f"Tl SEP found at index {i}") - indexes.append(i) - continue - elif peak == 1592.53: - log.info(f"Tl DEP found at index {i}") - indexes.append(i) - continue - elif np.isnan(dfwhms[i]): - log.info(f"{peak} failed") - indexes.append(i) - continue - elif peak == 511.0: - log.info(f"e annihilation found at index {i}") - indexes.append(i) - continue - else: - fwhm_peaks = np.append(fwhm_peaks, peak) - fit_fwhms = np.delete(fwhms, [indexes]) - fit_dfwhms = np.delete(dfwhms, [indexes]) - - fig, (ax1, ax2) = plt.subplots( - 2, 1, sharex=True, gridspec_kw={"height_ratios": [3, 1]} - ) - if len(np.where((~np.isnan(fit_fwhms)) & (~np.isnan(fit_dfwhms)))[0]) > 0: - ax1.errorbar( - fwhm_peaks, fit_fwhms, yerr=fit_dfwhms, marker="x", ls=" ", c="black" - ) - - fwhm_slope_bins = np.arange(erange[0], erange[1], 10) - - qbb_line_vx = [2039.0, 2039.0] - qbb_line_vy = [ - 0.9 - * np.nanmin( - fwhm_linear.func( - fwhm_slope_bins, *ecal_class.fwhm_fit_linear["parameters"] - ) - ), - np.nanmax( - [ - ecal_class.fwhm_fit_linear["Qbb_fwhm_in_keV"], - ecal_class.fwhm_fit_quadratic["Qbb_fwhm_in_keV"], - ] - ), - ] - qbb_line_hx = [erange[0], 2039.0] - - ax1.plot( - fwhm_slope_bins, - fwhm_linear.func( - fwhm_slope_bins, *ecal_class.fwhm_fit_linear["parameters"] - ), - lw=1, - c="g", - label=f'linear, Qbb fwhm: {ecal_class.fwhm_fit_linear["Qbb_fwhm_in_keV"]:1.2f} +- {ecal_class.fwhm_fit_linear["Qbb_fwhm_err_in_keV"]:1.2f} keV', - ) - ax1.plot( - fwhm_slope_bins, - fwhm_quadratic.func( - fwhm_slope_bins, *ecal_class.fwhm_fit_quadratic["parameters"] - ), - lw=1, - c="b", - label=f'quadratic, Qbb fwhm: {ecal_class.fwhm_fit_quadratic["Qbb_fwhm_in_keV"]:1.2f} +- {ecal_class.fwhm_fit_quadratic["Qbb_fwhm_err_in_keV"]:1.2f} keV', - ) - ax1.plot( - qbb_line_hx, - [ - ecal_class.fwhm_fit_linear["Qbb_fwhm_in_keV"], - ecal_class.fwhm_fit_linear["Qbb_fwhm_in_keV"], - ], - lw=1, - c="r", - ls="--", - ) - ax1.plot( - qbb_line_hx, - [ - ecal_class.fwhm_fit_quadratic["Qbb_fwhm_in_keV"], - ecal_class.fwhm_fit_quadratic["Qbb_fwhm_in_keV"], - ], - lw=1, - c="r", - ls="--", - ) - ax1.plot(qbb_line_vx, qbb_line_vy, lw=1, c="r", ls="--") - - ax1.legend(loc="upper left", frameon=False) - if np.isnan(ecal_class.fwhm_fit_linear["parameters"]).all(): - [ - 0.9 * np.nanmin(fit_fwhms), - 1.1 * np.nanmax(fit_fwhms), - ] - else: - ax1.set_ylim( - [ - 0.9 - * np.nanmin( - fwhm_linear.func( - fwhm_slope_bins, *ecal_class.fwhm_fit_linear["parameters"] - ) - ), - 1.1 - * np.nanmax( - fwhm_linear.func( - fwhm_slope_bins, *ecal_class.fwhm_fit_linear["parameters"] - ) - ), - ] - ) - ax1.set_xlim(erange) - ax1.set_ylabel("FWHM energy resolution (keV)") - ax2.plot( - fwhm_peaks, - ( - fit_fwhms - - fwhm_linear.func( - fwhm_peaks, *ecal_class.fwhm_fit_linear["parameters"] - ) - ) - / fit_dfwhms, - lw=0, - marker="x", - c="g", - ) - ax2.plot( - fwhm_peaks, - ( - fit_fwhms - - fwhm_quadratic.func( - fwhm_peaks, *ecal_class.fwhm_fit_quadratic["parameters"] - ) - ) - / fit_dfwhms, - lw=0, 
- marker="x", - c="b", - ) - ax2.plot(erange, [0, 0], color="black", lw=0.5) - ax2.set_xlabel("Energy (keV)") - ax2.set_ylabel("Normalised Residuals") - plt.tight_layout() - plt.close() - return fig - - -def bin_spectrum( - ecal_class, - data, - cut_field="is_valid_cal", - pulser_field="is_pulser", - erange=[0, 3000], - dx=2, -): - bins = np.arange(erange[0], erange[1] + dx, dx) - return { - "bins": pgh.get_bin_centers(bins), - "counts": np.histogram( - data.query(ecal_class.selection_string)[ecal_class.cal_energy_param], bins - )[0], - "cut_counts": np.histogram( - data.query(f"(~{cut_field})&(~{pulser_field})")[ - ecal_class.cal_energy_param - ], - bins, - )[0], - "pulser_counts": np.histogram( - data.query(pulser_field)[ecal_class.cal_energy_param], - bins, - )[0], - } - - -def bin_survival_fraction( - ecal_class, - data, - cut_field="is_valid_cal", - pulser_field="is_pulser", - erange=[0, 3000], - dx=6, -): - counts_pass, bins_pass, _ = pgh.get_hist( - data.query(ecal_class.selection_string)[ecal_class.cal_energy_param], - bins=np.arange(erange[0], erange[1] + dx, dx), - ) - counts_fail, bins_fail, _ = pgh.get_hist( - data.query(f"(~{cut_field})&(~{pulser_field})")[ecal_class.cal_energy_param], - bins=np.arange(erange[0], erange[1] + dx, dx), - ) - sf = 100 * (counts_pass + 10 ** (-6)) / (counts_pass + counts_fail + 10 ** (-6)) - return {"bins": pgh.get_bin_centers(bins_pass), "sf": sf} diff --git a/src/pygama/pargen/energy_cal.py b/src/pygama/pargen/energy_cal.py index a9de8e6d3..c6c5daae4 100644 --- a/src/pygama/pargen/energy_cal.py +++ b/src/pygama/pargen/energy_cal.py @@ -1,197 +1,1709 @@ """routines for automatic calibration. -- hpge_find_E_peaks (Find uncalibrated E peaks whose E spacing matches the pattern in peaks_keV) -- hpge_get_E_peaks (Get uncalibrated E peaks at the energies of peaks_keV) -- hpge_fit_E_peaks (fits the energy peals) +- hpge_find_energy_peaks (Find uncalibrated E peaks whose E spacing matches the pattern in peaks_kev) +- hpge_get_energy_peaks (Get uncalibrated E peaks at the energies of peaks_kev) +- hpge_fit_energy_peaks (fits the energy peals) - hpge_E_calibration (main routine -- finds and fits peaks specified) """ + +from __future__ import annotations + +import inspect import logging -import sys +import string -import matplotlib.gridspec as gs import matplotlib.pyplot as plt import numpy as np import scipy.stats from iminuit import Minuit, cost -from scipy.signal import find_peaks_cwt, medfilt +from iminuit.util import ValueView +from numpy.polynomial.polynomial import Polynomial +from scipy.stats import chi2 +import pygama.math.binned_fitting as pgb +import pygama.math.distributions as pgf import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -import pygama.math.utils as pgu -from pygama.pargen.utils import return_nans +from pygama.math.histogram import get_i_local_maxima +from pygama.math.least_squares import fit_simple_scaling +from pygama.pargen.utils import convert_to_minuit, return_nans log = logging.getLogger(__name__) -def hpge_find_E_peaks( - hist, - bins, - var, - peaks_keV, - n_sigma=5, - deg=0, - Etol_keV=None, - var_zero=1, - verbose=False, -): - """Find uncalibrated E peaks whose E spacing matches the pattern in peaks_keV - Note: the specialization here to units "keV" in peaks and Etol is - unnecessary. However it is kept so that the default value for Etol_keV has - an unambiguous interpretation. +class HPGeCalibration: + """ + Calibrate HPGe data to a set of known peaks. 
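
# --- illustrative sketch, not part of this patch -----------------------------
# The survival-fraction helper above is the ratio of passing to total counts per
# energy bin, with a small regulariser to avoid division by zero.  Stand-alone
# version on fake data; the cut efficiency and energies are invented examples.
import numpy as np

rng = np.random.default_rng(1)
energy = rng.uniform(0, 3000, 20000)         # fake calibrated energies (keV)
passes_cut = rng.random(energy.size) > 0.1   # fake ~90 % efficient cut

bins = np.arange(0, 3000 + 6, 6)
counts_pass, _ = np.histogram(energy[passes_cut], bins=bins)
counts_fail, _ = np.histogram(energy[~passes_cut], bins=bins)
sf = 100 * (counts_pass + 1e-6) / (counts_pass + counts_fail + 1e-6)
centers = (bins[:-1] + bins[1:]) / 2
print(centers[:3], sf[:3])
# ------------------------------------------------------------------------------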
Class stores the calibration parameters + as well as the peaks locations and energies used. Each function called updates a results + dictionary with any additional information which is stored in the class. Parameters ---------- - hist, bins, var : array, array, array - Histogram of uncalibrated energies, see pgh.get_hist() - var cannot contain any zero entries. - peaks_keV : array - Energies of peaks to search for (in keV) - n_sigma : float - Threshold for detecting a peak in sigma (i.e. sqrt(var)) - deg : int - deg arg to pass to poly_match - Etol_keV : float - absolute tolerance in energy for matching peaks - var_zero : float - number used to replace zeros of var to avoid divide-by-zero in - hist/sqrt(var). Default value is 1. Usually when var = 0 its because - hist = 0, and any value here is fine. - verbose : bool - print debug messages + e_uncal : array + uncalibrated energy data + glines : array + list of peak energies to be fit to. Each must be in the data + guess_kev : float + a rough initial guess at the conversion factor from e_uncal to kev. Must + be positive + deg : non-negative int + degree of the polynomial for the E_cal function E_kev = poly(e_uncal). + deg = 0 corresponds to a simple scaling E_kev = scale * e_uncal. + Otherwise follows the convention in np.polynomial.polynomial of + lowest order to highest order + uncal_is_int : bool + if True, attempts will be made to avoid picket-fencing when binning + e_uncal + fixed : dict + dictionary of fixed parameters for the calibration function - Returns - ------- - detected_peak_locations : list - list of uncalibrated energies of detected peaks - detected_peak_energies : list - list of calibrated energies of detected peaks - pars : list of floats - the parameters for poly(peaks_uncal) = peaks_keV (polyfit convention) """ - # clean up var if necessary - if np.any(var == 0): - log.debug(f"hpge_find_E_peaks: replacing var zeros with {var_zero}") - var[np.where(var == 0)] = var_zero - peaks_keV = np.asarray(peaks_keV) - - # Find all maxes with > n_sigma significance - imaxes = get_i_local_maxima(hist / np.sqrt(var), n_sigma) - - # Now pattern match to peaks_keV within Etol_keV using poly_match - detected_max_locs = pgh.get_bin_centers(bins)[imaxes] - - if Etol_keV is None: - # estimate Etol_keV - pt_pars, pt_covs = hpge_fit_E_peak_tops( - hist, bins, var, detected_max_locs, n_to_fit=15 + + def __init__( + self, + energy_param, + glines, + guess_kev: float, + deg: int = 1, + uncal_is_int: bool = False, + fixed=None, + debug_mode: bool = False, + ): + self.energy_param = energy_param + + if deg < -1: + log.error(f"hpge_E_cal warning: invalid deg = {deg}") + return + self.deg = int(deg) + + self.peaks_kev = np.asarray(sorted(glines)) + self.peak_locs = [] + + if guess_kev <= 0: + log.error(f"hpge_E_cal warning: invalid guess_kev = {guess_kev}") + if deg == -1: + self.pars = np.zeros(2, dtype=float) + self.pars[0] = guess_kev + self.fixed = {1: 1} + elif deg == 0: + self.pars = np.zeros(2, dtype=float) + self.pars[1] = guess_kev + self.fixed = {0: 0} + else: + self.pars = np.zeros(self.deg + 1, dtype=float) + self.pars[1] = guess_kev + self.fixed = fixed + self.results = {} + + self.uncal_is_int = uncal_is_int + self.debug_mode = debug_mode + + def gen_pars_dict(self): + """ + Generate a dictionary containing the expression and parameters used for energy calibration. + + Returns: + dict: A dictionary with keys 'expression' and 'parameters'. + 'expression' is a string representing the energy calibration expression. 
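
# --- illustrative sketch, not part of this patch -----------------------------
# The calibration coefficients above follow the numpy.polynomial convention,
# lowest order first: for deg = 1, pars = [offset, slope] and
# E_keV = pars[0] + pars[1] * e_uncal.  The guess value below is invented.
import numpy as np
from numpy.polynomial.polynomial import Polynomial

guess_kev = 0.25                 # hypothetical rough keV-per-ADC guess
pars = np.zeros(2)               # deg = 1 -> two coefficients, lowest order first
pars[1] = guess_kev              # initial slope; offset starts at zero

e_uncal = np.array([2332.8, 10458.0])   # example uncalibrated energies (ADC)
print(Polynomial(pars)(e_uncal))        # initial guess of the energies in keV

# deg = 0 corresponds to a pure scaling: the class keeps the offset pinned to
# zero by listing coefficient index 0 in its `fixed` dictionary.
# ------------------------------------------------------------------------------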
+ 'parameters' is a dictionary containing the parameter values used in the expression. + """ + expression = "" + parameters = {} + for i, coeff in enumerate(self.pars): + parameter_name = string.ascii_lowercase[i] + if i == 0: + expression += f"{parameter_name}" + elif i == 1: + expression += f" + {parameter_name} * {self.energy_param}" + else: + expression += f" + {parameter_name} * {self.energy_param}**{i} " + parameters[parameter_name] = coeff + return {"expression": expression, "parameters": parameters} + + def update_results_dict(self, results_dict): + name = inspect.stack()[1][3] + if name in self.results: + it = 0 + for n in self.results: + if name in n: + if name == n: + pass + else: + new_it = int(n.split("_")[-1]) + if new_it > it: + it = new_it + it += 1 + name += f"_{it}" + self.results[name] = results_dict + + def hpge_find_energy_peaks( + self, + e_uncal, + peaks_kev=None, + n_sigma=5, + etol_kev=None, + bin_width_kev=1, + erange=None, + var_zero=1, + update_cal_pars=True, + ): + """ + Find uncalibrated energy peaks whose energy spacing matches the pattern in peaks_kev. + + Parameters + ---------- + e_uncal (array-like): + Uncalibrated energy values. + peaks_kev (array-like, optional): + Pattern of energy peaks to match. If not provided, the peaks from the object's attribute `peaks_kev` will be used. + n_sigma (float, optional): + Number of standard deviations above the mean to consider a peak significant. Default is 5. + etol_kev (float, optional): + Tolerance in energy units for matching peaks. If not provided, it will be estimated based on the peak widths. + bin_width_kev (float, optional): + Width of the energy bins for initial peak search. Default is 1 keV. + erange (tuple, optional): + Range of uncalibrated energy values to consider. If not provided, the range will be determined based on the peaks. + var_zero (float, optional): + Value to replace zero variance with. Default is 1. + update_cal_pars (bool, optional): + Whether to update the calibration parameters. Default is True. 
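
# --- illustrative sketch, not part of this patch -----------------------------
# gen_pars_dict above returns an expression string plus a parameter dictionary.
# For a linear calibration of a column called "trapEmax" (an invented name) the
# output has roughly this shape, and can be applied with plain numpy/eval for
# illustration; the coefficient values are invented.
import numpy as np

hit_dict = {
    "expression": "a + b * trapEmax",
    "parameters": {"a": 0.12, "b": 0.2511},
}

trapEmax = np.array([2332.8, 10458.0])   # example uncalibrated energies (ADC)
cal_energy = eval(
    hit_dict["expression"], {"trapEmax": trapEmax, **hit_dict["parameters"]}
)
print(cal_energy)                        # calibrated energies in keV
# ------------------------------------------------------------------------------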
+ + Returns: + None + + """ + + if peaks_kev is None: + peaks_kev = self.peaks_kev + + peaks_adc = [(Polynomial(self.pars) - ei).roots() for ei in peaks_kev] + + # bin the histogram in ~1 kev bins for the initial rough peak search + if erange is None: + euc_min = np.nanmin(peaks_adc) * 0.6 + euc_max = np.nanmax(peaks_adc) * 1.1 + + else: + euc_min, euc_max = erange + d_euc = bin_width_kev / self.pars[1] + if self.uncal_is_int: + euc_min, euc_max, d_euc = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, dx=d_euc + ) + hist, bins, var = pgh.get_hist(e_uncal, range=(euc_min, euc_max), dx=d_euc) + + # clean up var if necessary + if np.any(var == 0): + log.debug(f"hpge_find_energy_peaks: replacing var zeros with {var_zero}") + var[np.where(var == 0)] = var_zero + peaks_kev = np.asarray(peaks_kev) + + # Find all maxes with > n_sigma significance + imaxes = get_i_local_maxima(hist / np.sqrt(var), n_sigma) + + # Now pattern match to peaks_kev within etol_kev using poly_match + detected_max_locs = pgh.get_bin_centers(bins)[imaxes] + + if etol_kev is None: + # estimate etol_kev + pt_pars, pt_covs = hpge_fit_energy_peak_tops( + hist, bins, var, detected_max_locs, n_to_fit=15 + ) + if ( + sum(np.sum(c.flatten()) if c.ndim != 0 else 0 for c in pt_covs) + == np.inf + or sum(np.sum(c.flatten()) if c.ndim != 0 else 0 for c in pt_covs) == 0 + ): + log.debug( + "hpge_find_energy_peaks: can safely ignore previous covariance warning, not used" + ) + pt_pars = pt_pars[np.array([x is not None for x in pt_pars])] + med_sigma_ratio = np.median( + np.stack(pt_pars)[:, 1] / np.stack(pt_pars)[:, 0] + ) + + etol_kev = 5.0 * (med_sigma_ratio / 0.003) + pars, ixtup, iytup = poly_match( + detected_max_locs, peaks_kev, deg=self.deg, atol=etol_kev, fixed=self.fixed + ) + + if len(ixtup) != len(peaks_kev): + log.info( + f"hpge_find_energy_peaks: only found {len(ixtup)} of {len(peaks_kev)} expected peaks" + ) + + self.update_results_dict( + { + "input_peaks_kev": peaks_kev, + "found_peaks_kev": peaks_kev[iytup], + "found_peaks_locs": detected_max_locs[iytup], + } + ) + log.info(f"{len(peaks_kev[iytup])} peaks found:") + log.info("\t Energy | Position ") + for i, (li, ei) in enumerate(zip(detected_max_locs[ixtup], peaks_kev[iytup])): + log.info(f"\t{i}".ljust(4) + str(ei).ljust(9) + f"| {li:g}".ljust(5)) + + if update_cal_pars is False: + return + + self.peak_locs = detected_max_locs[ixtup] + self.peaks_kev = peaks_kev[iytup] + self.pars = np.array(pars) + + def hpge_get_energy_peaks( + self, + e_uncal, + peaks_kev=None, + n_sigma=3, + etol_kev=5, + var_zero=1, + bin_width_kev=0.2, + update_cal_pars=True, + erange=None, + ): + """Get uncalibrated E peaks at the energies of peaks_kev + + Parameters + ---------- + e_uncal : array + Uncalibrated energy values. + peaks_kev : array, optional + Energies of peaks to search for (in keV). If not provided, the peaks_kev + attribute of the object will be used. + n_sigma : float, optional + Threshold for detecting a peak in sigma (i.e. sqrt(var)). Default is 3. + etol_kev : float, optional + Absolute tolerance in energy for matching peaks. Default is 5. + var_zero : float, optional + Number used to replace zeros of var to avoid divide-by-zero in hist/sqrt(var). + Default is 1. Usually when var = 0, it's because hist = 0, and any value here is fine. + bin_width_kev : float, optional + Width of the energy bins for re-binning the histogram. Default is 0.2 keV. + update_cal_pars : bool, optional + Flag indicating whether to update the calibration parameters. Default is True. 
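
# --- illustrative sketch, not part of this patch -----------------------------
# The peak search above histograms the uncalibrated energies in ~1 keV-wide bins
# and keeps local maxima whose significance hist/sqrt(var) exceeds n_sigma.
# Stand-alone analogue using scipy.signal.find_peaks in place of pygama's
# get_i_local_maxima; the data and the 0.25 keV/ADC guess are invented, and the
# later pattern-matching step is what picks out the physical lines.
import numpy as np
from scipy.signal import find_peaks

rng = np.random.default_rng(2)
e_uncal = np.concatenate([
    rng.exponential(4000, 200_000),           # fake continuum (ADC)
    rng.normal(10458, 5, 5_000),              # fake peak near 2614.5 keV
])

guess_kev_per_adc = 0.25
d_euc = 1.0 / guess_kev_per_adc               # ~1 keV-wide bins in ADC units
hist, edges = np.histogram(e_uncal, bins=np.arange(0, 12_000, d_euc))

var = hist.astype(float)
var[var == 0] = 1                             # avoid divide-by-zero
significance = hist / np.sqrt(var)

imax, _ = find_peaks(significance, height=5)  # n_sigma = 5
centers = (edges[:-1] + edges[1:]) / 2
print(centers[imax][-3:])                     # candidate peak locations (ADC)
# ------------------------------------------------------------------------------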
+ erange : tuple, optional + Range of energy values to consider for peak search. If not provided, the range + will be determined automatically based on the peaks_kev values. + + Returns + ------- + None + + Notes + ----- + This method performs the following steps: + 1. Re-bins the histogram in ~0.2 keV bins with updated energy scale parameters for peak-top fits. + 2. Finds all local maxima in the histogram with significance greater than n_sigma. + 3. Matches the calculated peak energies with the expected peak energies. + 4. Removes duplicate peak matches. + 5. Updates the input peaks, got peaks, and got peak locations in the results dictionary. + 6. If update_cal_pars is True, calculates the updated calibration curve using the matched peak energies. + + """ + if peaks_kev is None: + peaks_kev = self.peaks_kev + + peaks_kev = np.asarray(peaks_kev) + + # re-bin the histogram in ~0.2 kev bins with updated E scale par for peak-top fits + if erange is None: + euc_min, euc_max = ( + (Polynomial(self.pars) - i).roots() + for i in (peaks_kev[0] * 0.9, peaks_kev[-1] * 1.1) + ) + euc_min = euc_min[0] + euc_max = euc_max[0] + if euc_min < 0: + euc_min = 0 + if euc_max > np.nanmax(e_uncal) * 1.1: + euc_max = np.nanmax(e_uncal) * 1.1 + else: + euc_min, euc_max = erange + + d_euc = bin_width_kev / self.pars[1] + + if self.uncal_is_int: + euc_min, euc_max, d_euc = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, dx=d_euc + ) + hist, bins, var = pgh.get_hist(e_uncal, range=(euc_min, euc_max), dx=d_euc) + + # clean up var if necessary + if np.any(var == 0): + log.debug(f"hpge_find_energy_peaks: replacing var zeros with {var_zero}") + var[np.where(var == 0)] = var_zero + + # Find all maxes with > n_sigma significance + imaxes = get_i_local_maxima(hist / np.sqrt(var), n_sigma) + + # Keep maxes if they coincide with expected peaks + test_peaks_kev = np.asarray([pgf.nb_poly(i, self.pars) for i in bins[imaxes]]) + imatch = [abs(peaks_kev - i).min() < etol_kev for i in test_peaks_kev] + + got_peak_locations = bins[imaxes[imatch]] + got_peak_energies = test_peaks_kev[imatch] + + # Match calculated and true peak energies + matched_energies = peaks_kev[ + [np.argmin(abs(peaks_kev - i)) for i in got_peak_energies] + ] + while not all([list(matched_energies).count(x) == 1 for x in matched_energies]): + for i in range(len(matched_energies)): + if matched_energies[i + 1] == matched_energies[i]: + # remove duplicates + if np.argmin( + abs(got_peak_energies[i : i + 2] - matched_energies[i]) + ): # i+1 is best match + got_peak_locations = np.delete(got_peak_locations, i) + got_peak_energies = np.delete(got_peak_energies, i) + else: # i is best match + got_peak_locations = np.delete(got_peak_locations, i + 1) + got_peak_energies = np.delete(got_peak_energies, i + 1) + matched_energies = np.delete(matched_energies, i) + break + i += 1 + + input_peaks = peaks_kev.copy() + + self.update_results_dict( + { + "input_peaks_kev": input_peaks, + "got_peaks_kev": matched_energies, + "got_peaks_locs": got_peak_locations, + } + ) + + if update_cal_pars is False: + return + + self.peak_locs = got_peak_locations + self.peaks_kev = matched_energies + + # Calculate updated calibration curve + if self.deg == 0: + scale, _ = fit_simple_scaling(got_peak_locations, matched_energies) + poly_pars = np.array([0, scale]) + else: + # Calculate updated calibration curve + poly_pars = ( + Polynomial.fit(got_peak_locations, matched_energies, len(self.pars) - 1) + .convert() + .coef + ) + c = cost.LeastSquares( + matched_energies, + 
got_peak_locations, + np.full_like(got_peak_locations, 1), + poly_wrapper, + ) + if self.fixed is not None: + for idx, val in self.fixed.items(): + if val is True or val is None: + pass + else: + poly_pars[idx] = val + m = Minuit(c, *poly_pars) + if self.fixed is not None: + for idx in list(self.fixed): + m.fixed[idx] = True + + self.pars = np.array(m.values) + + log.info(f"{len(self.peak_locs)} peaks obtained:") + log.info("\t Energy | Position ") + for i, (li, ei) in enumerate(zip(self.peak_locs, self.peaks_kev)): + log.info(f"\t{i}".ljust(4) + str(ei).ljust(9) + f"| {li:g}".ljust(5)) + + def hpge_cal_energy_peak_tops( + self, + e_uncal, + n_sigmas=1.2, + peaks_kev=None, + default_n_bins=50, + n_events=None, + allowed_p_val=0.01, + update_cal_pars=True, + ): + """ + Perform energy calibration for HPGe detector using peak fitting. + + Args: + e_uncal (array-like): + Uncalibrated energy values. + n_sigmas (float, optional): + Number of standard deviations to use for peak fitting range. Defaults to 1.2. + peaks_kev (array-like, optional): + Known peak positions in keV. If not provided, uses self.peaks_kev. Defaults to None. + default_n_bins (int, optional): + Number of bins for histogram. Defaults to 50. + n_events (int, optional): + Number of events to consider for calibration. Defaults to None which uses all events. + allowed_p_val (float, optional): + Maximum p-value for a fit to be considered valid. Defaults to 0.05. + update_cal_pars (bool, optional): + Whether to update the calibration parameters. Defaults to True. + """ + + results_dict = {} + + # check no peaks in self.peaks_kev missing from peak_pars + if peaks_kev is None: + peaks_kev = self.peaks_kev + + peak_pars = [(peak, None, pgf.gauss_on_uniform) for peak in peaks_kev] + + # convert peak pars to array of tuples + tmp = np.empty(len(peak_pars), dtype=object) + tmp[:] = peak_pars + peak_pars = tmp + + peak_pars_lines = [i[0] for i in peak_pars] + peaks_mask = np.array( + [True if peak in peaks_kev else False for peak in peak_pars_lines], + dtype=bool, + ) + peak_pars = peak_pars[peaks_mask] + + fit_peaks_mask = np.array( + [True for i in peak_pars if i[2] is not None], + dtype=bool, + ) + peak_pars = peak_pars[fit_peaks_mask] + + # First calculate range around peaks to fit + + euc_min, euc_max = ( + (Polynomial(self.pars) - i).roots() + for i in (peaks_kev[0] * 0.9, peaks_kev[-1] * 1.1) + ) + euc_min = euc_min[0] + euc_max = euc_max[0] + + if euc_min < 0: + euc_min = 0 + if euc_max > np.nanmax(e_uncal) * 1.1: + euc_max = np.nanmax(e_uncal) * 1.1 + + d_euc = 0.5 / self.pars[1] + if self.uncal_is_int: + euc_min, euc_max, d_euc = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, dx=d_euc + ) + + hist, bins, var = pgh.get_hist(e_uncal, range=(euc_min, euc_max), dx=d_euc) + + uncal_peak_pars = [] + for pars in peak_pars: + peak, fit_range, func = pars + + if peak in self.peaks_kev: + loc = self.peak_locs[np.where(peak == self.peaks_kev)][0] + else: + loc = (Polynomial(self.pars) - peak).roots()[0] + + # Need to do initial fit + pt_pars, _ = hpge_fit_energy_peak_tops(hist, bins, var, [loc], n_to_fit=7) + # Drop failed fits + if pt_pars[0] is not None: + range_uncal = (float(pt_pars[0][1]) * 20, float(pt_pars[0][1]) * 20) + n_bins = default_n_bins + else: + range_uncal = None + if range_uncal is not None: + uncal_peak_pars.append((peak, loc, range_uncal, n_bins, func)) + + fit_dict = {} + + for i_peak, uncal_peak_par in enumerate(uncal_peak_pars): + try: + peak_kev, mode_guess, wwidth_i, n_bins_i, func_i = uncal_peak_par + 
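
# --- illustrative sketch, not part of this patch -----------------------------
# The calibration-curve update above seeds a numpy Polynomial.fit and then
# refines it with an iminuit least-squares fit, optionally pinning coefficients
# through the `fixed` dictionary.  Minimal stand-alone analogue fitting peak
# locations (ADC) against peak energies (keV); all positions and uncertainties
# below are invented, and iminuit must be installed.
import numpy as np
from iminuit import Minuit, cost

def poly1(x, a, b):
    return a + b * x

peaks_kev = np.array([583.2, 727.3, 860.6, 1592.5, 2103.5, 2614.5])
locs_adc = np.array([2333.0, 2910.1, 3443.5, 6371.2, 8415.3, 10459.8])
loc_err = np.full_like(locs_adc, 0.5)

c = cost.LeastSquares(peaks_kev, locs_adc, loc_err, poly1)
m = Minuit(c, a=0.0, b=4.0)
m.fixed["a"] = True      # pin the offset, analogous to the class's `fixed` dict
m.migrad()
print(m.values.to_dict(), m.errors.to_dict())
# ------------------------------------------------------------------------------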
wleft_i, wright_i = wwidth_i + euc_min = mode_guess - wleft_i + euc_max = mode_guess + wright_i + + if self.uncal_is_int is True: + euc_min, euc_max, n_bins_i = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, n_bins=n_bins_i + ) + + energies = e_uncal[(e_uncal > euc_min) & (e_uncal < euc_max)][:n_events] + binw_1 = (euc_max - euc_min) / n_bins_i + + x0 = get_hpge_energy_peak_par_guess( + energies, + func_i, + (euc_min, euc_max), + bin_width=binw_1, + mode_guess=mode_guess, + ) + + euc_min = x0["mu"] - n_sigmas * x0["sigma"] + euc_max = x0["mu"] + n_sigmas * x0["sigma"] + + bin_width = (x0["sigma"]) * len(energies) ** (-1 / 3) + n_bins_i = int((euc_max - euc_min) / bin_width) + + if self.uncal_is_int is True: + euc_min, euc_max, n_bins_i = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, n_bins=n_bins_i + ) + + hist, bins, var = pgh.get_hist( + energies, bins=n_bins_i, range=(euc_min, euc_max) + ) + + x0["x_lo"] = euc_min + x0["x_hi"] = euc_max + + fixed, mask = get_hpge_energy_fixed(func_i) + fixed.append("n_bkg") + mask[np.where(np.array(func_i.required_args()) == "n_bkg")[0]] = True + bounds = get_hpge_energy_bounds(func_i, x0) + + pars_i, errs_i, cov_i = pgb.fit_binned( + func_i.cdf_ext, + hist, + bins, + var=var, + guess=x0, + cost_func="LL", + Extended=True, + fixed=fixed, + bounds=bounds, + ) + valid_fit = True + + csqr = pgb.goodness_of_fit( + hist, + bins, + None, + func_i.get_pdf, + pars_i, + method="Pearson", + scale_bins=True, + ) + csqr = (csqr[0], csqr[1] + len(np.where(mask)[0])) + + if np.isnan(pars_i).any(): + log.debug( + f"hpge_cal_energy_peak_tops: fit failed for i_peak={i_peak} at loc {mode_guess:g}, par is nan : {pars_i}" + ) + raise RuntimeError + + p_val = scipy.stats.chi2.sf(csqr[0], csqr[1]) + + if ( + cov_i is None + or cov_i.ndim == 0 + or sum(sum(c) for c in cov_i[mask, :][:, mask]) == np.inf + or sum(sum(c) for c in cov_i[mask, :][:, mask]) == 0 + or np.isnan(sum(sum(c) for c in cov_i[mask, :][:, mask])) + ): + log.debug( + f"hpge_cal_energy_peak_tops: cov estimation failed for i_peak={i_peak} at loc {mode_guess:g}" + ) + valid_pk = False + + elif valid_fit is False: + log.debug( + f"hpge_cal_energy_peak_tops: peak fitting failed for i_peak={i_peak} at loc {mode_guess:g}" + ) + valid_pk = False + + elif ( + errs_i is None + or pars_i is None + or np.abs(np.array(errs_i)[mask] / np.array(pars_i)[mask]) < 1e-7 + ).any() or np.isnan(np.array(errs_i)[mask]).any(): + log.debug( + f"hpge_cal_energy_peak_tops: failed for i_peak={i_peak} at loc {mode_guess:g}, parameter error too low" + ) + valid_pk = False + + elif p_val < allowed_p_val or np.isnan(p_val): + log.debug( + f"hpge_cal_energy_peak_tops: fit failed for i_peak={i_peak}, p-value too low: {p_val}" + ) + valid_pk = False + else: + valid_pk = True + + mu, mu_err = func_i.get_mu(pars_i, errors=errs_i) + + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + log.debug( + f"hpge_fit_energy_peaks: fit failed for i_peak={i_peak}, unknown error" + ) + valid_pk = False + pars_i, errs_i, cov_i = return_nans(func_i) + p_val = 0 + mu = np.nan + mu_err = np.nan + + fit_dict[peak_kev] = { + "function": func_i, + "validity": valid_pk, + "parameters": pars_i, + "uncertainties": errs_i, + "covariance": cov_i, + "nbins": binw_1, + "range": [euc_min, euc_max], + "p_value": p_val, + "position": mu, + "position_uncertainty": mu_err, + } + + results_dict["peak_parameters"] = fit_dict + + fitted_peaks_kev = np.array( + [peak for peak in fit_dict if 
fit_dict[peak]["validity"]] + ) + + log.info(f"{len(fitted_peaks_kev)} peaks fitted:") + for peak, peak_dict in fit_dict.items(): + if peak_dict["validity"] is True: + varnames = peak_dict["function"].required_args() + pars = np.asarray(peak_dict["parameters"], dtype=float) + errors = np.asarray(peak_dict["uncertainties"], dtype=float) + log.info(f"\tEnergy: {str(peak)}") + log.info("\t\tParameter | Value +/- Sigma ") + for vari, pari, errorsi in zip(varnames, pars, errors): + log.info( + f'\t\t{str(vari).ljust(10)} | {("%4.2f" % pari).rjust(8)} +/- {("%4.2f" % errorsi).ljust(8)}' + ) + + if len(fitted_peaks_kev) == 0: + log.error("hpge_fit_energy_peaks: no peaks fitted") + self.update_results_dict(results_dict) + return + + mus = [ + fit_dict[peak]["position"] + for peak in fit_dict + if fit_dict[peak]["validity"] + ] + mu_vars = [ + fit_dict[peak]["position_uncertainty"] + for peak in fit_dict + if fit_dict[peak]["validity"] + ] + + mus = results_dict["pk_pos"] = np.asarray(mus) + mu_vars = results_dict["pk_pos_uncertainties"] = np.asarray(mu_vars) ** 2 + + if update_cal_pars is False: + self.update_results_dict(results_dict) + return + + self.peaks_kev = np.asarray(fitted_peaks_kev) + self.peak_locs = np.asarray(mus) + + # Now fit the E scale + try: + pars, errs, cov = hpge_fit_energy_scale( + mus, mu_vars, fitted_peaks_kev, deg=self.deg, fixed=self.fixed + ) + + results_dict["pk_cal_pars"] = pars + results_dict["pk_cal_errs"] = errs + results_dict["pk_cal_cov"] = cov + + # Invert the E scale fit to get a calibration function + pars, errs, cov = hpge_fit_energy_cal_func( + mus, + mu_vars, + fitted_peaks_kev, + pars, + deg=self.deg, + fixed=self.fixed, + ) + self.pars = np.array(pars) + + except ValueError: + log.error("Failed to fit enough peaks to get accurate calibration") + + self.update_results_dict(results_dict) + + def hpge_fit_energy_peaks( + self, + e_uncal, + peak_pars=None, + peaks_kev=None, + bin_width_kev=1, + peak_param="mode", + method="unbinned", + n_events=None, + allowed_p_val=0.01, + tail_weight=0, + update_cal_pars=True, + use_bin_width_in_fit=True, + ): + """ + Fit the energy peaks specified using the given function. + + Parameters + ---------- + e_uncal : array + Unbinned energy data to be fit. + peaks_kev : array, optional + Array of energy values for the peaks to fit. If not provided, it uses the peaks_kev attribute of the class. + peak_pars : list of tuples, optional + List containing tuples of the form (peak, range, func) where peak is the energy of the peak to fit, + range is the range in keV to fit, and func is the function to fit. + bin_width_kev : int, optional + Default binwidth to use for the fit window histogramming. Default is 1 keV. + peak_param : str, optional + Parameter to use for peak fitting. Default is "mode". + method : str, optional + Method to use for fitting. Default is "unbinned". Can specify to use binned fit method instead. + n_events : int, optional + Number of events to use for unbinned fit. + allowed_p_val : float, optional + Lower limit on p-value of fit. + tail_weight : int, optional + Weight to apply to the tail of the fit. + update_cal_pars : bool, optional + Whether to update the calibration parameters. Default is True. + + Returns + ------- + results_dict : dict + Dictionary containing the fit results for each peak. + + Raises + ------ + RuntimeError + If the fit fails. + + Notes + ----- + This function fits the energy peaks specified using the given function. 
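
# --- illustrative sketch, not part of this patch -----------------------------
# The default method="unbinned" described above performs an extended unbinned
# likelihood fit in a window around each peak.  A minimal stand-alone analogue
# with iminuit, fitting a Gaussian on a flat background rather than pygama's
# gauss_on_step/hpge_peak shapes; the data and window are invented.
import numpy as np
from iminuit import Minuit, cost
from scipy.stats import norm

rng = np.random.default_rng(3)
energies = np.concatenate([
    rng.normal(10459.0, 4.0, 2000),          # fake peak (ADC)
    rng.uniform(10400.0, 10520.0, 500),      # fake flat background in the window
])
x_lo, x_hi = 10400.0, 10520.0

def scaled_density(x, n_sig, mu, sigma, n_bkg):
    # ExtendedUnbinnedNLL expects (expected total count, density)
    sig = n_sig * norm.pdf(x, mu, sigma)
    bkg = n_bkg / (x_hi - x_lo)
    return n_sig + n_bkg, sig + bkg

c = cost.ExtendedUnbinnedNLL(energies, scaled_density)
m = Minuit(c, n_sig=len(energies), mu=np.median(energies), sigma=5.0, n_bkg=100.0)
m.limits["n_sig"] = (0, None)
m.limits["sigma"] = (0, None)
m.limits["n_bkg"] = (0, None)
m.migrad()
print(m.values["mu"], m.errors["mu"])        # fitted peak position and uncertainty
# ------------------------------------------------------------------------------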
It calculates the range around each peak to fit, + performs the fitting using either unbinned or binned method, and returns the fit results in a dictionary. + + """ + + results_dict = {} + # check no peaks in self.peaks_kev missing from peak_pars + + if peaks_kev is None: + peaks_kev = self.peaks_kev + + if peak_pars is None: + peak_pars = [(peak, None, pgf.gauss_on_step) for peak in peaks_kev] + + # convert peak pars to array of tuples + tmp = np.empty(len(peak_pars), dtype=object) + tmp[:] = peak_pars + peak_pars = tmp + + peak_pars_lines = [i[0] for i in peak_pars] + peaks_mask = np.array( + [True if peak in peaks_kev else False for peak in peak_pars_lines], + dtype=bool, + ) + peak_pars = peak_pars[peaks_mask] + + fit_peaks_mask = np.array( + [True for i in peak_pars if i[1] is not None and i[2] is not None], + dtype=bool, + ) + peak_pars = peak_pars[fit_peaks_mask] + + # First calculate range around peaks to fit + + uncal_peak_pars = [] + derco = Polynomial(self.pars).deriv().coef + for pars in peak_pars: + peak, fit_range, func = pars + + if peak in self.peaks_kev: + loc = self.peak_locs[np.where(peak == self.peaks_kev)][0] + else: + loc = (Polynomial(self.pars) - peak).roots()[0] + + if fit_range is None: + euc_min, euc_max = ( + (Polynomial(self.pars) - i).roots() + for i in (peaks_kev[0] * 0.9, peaks_kev[-1] * 1.1) + ) + euc_min = euc_min[0] + euc_max = euc_max[0] + if euc_min < 0: + euc_min = 0 + if euc_max > np.nanmax(e_uncal) * 1.1: + euc_max = np.nanmax(e_uncal) * 1.1 + d_euc = 0.5 / self.pars[1] + if self.uncal_is_int: + euc_min, euc_max, d_euc = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, dx=d_euc + ) + hist, bins, var = pgh.get_hist( + e_uncal, range=(euc_min, euc_max), dx=d_euc + ) + # Need to do initial fit + pt_pars, _ = hpge_fit_energy_peak_tops( + hist, bins, var, [loc], n_to_fit=7 + ) + # Drop failed fits + if pt_pars[0] is not None: + range_uncal = (float(pt_pars[0][1]) * 20, float(pt_pars[0][1]) * 20) + n_bins = int(range_uncal / bin_width_kev) + else: + range_uncal = None + elif isinstance(fit_range, tuple): + der = pgf.nb_poly(peak, derco) + range_uncal = (fit_range[0] / der, fit_range[1] / der) + n_bins = int(sum(fit_range) / (der * bin_width_kev)) + + if range_uncal is not None: + uncal_peak_pars.append((peak, loc, range_uncal, n_bins, func)) + + fit_dict = {} + + for i_peak, uncal_peak_par in enumerate(uncal_peak_pars): + peak_kev, mode_guess, wwidth_i, n_bins_i, func_i = uncal_peak_par + wleft_i, wright_i = wwidth_i + try: + euc_min = mode_guess - wleft_i + euc_max = mode_guess + wright_i + + if self.uncal_is_int is True: + euc_min, euc_max, n_bins_i = pgh.better_int_binning( + x_lo=euc_min, x_hi=euc_max, n_bins=n_bins_i + ) + energies = e_uncal[(e_uncal > euc_min) & (e_uncal < euc_max)][:n_events] + binw_1 = (euc_max - euc_min) / n_bins_i + if method == "unbinned": + ( + pars_i, + errs_i, + cov_i, + csqr_i, + func_i, + mask, + valid_fit, + _, + ) = unbinned_staged_energy_fit( + energies, + func=func_i, + fit_range=(euc_min, euc_max), + guess_func=get_hpge_energy_peak_par_guess, + bounds_func=get_hpge_energy_bounds, + fixed_func=get_hpge_energy_fixed, + allow_tail_drop=True, + tail_weight=tail_weight, + bin_width=binw_1 if use_bin_width_in_fit is True else None, + guess_kwargs={"mode_guess": mode_guess}, + ) + if pars_i["n_sig"] < 100: + valid_fit = False + csqr = csqr_i + + else: + hist, bins, var = pgh.get_hist( + energies, bins=n_bins_i, range=(euc_min, euc_max) + ) + binw_1 = (bins[-1] - bins[0]) / (len(bins) - 1) + par_guesses = 
get_hpge_energy_peak_par_guess( + hist, bins, var, func_i, mode_guess=mode_guess + ) + bounds = get_hpge_energy_bounds(func_i, par_guesses) + fixed, mask = get_hpge_energy_fixed(func_i) + + x0 = get_hpge_energy_peak_par_guess( + energies, func_i, (euc_min, euc_max), bin_width=binw_1 + ) + fixed, mask = get_hpge_energy_fixed(func_i) + bounds = get_hpge_energy_bounds(func_i, x0) + + pars_i, errs_i, cov_i = pgb.fit_binned( + func_i.get_pdf, + hist, + bins, + var=var, + guess=x0, + cost_func=method, + Extended=True, + fixed=fixed, + bounds=bounds, + ) + valid_fit = True + + csqr = pgb.goodness_of_fit( + hist, + bins, + None, + func_i.get_pdf, + pars_i, + method="Pearson", + scale_bins=True, + ) + csqr = (csqr[0], csqr[1] + len(np.where(mask)[0])) + + if np.isnan(pars_i).any(): + log.debug( + f"hpge_fit_energy_peaks: fit failed for i_peak={i_peak} at loc {mode_guess:g}, par is nan : {pars_i}" + ) + raise RuntimeError + + p_val = scipy.stats.chi2.sf(csqr[0], csqr[1]) + + total_events = func_i.get_total_events(pars_i, errors=errs_i) + if ( + cov_i is None + or cov_i.ndim == 0 + or sum(sum(c) for c in cov_i[mask, :][:, mask]) == np.inf + or sum(sum(c) for c in cov_i[mask, :][:, mask]) == 0 + or np.isnan(sum(sum(c) for c in cov_i[mask, :][:, mask])) + ): + log.debug( + f"hpge_fit_energy_peaks: cov estimation failed for i_peak={i_peak} at loc {mode_guess:g}" + ) + valid_pk = False + + elif valid_fit is False: + log.debug( + f"hpge_fit_energy_peaks: peak fitting failed for i_peak={i_peak} at loc {mode_guess:g}" + ) + valid_pk = False + + elif ( + errs_i is None + or pars_i is None + or np.abs(np.array(errs_i)[mask] / np.array(pars_i)[mask]) < 1e-7 + ).any() or np.isnan(np.array(errs_i)[mask]).any(): + log.debug( + f"hpge_fit_energy_peaks: failed for i_peak={i_peak} at loc {mode_guess:g}, parameter error too low" + ) + valid_pk = False + + elif np.abs(total_events[0] - len(energies)) / len(energies) > 0.1: + log.debug( + f"hpge_fit_energy_peaks: fit failed for i_peak={i_peak} at loc {mode_guess:g}, total_events is outside limit" + ) + valid_pk = False + + elif p_val < allowed_p_val or np.isnan(p_val): + log.debug( + f"hpge_fit_energy_peaks: fit failed for i_peak={i_peak}, p-value too low: {p_val}" + ) + valid_pk = False + else: + valid_pk = True + + if peak_param == "mu": + mu, mu_err = func_i.get_mu(pars_i, errors=errs_i) + + elif peak_param == "mode": + mu, mu_err = func_i.get_mode(pars_i, cov=cov_i) + else: + log.error( + f"hpge_fit_energy_peaks: mode {self.peak_param} not recognized" + ) + raise RuntimeError + + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + log.debug( + f"hpge_fit_energy_peaks: fit failed for i_peak={i_peak}, unknown error" + ) + valid_pk = False + pars_i, errs_i, cov_i = return_nans(func_i) + p_val = 0 + mu = np.nan + mu_err = np.nan + + fit_dict[peak_kev] = { + "function": func_i, + "validity": valid_pk, + "parameters": pars_i, + "uncertainties": errs_i, + "covariance": cov_i, + "nbins": binw_1, + "range": [euc_min, euc_max], + "p_value": p_val, + "position": mu, + "position_uncertainty": mu_err, + } + + results_dict["peak_parameters"] = fit_dict + + fitted_peaks_kev = np.array( + [peak for peak in fit_dict if fit_dict[peak]["validity"]] + ) + + log.info(f"{len(fitted_peaks_kev)} peaks fitted:") + for peak, peak_dict in fit_dict.items(): + if peak_dict["validity"] is True: + varnames = peak_dict["function"].required_args() + pars = np.asarray(peak_dict["parameters"], dtype=float) + errors = 
np.asarray(peak_dict["uncertainties"], dtype=float) + log.info(f"\tEnergy: {str(peak)}") + log.info("\t\tParameter | Value +/- Sigma ") + for vari, pari, errorsi in zip(varnames, pars, errors): + log.info( + f'\t\t{str(vari).ljust(10)} | {("%4.2f" % pari).rjust(8)} +/- {("%4.2f" % errorsi).ljust(8)}' + ) + + if len(fitted_peaks_kev) == 0: + log.error("hpge_fit_energy_peaks: no peaks fitted") + self.update_results_dict(results_dict) + return + + mus = [ + fit_dict[peak]["position"] + for peak in fit_dict + if fit_dict[peak]["validity"] + ] + mu_vars = [ + fit_dict[peak]["position_uncertainty"] + for peak in fit_dict + if fit_dict[peak]["validity"] + ] + + results_dict["peak_param"] = peak_param + mus = results_dict["pk_pos"] = np.asarray(mus) + mu_vars = results_dict["pk_pos_uncertainties"] = np.asarray(mu_vars) ** 2 + + if update_cal_pars is False: + self.update_results_dict(results_dict) + return + + self.peaks_kev = np.asarray(fitted_peaks_kev) + self.peak_locs = np.asarray(mus) + + # Now fit the E scale + try: + pars, errs, cov = hpge_fit_energy_scale( + mus, mu_vars, fitted_peaks_kev, deg=self.deg, fixed=self.fixed + ) + + results_dict["pk_cal_pars"] = pars + results_dict["pk_cal_errs"] = errs + results_dict["pk_cal_cov"] = cov + + # Invert the E scale fit to get a calibration function + pars, errs, cov = hpge_fit_energy_cal_func( + mus, + mu_vars, + fitted_peaks_kev, + pars, + deg=self.deg, + fixed=self.fixed, + ) + self.pars = np.array(pars) + + except ValueError: + log.error("Failed to fit enough peaks to get accurate calibration") + + self.update_results_dict(results_dict) + + def get_fwhms(self): + """ + Updates last results dictionary with fwhms in kev + """ + + peak_parameters = self.results[list(self.results)[-1]].get( + "peak_parameters", None + ) + + if peak_parameters is None: + log.error("No peak parameters found") + return + + cal_fwhms = [] + cal_fwhm_errs = [] + for peak, peak_dict in peak_parameters.items(): + # Calculate the uncalibrated fwhm + if peak_dict["validity"] is True: + uncal_fwhm, uncal_fwhm_err = peak_dict["function"].get_fwhm( + peak_dict["parameters"], + cov=peak_dict["covariance"], + ) + else: + uncal_fwhm, uncal_fwhm_err = (np.nan, np.nan) + + # Apply calibration + + derco = Polynomial(self.pars).deriv().coef + der = pgf.nb_poly(peak, derco) + cal_fwhm = uncal_fwhm * der + cal_fwhm_err = uncal_fwhm_err * der + + peak_dict.update({"fwhm_in_kev": cal_fwhm, "fwhm_err_in_kev": cal_fwhm_err}) + + if peak_dict["validity"] is True: + cal_fwhms.append(cal_fwhm) + cal_fwhm_errs.append(cal_fwhm_err) + + cal_fwhms = np.array(cal_fwhms) + cal_fwhm_errs = np.array(cal_fwhm_errs) + fitted_peaks_kev = np.array( + [ + peak + for peak, peak_dict in peak_parameters.items() + if peak_dict["validity"] + ] ) - if ( - sum(sum(sum(c) if (c != None).any() else 0 for c in pt_covs)) == np.inf - or sum(sum(sum(c) if (c != None).any() else 0 for c in pt_covs)) == 0 + + log.info(f"{len(cal_fwhms)} FWHMs found:") + log.info("\t Energy | FWHM ") + for i, (ei, fwhm, fwhme) in enumerate( + zip(fitted_peaks_kev, cal_fwhms, cal_fwhm_errs) ): - log.debug( - "hpge_find_E_peaks: can safely ignore previous covariance warning, not used" + log.info( + f"\t{i}".ljust(4) + + str(ei).ljust(9) + + f"| {fwhm:.2f}+-{fwhme:.2f} kev".ljust(5) ) - pt_pars = pt_pars[np.array([x is not None for x in pt_pars])] - med_sigma_ratio = np.median(np.stack(pt_pars)[:, 1] / np.stack(pt_pars)[:, 0]) - Etol_keV = 5.0 * (med_sigma_ratio / 0.003) - pars, ixtup, iytup = poly_match( - detected_max_locs, peaks_keV, 
deg=deg, atol=Etol_keV - ) + @staticmethod + def fit_energy_res_curve(fwhm_func, fwhm_peaks, fwhms, dfwhms): + try: + if len(fwhm_peaks) == 0: + raise RuntimeError + c_lin = cost.LeastSquares(fwhm_peaks, fwhms, dfwhms, fwhm_func.func) + # c_lin.loss = "soft_l1" + m = Minuit(c_lin, *fwhm_func.guess(fwhm_peaks, fwhms, dfwhms)) + bounds = fwhm_func.bounds(fwhms) + for arg, val in enumerate(bounds): + m.limits[arg] = val + m.simplex() + m.migrad() + m.hesse() + + p_val = scipy.stats.chi2.sf(m.fval, len(fwhm_peaks) - len(m.values)) + + results = { + "function": fwhm_func, + "module": fwhm_func.__module__, + "expression": fwhm_func.string_func("x"), + "parameters": m.values, + "uncertainties": m.errors, + "cov": m.covariance, + "csqr": (m.fval, len(fwhm_peaks) - len(m.values)), + "p_val": p_val, + } + + log.info(f'FWHM fit: {results["parameters"].to_dict()}') + log.info("FWHM fit values:") + log.info("\t Energy | FWHM (kev) | Predicted (kev)") + for i, (peak, fwhm, fwhme) in enumerate(zip(fwhm_peaks, fwhms, dfwhms)): + log.info( + f"\t{i}".ljust(4) + + str(peak).ljust(9) + + f"| {fwhm:.2f}+-{fwhme:.2f} ".ljust(5) + + f"| {fwhm_func.func(peak, *results['parameters']):.2f}".ljust(5) + ) + except RuntimeError: + pars, errs, cov = return_nans(fwhm_func.func) + results = { + "function": fwhm_func, + "module": fwhm_func.__module__, + "expression": fwhm_func.string_func("x"), + "parameters": pars, + "uncertainties": errs, + "cov": cov, + "csqr": (np.nan, np.nan), + "p_val": 0, + } + log.error("FWHM fit failed to converge") + return results + + @staticmethod + def interpolate_energy_res( + fwhm_func, fwhm_peaks, fwhm_results, interp_energy_kev=None, debug_mode=False + ): + if interp_energy_kev is not None: + for key, energy in interp_energy_kev.items(): + try: + if energy > np.nanmax(fwhm_peaks) or energy < np.nanmin(fwhm_peaks): + raise RuntimeError( + "Interpolating energy out of range of fitted peaks" + ) + rng = np.random.default_rng(1) + pars_b = rng.multivariate_normal( + fwhm_results["parameters"], fwhm_results["cov"], size=1000 + ) + interp_vals = np.array( + [fwhm_func.func(energy, *par_b) for par_b in pars_b] + ) + interp_err = np.nanstd(interp_vals) + interp_fwhm = fwhm_func.func(energy, *fwhm_results["parameters"]) + except BaseException as e: + if debug_mode: + raise (e) + interp_fwhm = np.nan + interp_err = np.nan + fwhm_results.update( + { + "interp_energy_in_kev": energy, + f"{key}_fwhm_in_kev": interp_fwhm, + f"{key}_fwhm_err_in_kev": interp_err, + } + ) + log.info( + f"FWHM {key} energy resolution at {energy} : {interp_fwhm:1.2f} +- {interp_err:1.2f} kev" + ) + return fwhm_results + + def get_energy_res_curve(self, fwhm_func, interp_energy_kev=None): + peak_parameters = self.results[list(self.results)[-1]].get( + "peak_parameters", None + ) + if peak_parameters is None: + log.error("No peak parameters found") + return + fitted_peaks_kev = np.array( + [ + peak + for peak, peak_dict in peak_parameters.items() + if peak_dict["validity"] + ] + ) + if len(fitted_peaks_kev) == 0: + return + if "fwhm_in_kev" not in peak_parameters[fitted_peaks_kev[0]]: + self.get_fwhms() + peak_parameters = self.results[list(self.results)[-1]].get( + "peak_parameters", None + ) - if len(ixtup) != len(peaks_keV): - log.info( - f"hpge_find_E_peaks: only found {len(ixtup)} of {len(peaks_keV)} expected peaks" + fwhm_peaks = np.array([], dtype=np.float32) + fwhms = np.array([], dtype=np.float32) + dfwhms = np.array([], dtype=np.float32) + all_peaks = np.array([], dtype=np.float32) + ##### + # Remove the Doppler 
Broadened peaks from calibration if found + for peak, peak_dict in peak_parameters.items(): + all_peaks = np.append(all_peaks, peak) + if np.abs(peak - 2103.5) < 1: + log.info("Tl SEP removed from fwhm fitting") + elif np.abs(peak - 1592.53) < 1: + log.info("Tl DEP removed from fwhm fitting") + elif np.abs(peak - 511.0) < 1: + log.info("e annihilation removed from fwhm fitting") + elif np.isnan(peak_dict["fwhm_in_kev"]) or np.isnan( + peak_dict["fwhm_err_in_kev"] + ): + log.info(f"{peak} failed, removed from fwhm fitting") + else: + fwhm_peaks = np.append(fwhm_peaks, peak) + fwhms = np.append(fwhms, peak_dict["fwhm_in_kev"]) + dfwhms = np.append(dfwhms, peak_dict["fwhm_err_in_kev"]) + + log.info(f"Running FWHM fit for : {fwhm_func.__name__}") + + results = self.fit_energy_res_curve(fwhm_func, fwhm_peaks, fwhms, dfwhms) + if interp_energy_kev is not None: + results = self.interpolate_energy_res( + fwhm_func, + fwhm_peaks, + results, + interp_energy_kev, + debug_mode=self.debug_mode, + ) + self.results[list(self.results)[-1]].update({fwhm_func.__name__: results}) + + def full_calibration( + self, + e_uncal, + peak_pars, + allowed_p_val=10**-20, + tail_weight=0, + peak_param="mode", + n_events=None, + ): + log.debug(f"Find peaks and compute calibration curve for {self.energy_param}") + log.debug(f"Guess is {self.pars[1]:.3f}") + self.hpge_find_energy_peaks(e_uncal) + self.hpge_get_energy_peaks(e_uncal) + + got_peaks_kev = self.peaks_kev.copy() + self.hpge_fit_energy_peaks( + e_uncal, + peak_pars=peak_pars, + allowed_p_val=allowed_p_val, + tail_weight=tail_weight, + peak_param=peak_param, + n_events=n_events, ) - return detected_max_locs[ixtup], peaks_keV[iytup], pars + if len(self.peaks_kev) != len(got_peaks_kev): + for i, peak in enumerate(got_peaks_kev): + if peak not in self.peaks_kev: + for i, peak_par in enumerate(peak_pars): + if peak_par[0] == peak: + new_kev_ranges = (peak_par[1][0] - 5, peak_par[1][1] - 5) + peak_pars[i] = (peak, new_kev_ranges, peak_par[2]) + for i, peak in enumerate(self.peaks_kev): + try: + if ( + self.results["pk_fwhms"][:, 1][i] + / self.results["pk_fwhms"][:, 0][i] + > 0.05 + ): + for i, peak_par in enumerate(peak_pars): + if peak_par[0] == peak: + new_kev_ranges = ( + peak_par[1][0] - 5, + peak_par[1][1] - 5, + ) + peak_pars[i] = (peak, new_kev_ranges, peak_par[2]) + except BaseException as e: + if self.debug_mode: + raise (e) + + self.hpge_fit_energy_peaks( + e_uncal, + peaks=got_peaks_kev, + peak_pars=peak_pars, + allowed_p_val=allowed_p_val, + tail_weight=tail_weight, + peak_param=peak_param, + n_events=n_events, + ) + if self.pars is None: + if self.deg < 1: + self.pars = np.full(2, np.nan) + else: + self.pars = np.full(self.deg + 1, np.nan) -def hpge_get_E_peaks( - hist, - bins, - var, - cal_pars, - peaks_keV, - n_sigma=3, - Etol_keV=5, - var_zero=1, - verbose=False, -): - """Get uncalibrated E peaks at the energies of peaks_keV + log.error(f"Calibration failed completely for {self.energy_param}") + return - Parameters - ---------- - hist, bins, var : array, array, array - Histogram of uncalibrated energies, see pgh.get_hist() - var cannot contain any zero entries. - cal_pars : array - Estimated energy calibration parameters used to search for peaks - peaks_keV : array - Energies of peaks to search for (in keV) - n_sigma : float - Threshold for detecting a peak in sigma (i.e. sqrt(var)) - Etol_keV : float - absolute tolerance in energy for matching peaks - var_zero : float - number used to replace zeros of var to avoid divide-by-zero in - hist/sqrt(var). 
Default value is 1. Usually when var = 0 its because - hist = 0, and any value here is fine. - verbose : bool - print debug messages + log.debug("Calibrated found") + log.info(f"Calibration pars are {self.pars}") - Returns - ------- - got_peak_locations : list - list of uncalibrated energies of found peaks - got_peak_energies : list - list of calibrated energies of found peaks - pars : list of floats - the parameters for poly(peaks_uncal) = peaks_keV (polyfit convention) - """ - # clean up var if necessary - if np.any(var == 0): - log.debug(f"hpge_find_E_peaks: replacing var zeros with {var_zero}") - var[np.where(var == 0)] = var_zero - peaks_keV = np.asarray(peaks_keV) - - # Find all maxes with > n_sigma significance - imaxes = get_i_local_maxima(hist / np.sqrt(var), n_sigma) - - # Keep maxes if they coincide with expected peaks - test_peaks_keV = np.asarray([pgf.poly(i, cal_pars) for i in bins[imaxes]]) - imatch = [abs(peaks_keV - i).min() < Etol_keV for i in test_peaks_keV] - - got_peak_locations = bins[imaxes[imatch]] - got_peak_energies = test_peaks_keV[imatch] - - # Match calculated and true peak energies - matched_energies = peaks_keV[ - [np.argmin(abs(peaks_keV - i)) for i in got_peak_energies] - ] - while not all([list(matched_energies).count(x) == 1 for x in matched_energies]): - for i in range(len(matched_energies)): - if matched_energies[i + 1] == matched_energies[i]: - # remove duplicates - if np.argmin( - abs(got_peak_energies[i : i + 2] - matched_energies[i]) - ): # i+1 is best match - got_peak_locations = np.delete(got_peak_locations, i) - got_peak_energies = np.delete(got_peak_energies, i) - else: # i is best match - got_peak_locations = np.delete(got_peak_locations, i + 1) - got_peak_energies = np.delete(got_peak_energies, i + 1) - matched_energies = np.delete(matched_energies, i) - break - i += 1 + self.get_energy_res_curve( + FWHMLinear, + interp_energy_kev={"Qbb": 2039.0}, + ) + self.get_energy_res_curve( + FWHMQuadratic, + interp_energy_kev={"Qbb": 2039.0}, + ) + + def fit_calibrated_peaks(self, e_uncal, peak_pars): + log.debug(f"Fitting {self.energy_param}") + self.hpge_get_energy_peaks(e_uncal, update_cal_pars=False) + self.hpge_fit_energy_peaks(e_uncal, peak_pars=peak_pars, update_cal_pars=False) + self.get_energy_res_curve( + FWHMLinear, + interp_energy_kev={"Qbb": 2039.0}, + ) + self.get_energy_res_curve( + FWHMQuadratic, + interp_energy_kev={"Qbb": 2039.0}, + ) + + def calibrate_prominent_peak( + self, + e_uncal, + peak, + peak_pars, + allowed_p_val=10**-20, + tail_weight=0, + peak_param="mode", + n_events=None, + ): + log.debug(f"Find peaks and compute calibration curve for {self.energy_param}") + log.debug(f"Guess is {self.pars[1]:.3f}") + if self.deg != 0: + log.error("deg must be 0 for calibrate_prominent_peak") + return + self.hpge_find_energy_peaks(e_uncal) + self.hpge_get_energy_peaks(e_uncal) + + got_peaks_kev = self.peaks_kev.copy() + self.hpge_fit_energy_peaks( + e_uncal, + peaks_kev=[peak], + peak_pars=peak_pars, + allowed_p_val=allowed_p_val, + tail_weight=tail_weight, + peak_param=peak_param, + n_events=n_events, + ) + self.hpge_fit_energy_peaks( + e_uncal, + peaks_kev=got_peaks_kev, + peak_pars=peak_pars, + allowed_p_val=allowed_p_val, + tail_weight=tail_weight, + peak_param=peak_param, + n_events=n_events, + update_cal_pars=False, + ) + self.get_energy_res_curve( + FWHMLinear, + interp_energy_kev={"Qbb": 2039.0}, + ) + self.get_energy_res_curve( + FWHMQuadratic, + interp_energy_kev={"Qbb": 2039.0}, + ) + + def plot_cal_fit(self, data, figsize=(12, 
8), fontsize=12, erange=(200, 2700)): + fig, (ax1, ax2) = plt.subplots( + 2, 1, sharex=True, gridspec_kw={"height_ratios": [3, 1]}, figsize=figsize + ) + + cal_bins = np.linspace(0, np.nanmax(self.peak_locs) * 1.1, 20) - # Calculate updated calibration curve - pars = np.polyfit(got_peak_locations, matched_energies, len(cal_pars)) + ax1.scatter(self.peaks_kev, self.peak_locs, marker="x", c="b") - return got_peak_locations, matched_energies, pars + ax1.plot(pgf.nb_poly(cal_bins, self.pars), cal_bins, lw=1, c="g") + ax1.grid() + ax1.set_xlim([erange[0], erange[1]]) + ax1.set_ylabel("Energy (ADC)", fontsize=fontsize) + ax2.scatter( + self.peaks_kev, + pgf.nb_poly(np.array(self.peak_locs), self.pars) - self.peaks_kev, + marker="x", + c="b", + ) + ax2.grid() + ax2.set_xlabel("Energy (keV)", fontsize=fontsize) + ax2.set_ylabel("Residuals (keV)", fontsize=fontsize) + plt.close() + return fig + + def plot_fits( + self, energies, figsize=(12, 8), fontsize=12, ncols=3, nrows=3, binning_kev=5 + ): + plt.rcParams["font.size"] = fontsize -def hpge_fit_E_peak_tops( + pk_parameters = self.results[list(self.results)[-1]].get( + "peak_parameters", None + ) + + fig = plt.figure(figsize=figsize) + derco = Polynomial(self.pars).deriv().coef + der = [pgf.nb_poly(5, derco) for _ in list(pk_parameters)] + for i, peak in enumerate(pk_parameters): + range_adu = 5 / der[i] + plt.subplot(nrows, ncols, i + 1) + pk_dict = pk_parameters[peak] + pk_pars = pk_dict["parameters"] + pk_ranges = pk_dict["range"] + pk_func = pk_dict["function"] + mu = pk_func.get_mu(pk_pars) if pk_pars is not None else np.nan + + try: + binning = np.arange(pk_ranges[0], pk_ranges[1], 0.1 / der[i]) + bin_cs = (binning[1:] + binning[:-1]) / 2 + + counts, bs, bars = plt.hist(energies, bins=binning, histtype="step") + if pk_pars is not None: + fit_vals = pk_func.get_pdf(bin_cs, *pk_pars, 0) * np.diff(bs)[0] + plt.plot(bin_cs, fit_vals) + plt.step( + bin_cs, + [ + (fval - count) / count if count != 0 else (fval - count) + for count, fval in zip(counts, fit_vals) + ], + where="mid", + ) + + plt.annotate( + f"{peak:.1f} keV", (0.02, 0.8), xycoords="axes fraction" + ) + plt.annotate( + f"p-value : {pk_dict['p_value']:.4f}", + (0.02, 0.7), + xycoords="axes fraction", + ) + plt.xlabel("Energy (keV)") + plt.ylabel("Counts") + plt.legend(loc="upper left", frameon=False) + + plt.xlim([mu - range_adu, mu + range_adu]) + locs, labels = plt.xticks() + + def get_peak_labels( + labels: list[str], pars: list[float] + ) -> tuple(list[float], list[float]): + out = [] + out_labels = [] + for i, label in enumerate(labels): + if i % 2 == 1: + continue + else: + out.append(f"{pgf.nb_poly(label, pars):.1f}") + out_labels.append(label) + return out_labels, out + + new_locs, new_labels = get_peak_labels(locs, self.pars) + plt.xticks(ticks=new_locs, labels=new_labels) + + except BaseException as e: + if self.debug_mode: + raise (e) + + plt.tight_layout() + plt.close() + return fig + + def plot_eres_fit(self, data, erange=(200, 2700), figsize=(12, 8), fontsize=12): + plt.rcParams["font.size"] = fontsize + + pk_parameters = self.results[list(self.results)[-1]].get( + "peak_parameters", None + ) + + if pk_parameters is None: + fig = plt.figure() + return fig + + ##### + # Remove the Tl SEP and DEP from calibration if found + fwhm_peaks = np.array([], dtype=np.float32) + fwhms = np.array([], dtype=np.float32) + dfwhms = np.array([], dtype=np.float32) + + for peak, pk_dict in pk_parameters.items(): + if peak == 2103.53: + pass + elif peak == 1592.53: + pass + elif peak == 
511.0: + pass + elif pk_dict["validity"] is False: + pass + elif np.isnan(pk_dict["fwhm_err_in_kev"]): + pass + else: + fwhm_peaks = np.append(fwhm_peaks, peak) + fwhms = np.append(fwhms, pk_dict["fwhm_in_kev"]) + dfwhms = np.append(dfwhms, pk_dict["fwhm_err_in_kev"]) + + fwhm_dicts = {} + interp_energy = None + interp_fwhm_name = None + for name, item in self.results[list(self.results)[-1]].items(): + if "FWHM" in name: + fwhm_dicts[name] = item + if "interp_energy_in_kev" in item: + interp_energy = item["interp_energy_in_kev"] + for field in item: + if "_fwhm_in_kev" in field: + interp_fwhm_name = field.replace("_fwhm_in_kev", "") + + fig, (ax1, ax2) = plt.subplots( + 2, 1, sharex=True, gridspec_kw={"height_ratios": [3, 1]}, figsize=figsize + ) + if len(np.where((~np.isnan(fwhms)) & (~np.isnan(dfwhms)))[0]) > 0: + ax1.errorbar(fwhm_peaks, fwhms, yerr=dfwhms, marker="x", ls=" ", c="black") + + fwhm_slope_bins = np.arange(erange[0], erange[1], 10) + + if interp_energy is not None: + qbb_line_vx = [interp_energy, interp_energy] + qbb_line_hx = [erange[0], interp_energy] + for name, fwhm_dict in fwhm_dicts.items(): + qbb_line_vy = [np.inf, -np.inf] + low_lim = 0.9 * np.nanmin( + fwhm_dict["function"].func( + fwhm_slope_bins, *fwhm_dict["parameters"] + ) + ) + up_lim = fwhm_dict[f"{interp_fwhm_name}_fwhm_in_kev"] + if low_lim < qbb_line_vy[0]: + qbb_line_vy[0] = low_lim + if up_lim > qbb_line_vy[1]: + qbb_line_vy[1] = up_lim + ax1.plot( + qbb_line_hx, + [ + fwhm_dict[f"{interp_fwhm_name}_fwhm_in_kev"], + fwhm_dict[f"{interp_fwhm_name}_fwhm_in_kev"], + ], + lw=1, + c="r", + ls="--", + ) + ax1.plot( + fwhm_slope_bins, + fwhm_dict["function"].func( + fwhm_slope_bins, *fwhm_dict["parameters"] + ), + lw=1, + label=f'{name}, {interp_fwhm_name} fwhm: {fwhm_dict[f"{interp_fwhm_name}_fwhm_in_kev"]:1.2f} +- {fwhm_dict[f"{interp_fwhm_name}_fwhm_err_in_kev"]:1.2f} keV', + ) + ax1.plot(qbb_line_vx, qbb_line_vy, lw=1, c="r", ls="--") + + ax1.set_xlim(erange) + ax1.set_ylim([low_lim, None]) + ax1.set_ylabel("FWHM energy resolution (keV)") + for _, fwhm_dict in fwhm_dicts.items(): + ax2.plot( + fwhm_peaks, + ( + fwhms + - fwhm_dict["function"].func( + fwhm_peaks, *fwhm_dict["parameters"] + ) + ) + / dfwhms, + lw=0, + marker="x", + ) + ax2.plot(erange, [0, 0], color="black", lw=0.5) + ax2.set_xlabel("Energy (keV)") + ax2.set_ylabel("Normalised Residuals") + plt.tight_layout() + plt.close() + return fig + + +class FWHMLinear: + @staticmethod + def func(x, a, b): + return np.sqrt(a + b * x) + + @staticmethod + def string_func(input_param): + return f"(a+b*{input_param})**(0.5)" + + @staticmethod + def guess(xs, ys, y_errs): + return [np.nanmin(ys), 10**-3] + + @staticmethod + def bounds(ys): + return [(0, None), (0, None)] + + +class FWHMQuadratic: + @staticmethod + def func(x, a, b, c): + return np.sqrt(a + b * x + c * x**2) + + @staticmethod + def string_func(input_param): + return f"(a+b*{input_param}+c*{input_param}**2)**(0.5)" + + @staticmethod + def guess(xs, ys, y_errs): + return [np.nanmin(ys), 2 * 10**-3, 10**-8] + + @staticmethod + def bounds(ys): + return [(0, np.nanmin(ys) ** 2), (10**-3, None), (0, None)] + + +def hpge_fit_energy_peak_tops( hist, bins, var, @@ -200,6 +1712,7 @@ def hpge_fit_E_peak_tops( cost_func="Least Squares", inflate_errors=False, gof_method="var", + debug_mode=False, ): """Fit gaussians to the tops of peaks @@ -227,19 +1740,23 @@ def hpge_fit_E_peak_tops( """ pars_list = [] cov_list = [] - for E_peak in peak_locs: + for e_peak in peak_locs: try: - pars, cov = 
pgf.gauss_mode_width_max( + pars, cov = pgb.gauss_mode_width_max( hist, bins, var, - mode_guess=E_peak, + mode_guess=e_peak, n_bins=n_to_fit, cost_func=cost_func, inflate_errors=inflate_errors, gof_method=gof_method, ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif debug_mode: + raise (e) pars, cov = None, None pars_list.append(pars) @@ -247,38 +1764,114 @@ def hpge_fit_E_peak_tops( return np.array(pars_list, dtype=object), np.array(cov_list, dtype=object) -def get_hpge_E_peak_par_guess(hist, bins, var, func, mode_guess): - """Get parameter guesses for func fit to peak in hist +def get_hpge_energy_peak_par_guess( + energy, func, fit_range=None, bin_width=None, mode_guess=None +): + """ + Get parameter guesses for func fit to peak in hist Parameters ---------- - hist, bins, var : array, array, array - Histogram of uncalibrated energies, see pgh.get_hist(). Should be - windowed around the peak. + energy : array + An array of energy values in the range around the peak for guessing. func : function - The function to be fit to the peak in the (windowed) hist + The function to be fit to the peak in the histogram. + fit_range : tuple, optional + A tuple specifying the range around the peak to perform the fit. If not provided, the entire range of energy values will be used. + bin_width : float, optional + The width of the bins in the histogram. Default is 1. + mode_guess : float, optional + A guess for the mode (mu) parameter of the function. If not provided, it will be estimated from the data. + + Returns + ------- + ValueView + A ValueView object from iminuit containing the parameter guesses for the function fit. + + Notes + ----- + This function calculates parameter guesses for fitting a function to a peak in a histogram. It uses various methods to estimate the parameters based on the provided energy values and the selected function. + + If the function is 'gauss_on_step', the following parameters will be estimated: + - n_sig: Number of signal events in the peak. + - mu: Mean of the peak. + - sigma: Standard deviation of the peak. + - n_bkg: Number of background events. + - hstep: Height of the step between the peak and the background. + - x_lo: Lower bound of the fit range. + - x_hi: Upper bound of the fit range. + + If the function is 'hpge_peak', the following parameters will be estimated: + - n_sig: Number of signal events in the peak. + - mu: Mean of the peak. + - sigma: Standard deviation of the peak. + - htail: Height of the tail component. + - tau: Decay constant of the tail component. + - n_bkg: Number of background events. + - hstep: Height of the step between the peak and the background. + - x_lo: Lower bound of the fit range. + - x_hi: Upper bound of the fit range. + + If the provided function is not implemented, an error will be raised. 
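+    When bin_width is not provided, it is derived from the data with a
+    Freedman-Diaconis-style rule and then refined using a Gaussian estimate
+    of the peak width from a first-pass histogram. A minimal sketch of that
+    heuristic (the variable names below are illustrative only, not part of
+    this function's API):
+
+    >>> import numpy as np
+    >>> energy = np.random.default_rng(0).normal(1000, 2, 10_000)
+    >>> iqr = np.nanpercentile(energy, 75) - np.nanpercentile(energy, 25)
+    >>> first_pass = 2 * iqr * len(energy) ** (-1 / 3)
+    >>> # the final bin width replaces the IQR with a fitted Gaussian sigma:
+    >>> # bin_width = 2 * sigma_guess * len(energy) ** (-1 / 3)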
+ + Examples + -------- + >>> energy = [1, 2, 3, 4, 5] + >>> func = pgf.gauss_on_step + >>> fit_range = (2, 4) + >>> bin_width = 0.5 + >>> mode_guess = 3.5 + >>> get_hpge_energy_peak_par_guess(energy, func, fit_range, bin_width, mode_guess) + {'n_sig': 3, 'mu': 3.5, 'sigma': 0.5, 'n_bkg': 2, 'hstep': 0.5, 'x_lo': 2, 'x_hi': 4} """ + if fit_range is None: + fit_range = (np.nanmin(energy), np.nanmax(energy)) + + energy = energy[(energy >= fit_range[0]) & (energy <= fit_range[1])] + if bin_width is None: + init_bin_width = ( + 2 + * (np.nanpercentile(energy, 75) - np.nanpercentile(energy, 25)) + * len(energy) ** (-1 / 3) + ) + init_hist, init_bins, _ = pgh.get_hist( + energy, dx=init_bin_width, range=fit_range + ) + try: + _, init_sigma, _ = pgh.get_gaussian_guess(init_hist, init_bins) + except IndexError: + init_hist, init_bins, _ = pgh.get_hist( + energy, dx=init_bin_width / 2, range=fit_range + ) + try: + _, init_sigma, _ = pgh.get_gaussian_guess(init_hist, init_bins) + except IndexError: + init_sigma = np.nanstd(energy) + bin_width = 2 * (init_sigma) * len(energy) ** (-1 / 3) + + hist, bins, var = pgh.get_hist(energy, dx=bin_width, range=fit_range) + if ( - func == pgf.gauss_step_cdf - or func == pgf.gauss_step_pdf - or func == pgf.extended_gauss_step_pdf + func == pgf.gauss_on_step + or func == pgf.hpge_peak + or func == pgf.gauss_on_uniform ): # get mu and height from a gauss fit, also sigma as fallback - pars, cov = pgf.gauss_mode_width_max( - hist, bins, var, mode_guess=mode_guess, n_bins=10 + pars, cov = pgb.gauss_mode_width_max( + hist, bins, var, mode_guess=mode_guess, n_bins=5 ) + bin_centres = pgh.get_bin_centers(bins) if pars is None: - log.info("get_hpge_E_peak_par_guess: gauss_mode_width_max failed") + log.info("get_hpge_energy_peak_par_guess: gauss_mode_width_max failed") i_0 = np.argmax(hist) mu = bin_centres[i_0] height = hist[i_0] sigma_guess = None else: - mu = mode_guess + mu = mode_guess if mode_guess is not None else pars[0] sigma_guess = pars[1] height = pars[2] - # get bg and step from edges of hist bg = np.mean(hist[-10:]) step = bg - np.mean(hist[:10]) @@ -293,191 +1886,145 @@ def get_hpge_E_peak_par_guess(hist, bins, var, func, mode_guess): bl=bg - step / 2, method="interpolate", )[0] - if sigma == 0: + if sigma <= 0: raise ValueError - except: - sigma = pgh.get_fwfm( - 0.6065, - hist, - bins, - var, - mx=height, - bl=bg - step / 2, - method="fit_slopes", - )[0] - if sigma == 0: - log.info("get_hpge_E_peak_par_guess: sigma estimation failed") - if sigma_guess is not None: + except ValueError: + try: + sigma = pgh.get_fwfm( + 0.6065, + hist, + bins, + var, + mx=height, + bl=bg - step / 2, + method="fit_slopes", + )[0] + except RuntimeError: + sigma = -1 + if sigma <= 0 or sigma > 1000: + if sigma_guess is not None and sigma_guess > 0 and sigma_guess < 1000: sigma = sigma_guess else: - return [] - + (_, sigma, _) = pgh.get_gaussian_guess(hist, bins) + if sigma is not None and sigma_guess > 0 and sigma_guess < 1000: + pass + else: + log.info( + "get_hpge_energy_peak_par_guess: sigma estimation failed" + ) + return {} # now compute amp and return n_sig = np.sum( hist[(bin_centres > mu - 3 * sigma) & (bin_centres < mu + 3 * sigma)] ) n_bkg = np.sum(hist) - n_sig - hstep = step / (bg + np.mean(hist[:10])) - - parguess = [n_sig, mu, sigma / 2, n_bkg, hstep, bins[0], bins[-1], 0] - for i, guess in enumerate(parguess): + parguess = { + "n_sig": n_sig, + "mu": mu, + "sigma": sigma, + "n_bkg": n_bkg, + "x_lo": bins[0], + "x_hi": bins[-1], + } + + if func == pgf.gauss_on_step 
or func == pgf.hpge_peak: + hstep = step / (bg + np.mean(hist[:10])) + parguess["hstep"] = hstep + + if func == pgf.hpge_peak: + sigma = sigma * 0.8 # roughly remove some amount due to tail + # for now hard-coded + htail = 1.0 / 5 + tau = sigma / 2 + parguess["sigma"] = sigma + parguess["htail"] = htail + parguess["tau"] = tau + + for name, guess in parguess.items(): if np.isnan(guess): - parguess[i] = 0 - - return parguess - - if ( - func == pgf.radford_cdf - or func == pgf.radford_pdf - or func == pgf.extended_radford_pdf - ): - # guess mu, height - pars, cov = pgf.gauss_mode_width_max( - hist, bins, var, mode_guess=mode_guess, n_bins=10 - ) - bin_centres = pgh.get_bin_centers(bins) - if pars is None: - log.info("get_hpge_E_peak_par_guess: gauss_mode_width_max failed") - sigma_guess = None - - else: - sigma_guess = pars[1] - # mu=pars[0] - # height=pars[2] - i_0 = np.argmax(hist) - mu = bin_centres[i_0] - height = hist[i_0] - - # get bg and step from edges of hist - bg0 = np.mean(hist[-10:]) - step = bg0 - np.mean(hist[:10]) - - # get sigma from fwfm with f = 1/sqrt(e) - try: - sigma = pgh.get_fwfm( - 0.6065, - hist, - bins, - var, - mx=height, - bl=bg0 + step / 2, - method="interpolate", - )[0] - if sigma == 0: - raise ValueError - except: - sigma = pgh.get_fwfm( - 0.6065, - hist, - bins, - var, - mx=height, - bl=bg0 + step / 2, - method="fit_slopes", - )[0] - if sigma == 0: - log.info("get_hpge_E_peak_par_guess: sigma estimation failed") - if sigma_guess is not None: - sigma = sigma_guess - else: - return [] - sigma = sigma * 0.8 # roughly remove some amount due to tail - - # for now hard-coded - htail = 1.0 / 5 - tau = 0.5 * sigma - - hstep = step / (bg0 + np.mean(hist[:10])) - - n_sig = np.sum( - hist[(bin_centres > mu - 3 * sigma) & (bin_centres < mu + 3 * sigma)] - ) - n_bkg = np.sum(hist) - n_sig - - parguess = [n_sig, mu, sigma, htail, tau, n_bkg, hstep, bins[0], bins[-1], 0] - - for i, guess in enumerate(parguess): - if np.isnan(guess): - parguess[i] = 0 - - return parguess + parguess[name] = 0 else: - log.error(f"get_hpge_E_peak_par_guess not implemented for {func.__name__}") - return [] + log.error(f"get_hpge_energy_peak_par_guess not implemented for {func.__name__}") + return return_nans(func) + return convert_to_minuit(parguess, func).values -def get_hpge_E_fixed(func): - """ - Returns: Sequence list of fixed indexes for fitting and mask for parameters - """ - - if ( - func == pgf.gauss_step_cdf - or func == pgf.gauss_step_pdf - or func == pgf.extended_gauss_step_pdf - ): - # pars are: n_sig, mu, sigma, n_bkg, hstep, components - return [5, 6, 7], np.array([True, True, True, True, True, False, False, False]) - if ( - func == pgf.radford_cdf - or func == pgf.radford_pdf - or func == pgf.extended_radford_pdf - ): - # pars are: n_sig, mu, sigma, htail,tau, n_bkg, hstep, components - return [7, 8, 9], np.array( - [True, True, True, True, True, True, True, False, False, False] - ) +def get_hpge_energy_fixed(func): + """ + Get the fixed indexes for fitting and mask for parameters based on the given function. - else: - log.error(f"get_hpge_E_fixed not implemented for {func.__name__}") - return None - return None + Parameters + ---------- + func : function + The function for which the fixed indexes and mask are to be determined. + Returns + ------- + fixed : list + A sequence list of fixed indexes for fitting. + mask : ndarray + A boolean mask indicating which parameters are fixed (False) and which are not fixed (True). 
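+
+    Examples
+    --------
+    A minimal illustration for one of the supported functions (the mask
+    ordering follows func.required_args()):
+
+    >>> fixed, mask = get_hpge_energy_fixed(pgf.gauss_on_step)
+    >>> fixed
+    ['x_lo', 'x_hi']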
+ """ -def get_hpge_E_bounds(func, parguess): if ( - func == pgf.radford_cdf - or func == pgf.radford_pdf - or func == pgf.extended_radford_pdf + func == pgf.gauss_on_step + or func == pgf.hpge_peak + or func == pgf.gauss_on_uniform ): - return [ - (0, None), - (parguess[-3], parguess[-2]), - (0, None), - (0, 1), - (None, None), - (0, None), - (-1, 1), - (None, None), - (None, None), - (None, None), - ] + # pars are: n_sig, mu, sigma, n_bkg, hstep, components + fixed = ["x_lo", "x_hi"] - elif ( - func == pgf.gauss_step_cdf - or func == pgf.gauss_step_pdf - or func == pgf.extended_gauss_step_pdf - ): - return [ - (0, None), - (parguess[-3], parguess[-2]), - (0, None), - (0, None), - (-1, 1), - (None, None), - (None, None), - (None, None), - ] + else: + log.error(f"get_hpge_energy_fixed not implemented for {func.__name__}") + return None + mask = ~np.in1d(func.required_args(), fixed) + return fixed, mask + + +def get_hpge_energy_bounds(func, parguess): + if func == pgf.gauss_on_step: + return { + "n_sig": (0, None), + "mu": (parguess["x_lo"], parguess["x_hi"]), + "sigma": (0, None), + "n_bkg": (0, None), + "hstep": (-1, 1), + "x_lo": (None, None), + "x_hi": (None, None), + } + + elif func == pgf.hpge_peak: + return { + "n_sig": (0, None), + "mu": (parguess["x_lo"], parguess["x_hi"]), + "sigma": (0, None), + "htail": (0, 0.5), + "tau": (0.1 * parguess["sigma"], 10 * parguess["sigma"]), + "n_bkg": (0, None), + "hstep": (-1, 1), + "x_lo": (None, None), + "x_hi": (None, None), + } + + if func == pgf.gauss_on_uniform: + return { + "n_sig": (0, None), + "mu": (parguess["x_lo"], parguess["x_hi"]), + "sigma": (0, None), + "n_bkg": (0, None), + "x_lo": (None, None), + "x_hi": (None, None), + } else: - log.error(f"get_hpge_E_bounds not implemented for {func.__name__}") + log.error(f"get_hpge_energy_bounds not implemented for {func.__name__}") return [] -class tail_prior: +class TailPrior: """ Generic least-squares cost function with error. """ @@ -485,7 +2032,7 @@ class tail_prior: verbose = 0 errordef = Minuit.LIKELIHOOD # for Minuit to compute errors correctly - def __init__(self, data, model, tail_weight=100): + def __init__(self, data, model, tail_weight=0): self.model = model # model predicts y for given x self.data = data self.tail_weight = tail_weight @@ -493,8 +2040,13 @@ def __init__(self, data, model, tail_weight=100): def _call(self, *pars): return self.__call__(*pars[0]) + def _value(self, *pars): + return self.__call__(*pars[0]) + def __call__( self, + x_lo, + x_hi, n_sig, mu, sigma, @@ -502,349 +2054,339 @@ def __call__( tau, n_bkg, hstep, - lower_range, - upper_range, - components, ): return self.tail_weight * np.log(htail + 0.1) # len(self.data)/ -def staged_fit( - energies, hist, bins, var, func_i, gof_func_i, simplex, mode_guess, tail_weight=100 +def unbinned_staged_energy_fit( + energy, + func, + gof_range=None, + fit_range=None, + guess=None, + guess_func=get_hpge_energy_peak_par_guess, + bounds_func=get_hpge_energy_bounds, + fixed_func=get_hpge_energy_fixed, + guess_kwargs=None, + bounds_kwargs=None, + fixed_kwargs=None, + tol=None, + tail_weight=0, + allow_tail_drop=True, + bin_width=None, + lock_guess=False, + display=0, ): - par_guesses = get_hpge_E_peak_par_guess(hist, bins, var, func_i, mode_guess) - bounds = get_hpge_E_bounds(func_i, par_guesses) - fixed, mask = get_hpge_E_fixed(func_i) + """ + Unbinned fit to energy. This is different to the default fitting as + it will try different fitting methods and choose the best. This is necessary for the lower statistics. 
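+
+    In outline: parameter guesses are built with guess_func, the extended
+    unbinned likelihood is minimised twice (migrad with and without an
+    initial simplex step), both fits are validity-checked and compared via a
+    binned Pearson goodness-of-fit and their summed fractional errors, and
+    the better one is kept (with an extra simplex round if neither
+    converged). For pgf.hpge_peak with allow_tail_drop=True a tail-less
+    pgf.gauss_on_step fit is also performed and returned instead if it gives
+    a better p-value or the fitted tail is smaller than its uncertainty.
+
+    The return value can be unpacked as follows (illustrative, assuming an
+    energy array is in hand):
+
+    >>> fit = unbinned_staged_energy_fit(energy, func=pgf.hpge_peak)
+    >>> values, errors, cov, gof, func, mask, valid, minuit = fit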
+ """ - if func_i == pgf.extended_radford_pdf or func_i == pgf.radford_pdf: - cost_func = cost.ExtendedUnbinnedNLL(energies, func_i) + tail_prior( - energies, func_i, tail_weight=tail_weight + if fit_range is None: + fit_range = (np.nanmin(energy), np.nanmax(energy)) + + if gof_range is None: + gof_range = fit_range + + if bin_width is None: + init_bin_width = ( + 2 + * (np.nanpercentile(energy, 75) - np.nanpercentile(energy, 25)) + * len(energy) ** (-1 / 3) + ) + init_hist, init_bins, _ = pgh.get_hist( + energy, dx=init_bin_width, range=fit_range ) - m = Minuit(cost_func, *par_guesses) - m.limits = bounds - for fix in fixed: - m.fixed[fix] = True - - m.values["htail"] = 0 - m.values["tau"] = 0 - m.fixed["htail"] = True - m.fixed["tau"] = True - if simplex == True: - m.simplex().migrad() - else: - m.migrad() try: - # set htail to guess - m.values["htail"] = par_guesses[3] - m.values["tau"] = par_guesses[4] - m.fixed = False - for fix in fixed: - m.fixed[fix] = True - - if simplex == True: - m.simplex().migrad() - else: - m.migrad() - m.hesse() - pars_i = m.values - errs_i = m.errors - cov_i = m.covariance - valid_fit = m.valid - if valid_fit == False: - raise RuntimeError - except: - func_i = pgf.extended_gauss_step_pdf - gof_func_i = pgf.gauss_step_pdf - pars_i, errs_i, cov_i, func_i, gof_func_i, mask, valid_fit = staged_fit( - energies, hist, bins, var, func_i, gof_func_i, simplex, mode_guess + _, init_sigma, _ = pgh.get_gaussian_guess(init_hist, init_bins) + except IndexError: + init_hist, init_bins, _ = pgh.get_hist( + energy, dx=init_bin_width / 2, range=fit_range ) - - # check htail - if ( - m.values["htail"] < 0.01 - or m.values["htail"] < 2 * m.errors["htail"] - or np.isnan(m.values).any() - ): # switch to stat test - func_i = pgf.extended_gauss_step_pdf - gof_func_i = pgf.gauss_step_pdf - pars_i, errs_i, cov_i, func_i, gof_func_i, mask, valid_fit = staged_fit( - energies, hist, bins, var, func_i, gof_func_i, simplex, mode_guess + try: + _, init_sigma, _ = pgh.get_gaussian_guess(init_hist, init_bins) + except IndexError: + init_sigma = np.nanstd(energy) + bin_width = 2 * (init_sigma) * len(energy) ** (-1 / 3) + + gof_hist, gof_bins, gof_var = pgh.get_hist(energy, range=gof_range, dx=bin_width) + + if guess is not None: + if not isinstance(guess, ValueView): + x0 = convert_to_minuit(guess, func) + if lock_guess is True: + x0 = guess + x0["x_lo"] = fit_range[0] + x0["x_hi"] = fit_range[1] + else: + x0["x_lo"] = fit_range[0] + x0["x_hi"] = fit_range[1] + x1 = guess_func( + energy, + func, + fit_range, + bin_width=bin_width, + **guess_kwargs if guess_kwargs is not None else {}, ) - + for arg, val in x1.items(): + if arg not in x0: + x0[arg] = val + if len(x0) == len(x1): + cs, _ = pgb.goodness_of_fit( + gof_hist, gof_bins, None, func.pdf_norm, x0, method="Pearson" + ) + cs2, _ = pgb.goodness_of_fit( + gof_hist, gof_bins, None, func.pdf_norm, x1, method="Pearson" + ) + if cs >= cs2: + x0 = x1 + else: + x0 = x1 else: - cost_func = cost.ExtendedUnbinnedNLL(energies, func_i) - m = Minuit(cost_func, *par_guesses) - m.limits = bounds - for fix in fixed: - m.fixed[fix] = True - if simplex == True: + if func == pgf.hpge_peak: + x0_notail = guess_func( + energy, + pgf.gauss_on_step, + fit_range, + bin_width=bin_width, + **guess_kwargs if guess_kwargs is not None else {}, + ) + c = cost.ExtendedUnbinnedNLL(energy, pgf.gauss_on_step.pdf_ext) + m = Minuit(c, *x0_notail) + bounds = bounds_func( + pgf.gauss_on_step, + x0_notail, + **bounds_kwargs if bounds_kwargs is not None else {}, + ) + for arg, 
val in bounds.items(): + m.limits[arg] = val + fixed, mask = fixed_func( + pgf.gauss_on_step, + **fixed_kwargs if fixed_kwargs is not None else {}, + ) + m.fixed[fixed] = True m.simplex().migrad() - else: - m.migrad() - - m.hesse() - - pars_i = m.values - errs_i = m.errors - cov_i = m.covariance - - valid_fit = m.valid - - return pars_i, errs_i, cov_i, func_i, gof_func_i, mask, valid_fit - - -def hpge_fit_E_peaks( - E_uncal, - mode_guesses, - wwidths, - n_bins=50, - funcs=pgf.gauss_step_cdf, - method="unbinned", - gof_funcs=None, - n_events=None, - allowed_p_val=0.05, - uncal_is_int=False, - simplex=False, - tail_weight=100, -): - """Fit the Energy peaks specified using the function given - - Parameters - ---------- - E_uncal : array - unbinned energy data to be fit - mode_guesses : array - array of guesses for modes of each peak - wwidths : float or array of float - array of widths to use for the fit windows (in units of E_uncal), - typically on the order of 10 sigma where sigma is the peak width - n_bins : int or array of ints - array of number of bins to use for the fit window histogramming - funcs : function or array of functions - funcs to be used to fit each region - method : str - default is unbinned fit can specify to use binned fit method instead - gof_funcs : function or array of functions - functions to use for calculation goodness of fit if unspecified will use same func as fit - uncal_is_int : bool - if True, attempts will be made to avoid picket-fencing when binning - E_uncal - simplex : bool determining whether to do a round of simpson minimisation before gradient minimisation - n_events : int number of events to use for unbinned fit - allowed_p_val : lower limit on p_val of fit - - Returns - ------- - pars : list of array - a list of best-fit parameters for each peak fit - covs : list of 2D arrays - a list of covariance matrices for each pars - binwidths : list - a list of bin widths used for each peak fit - ranges: list of array - a list of [Euc_min, Euc_max] used for each peak fit - """ - pars = np.zeros(len(mode_guesses), dtype="object") - errors = np.zeros(len(mode_guesses), dtype="object") - covs = np.zeros(len(mode_guesses), dtype="object") - binws = np.zeros(len(mode_guesses)) - ranges = np.zeros(len(mode_guesses), dtype="object") - p_vals = np.zeros(len(mode_guesses)) - valid_pks = np.zeros(len(mode_guesses), dtype=bool) - out_funcs = np.zeros(len(mode_guesses), dtype="object") - - for i_peak, mode_guess in enumerate(mode_guesses): - # get args for this peak - wwidth_i = wwidths if not isinstance(wwidths, list) else wwidths[i_peak] - n_bins_i = n_bins if np.isscalar(n_bins) else n_bins[i_peak] - func_i = funcs[i_peak] if hasattr(funcs, "__len__") else funcs - wleft_i = wwidth_i / 2 if np.isscalar(wwidth_i) else wwidth_i[0] - wright_i = wwidth_i / 2 if np.isscalar(wwidth_i) else wwidth_i[1] - if gof_funcs is not None: - gof_func_i = ( - gof_funcs[i_peak] if hasattr(gof_funcs, "__len__") else gof_funcs + m.hesse() + x0 = guess_func( + energy, + func, + fit_range, + bin_width=bin_width, + **guess_kwargs if guess_kwargs is not None else {}, ) - else: - gof_func_i = func_i - - try: - # bin a histogram - Euc_min = mode_guesses[i_peak] - wleft_i - Euc_max = mode_guesses[i_peak] + wright_i - if uncal_is_int == True: - Euc_min, Euc_max, n_bins_i = pgh.better_int_binning( - x_lo=Euc_min, x_hi=Euc_max, n_bins=n_bins_i - ) + if m.valid: + for arg in x0_notail.to_dict(): + x0[arg] = x0_notail[arg] - if method == "unbinned": - energies = E_uncal[(E_uncal > Euc_min) & (E_uncal < 
Euc_max)][:n_events] - hist, bins, var = pgh.get_hist( - energies, bins=n_bins_i, range=(Euc_min, Euc_max) - ) - if func_i == pgf.extended_radford_pdf or pgf.extended_gauss_step_pdf: - ( - pars_i, - errs_i, - cov_i, - func_i, - gof_func_i, - mask, - valid_fit, - ) = staged_fit( - energies, - hist, - bins, - var, - func_i, - gof_func_i, - simplex, - mode_guess, - tail_weight=tail_weight, - ) - if pars_i["n_sig"] < 100: - valid_fit = False - else: - par_guesses = get_hpge_E_peak_par_guess(hist, bins, var, func_i) - bounds = get_hpge_E_bounds(func_i, par_guesses) - fixed, mask = get_hpge_E_fixed(func_i) - - cost_func = cost.ExtendedUnbinnedNLL(energies, func_i) - m = Minuit(cost_func, *par_guesses) - m.limits = bounds - for fix in fixed: - m.fixed[fix] = True - if simplex == True: - m.simplex().migrad() - else: - m.migrad() - m.hesse() + else: + x0 = guess_func( + energy, + func, + fit_range, + bin_width=bin_width, + **guess_kwargs if guess_kwargs is not None else {}, + ) - pars_i = m.values - errs_i = m.errors - cov_i = m.covariance - valid_fit = m.valid + if (func == pgf.hpge_peak) and allow_tail_drop is True: + fit_no_tail = unbinned_staged_energy_fit( + energy, + func=pgf.gauss_on_step, + gof_range=gof_range, + fit_range=fit_range, + guess=None, + guess_func=guess_func, + bounds_func=bounds_func, + fixed_func=fixed_func, + guess_kwargs=guess_kwargs, + bounds_kwargs=bounds_kwargs, + fixed_kwargs=fixed_kwargs, + tol=tol, + tail_weight=None, + allow_tail_drop=False, + bin_width=bin_width, + ) - csqr = pgf.goodness_of_fit( - hist, - bins, - None, - gof_func_i, - pars_i, - method="Pearson", - scale_bins=True, - ) + c = cost.ExtendedUnbinnedNLL(energy, func.pdf_ext) + TailPrior( + energy, func, tail_weight=tail_weight + ) + else: + c = cost.ExtendedUnbinnedNLL(energy, func.pdf_ext) + + fixed, mask = fixed_func(func, **fixed_kwargs if fixed_kwargs is not None else {}) + bounds = bounds_func(func, x0, **bounds_kwargs if bounds_kwargs is not None else {}) + + # try without simplex + m = Minuit(c, *x0) + if tol is not None: + m.tol = tol + m.fixed[fixed] = True + for arg, val in bounds.items(): + m.limits[arg] = val + m.migrad() + m.hesse() + + valid1 = ( + m.valid + & (~np.isnan(np.array(m.errors)[mask]).any()) + & (~(np.array(m.errors)[mask] == 0).all()) + ) - else: - hist, bins, var = pgh.get_hist( - E_uncal, bins=n_bins_i, range=(Euc_min, Euc_max) - ) - par_guesses = get_hpge_E_peak_par_guess(hist, bins, var, func_i) - bounds = get_hpge_E_bounds(func_i, par_guesses) - fixed, mask = get_hpge_E_fixed(func_i) - pars_i, errs_i, cov_i = pgf.fit_binned( - func_i, - hist, - bins, - var=var, - guess=par_guesses, - cost_func=method, - Extended=True, - fixed=fixed, - simplex=simplex, - bounds=bounds, - ) - valid_fit = True + cs = pgb.goodness_of_fit( + gof_hist, + gof_bins, + gof_var, + func.get_pdf, + m.values, + method="Pearson", + scale_bins=True, + ) + cs = (cs[0], cs[1] + len(np.where(mask)[0])) + + fit1 = (m.values, m.errors, m.covariance, cs, func, mask, valid1, m) + + # Now try with simplex + m2 = Minuit(c, *x0) + if tol is not None: + m2.tol = tol + m2.fixed[fixed] = True + for arg, val in bounds.items(): + m2.limits[arg] = val + m2.simplex().migrad() + m2.hesse() + + valid2 = ( + m2.valid + & (~np.isnan(np.array(m2.errors)[mask]).any()) + & (~(np.array(m2.errors)[mask] == 0).all()) + ) - csqr = pgf.goodness_of_fit( - hist, - bins, - None, - gof_func_i, - pars_i, - method="Pearson", - scale_bins=False, - ) + cs2 = pgb.goodness_of_fit( + gof_hist, + gof_bins, + gof_var, + func.get_pdf, + 
m2.values, + method="Pearson", + scale_bins=True, + ) + cs2 = (cs2[0], cs2[1] + len(np.where(mask)[0])) + + fit2 = (m2.values, m2.errors, m2.covariance, cs2, func, mask, valid2, m2) + + frac_errors1 = np.sum(np.abs(np.array(m.errors)[mask] / np.array(m.values)[mask])) + frac_errors2 = np.sum(np.abs(np.array(m2.errors)[mask] / np.array(m2.values)[mask])) + + if display > 1: + hist, bins, _ = pgh.get_hist(energy, range=fit_range, dx=bin_width) + bin_cs = (bins[:-1] + bins[1:]) / 2 + + m_fit = func.get_pdf(bin_cs, *m.values) * np.diff(bin_cs)[0] + m2_fit = func.get_pdf(bin_cs, *m2.values) * np.diff(bin_cs)[0] + guess_fit = func.get_pdf(bin_cs, *x0) * np.diff(bin_cs)[0] + plt.figure() + plt.step(bin_cs, hist, label="hist") + plt.plot(bin_cs, guess_fit, label="Guess") + plt.plot(bin_cs, m_fit, label=f"Fit 1: {cs}") + plt.plot(bin_cs, m2_fit, label=f"Fit 2: {cs2}") + plt.legend() + plt.show() - if np.isnan(pars_i).any(): - log.debug( - f"hpge_fit_E_peaks: fit failed for i_peak={i_peak} at loc {mode_guesses[i_peak]:g}, par is nan : {pars_i}" + if valid1 is False and valid2 is False: + log.debug("Extra simplex needed") + m = Minuit(c, *x0) + if tol is not None: + m.tol = tol + m.fixed[fixed] = True + for arg, val in bounds.items(): + m.limits[arg] = val + m.simplex().simplex().migrad() + m.hesse() + cs = pgb.goodness_of_fit( + gof_hist, + gof_bins, + gof_var, + func.get_pdf, + m.values, + method="Pearson", + scale_bins=True, + ) + cs = (cs[0], cs[1] + len(np.where(mask)[0])) + valid3 = ( + m.valid + & (~np.isnan(np.array(m.errors)[mask]).any()) + & (~(np.array(m.errors)[mask] == 0).all()) + ) + if valid3 is False: + try: + m.minos() + valid3 = ( + m.valid + & (~np.isnan(np.array(m.errors)[mask]).any()) + & (~(np.array(m.errors)[mask] == 0).all()) ) + except Exception: raise RuntimeError - p_val = scipy.stats.chi2.sf(csqr[0], csqr[1] + len(np.where(mask)[0])) - - total_events = pgf.get_total_events_func(func_i, pars_i, errors=errs_i) - if ( - sum(sum(c) if c is not None else 0 for c in cov_i[mask, :][:, mask]) - == np.inf - or sum(sum(c) if c is not None else 0 for c in cov_i[mask, :][:, mask]) - == 0 - or np.isnan( - sum(sum(c) if c is not None else 0 for c in cov_i[mask, :][:, mask]) - ) - ): - log.debug( - f"hpge_fit_E_peaks: cov estimation failed for i_peak={i_peak} at loc {mode_guesses[i_peak]:g}" - ) - valid_pks[i_peak] = False + fit = (m.values, m.errors, m.covariance, cs, func, mask, valid3, m) - elif valid_fit == False: - log.debug( - f"hpge_fit_E_peaks: peak fitting failed for i_peak={i_peak} at loc {mode_guesses[i_peak]:g}" - ) - valid_pks[i_peak] = False + elif valid2 is False: + fit = fit1 - elif ( - np.abs(np.array(errs_i)[mask] / np.array(pars_i)[mask]) < 1e-7 - ).any() or np.isnan(np.array(errs_i)[mask]).any(): - log.debug( - f"hpge_fit_E_peaks: failed for i_peak={i_peak} at loc {mode_guesses[i_peak]:g}, parameter error too low" - ) - valid_pks[i_peak] = False + elif valid1 is False: + fit = fit2 - elif np.abs(total_events[0] - np.sum(hist)) / np.sum(hist) > 0.1: - log.debug( - f"hpge_fit_E_peaks: fit failed for i_peak={i_peak} at loc {mode_guesses[i_peak]:g}, total_events is outside limit" - ) - valid_pks[i_peak] = False + elif cs[0] * 1.05 < cs2[0]: + fit = fit1 - elif p_val < allowed_p_val or np.isnan(p_val): - log.debug( - f"hpge_fit_E_peaks: fit failed for i_peak={i_peak}, p-value too low: {p_val}" - ) - valid_pks[i_peak] = False - else: - valid_pks[i_peak] = True + elif cs2[0] * 1.05 < cs[0]: + fit = fit2 - except: - log.debug( - f"hpge_fit_E_peaks: fit failed for 
i_peak={i_peak}, unknown error" - ) - valid_pks[i_peak] = False - pars_i, errs_i, cov_i = return_nans(func_i) - p_val = 0 + elif frac_errors1 < frac_errors2: + fit = fit1 - # get binning - binw_1 = (bins[-1] - bins[0]) / (len(bins) - 1) + elif frac_errors1 > frac_errors2: + fit = fit2 - pars[i_peak] = pars_i - errors[i_peak] = errs_i - covs[i_peak] = cov_i - binws[i_peak] = binw_1 - ranges[i_peak] = [Euc_min, Euc_max] - p_vals[i_peak] = p_val - out_funcs[i_peak] = func_i + else: + raise RuntimeError + + if (func == pgf.hpge_peak) and allow_tail_drop is True: + p_val = chi2.sf(fit[3][0], fit[3][1]) + p_val_no_tail = chi2.sf(fit_no_tail[3][0], fit_no_tail[3][1]) + if fit[0]["htail"] < fit[1]["htail"] or p_val_no_tail > p_val: + debug_string = f'dropping tail tail val : {fit[0]["htail"]} tail err : {fit[1]["htail"]} ' + debug_string += f"p_val no tail: : {p_val_no_tail} p_val with tail: {p_val}" + log.debug(debug_string) + + if display > 0: + m_fit = pgf.gauss_on_step.get_pdf(bin_cs, *fit_no_tail[0]) + m_fit_tail = pgf.hpge_peak.get_pdf(bin_cs, *fit[0]) + plt.figure() + plt.step(bin_cs, hist, where="mid", label="hist") + plt.plot( + bin_cs, + m_fit * np.diff(bin_cs)[0], + label=f"Drop tail: {p_val_no_tail}", + ) + plt.plot( + bin_cs, + m_fit_tail * np.diff(bin_cs)[0], + label=f"Drop tail: {p_val}", + ) + plt.legend() + plt.show() - return (pars, errors, covs, binws, ranges, p_vals, valid_pks, out_funcs) + fit = fit_no_tail + return fit def poly_wrapper(x, *pars): - return pgf.poly(x, pars) + return pgf.nb_poly(x, np.array(pars)) -def hpge_fit_E_scale(mus, mu_vars, Es_keV, deg=0, fixed=None): +def hpge_fit_energy_scale(mus, mu_vars, energies_kev, deg=0, fixed=None): """Find best fit of poly(E) = mus +/- sqrt(mu_vars) - Compare to hpge_fit_E_cal_func which fits for E = poly(mu) + Compare to hpge_fit_energy_cal_func which fits for E = poly(mu) Parameters ---------- @@ -852,8 +2394,8 @@ def hpge_fit_E_scale(mus, mu_vars, Es_keV, deg=0, fixed=None): uncalibrated energies mu_vars : array variances in the mus - Es_keV : array - energies to fit to, in keV + energies_kev : array + energies to fit to, in kev deg : int degree for energy scale fit. deg=0 corresponds to a simple scaling mu = scale * E. Otherwise deg follows the definition in np.polyfit @@ -868,13 +2410,17 @@ def hpge_fit_E_scale(mus, mu_vars, Es_keV, deg=0, fixed=None): covariance matrix for the best fit parameters. """ if deg == 0: - scale, scale_cov = pgu.fit_simple_scaling(Es_keV, mus, var=mu_vars) - pars = np.array([scale, 0]) - cov = np.array([[scale_cov, 0], [0, 0]]) + scale, scale_cov = fit_simple_scaling(energies_kev, mus, var=mu_vars) + pars = np.array([0, scale]) + cov = np.array([[0, 0], [0, scale_cov]]) errs = np.diag(np.sqrt(cov)) else: - poly_pars = np.polyfit(Es_keV, mus, deg=deg, w=1 / np.sqrt(mu_vars)) - c = cost.LeastSquares(Es_keV, mus, np.sqrt(mu_vars), poly_wrapper) + poly_pars = ( + Polynomial.fit(energies_kev, mus, deg=deg, w=1 / np.sqrt(mu_vars)) + .convert() + .coef + ) + c = cost.LeastSquares(energies_kev, mus, np.sqrt(mu_vars), poly_wrapper) if fixed is not None: for idx, val in fixed.items(): if val is True or val is None: @@ -894,9 +2440,11 @@ def hpge_fit_E_scale(mus, mu_vars, Es_keV, deg=0, fixed=None): return pars, errs, cov -def hpge_fit_E_cal_func(mus, mu_vars, Es_keV, E_scale_pars, deg=0, fixed=None): +def hpge_fit_energy_cal_func( + mus, mu_vars, energies_kev, energy_scale_pars, deg=0, fixed=None +): """Find best fit of E = poly(mus +/- sqrt(mu_vars)) - This is an inversion of hpge_fit_E_scale. 
+ This is an inversion of hpge_fit_energy_scale. E uncertainties are computed from mu_vars / dmu/dE where mu = poly(E) is the E_scale function @@ -906,10 +2454,10 @@ def hpge_fit_E_cal_func(mus, mu_vars, Es_keV, E_scale_pars, deg=0, fixed=None): uncalibrated energies mu_vars : array variances in the mus - Es_keV : array - energies to fit to, in keV - E_scale_pars : array - Parameters from the escale fit (keV to ADC) used for calculating + energies_kev : array + energies to fit to, in kev + energy_scale_pars : array + Parameters from the escale fit (kev to ADC) used for calculating uncertainties deg : int degree for energy scale fit. deg=0 corresponds to a simple scaling @@ -926,24 +2474,26 @@ def hpge_fit_E_cal_func(mus, mu_vars, Es_keV, E_scale_pars, deg=0, fixed=None): covariance matrix for the best fit parameters. """ if deg == 0: - E_vars = mu_vars / E_scale_pars[0] ** 2 - scale, scale_cov = pgu.fit_simple_scaling(mus, Es_keV, var=E_vars) - pars = np.array([scale, 0]) - cov = np.array([[scale_cov, 0], [0, 0]]) + e_vars = mu_vars / energy_scale_pars[1] ** 2 + scale, scale_cov = fit_simple_scaling(mus, energies_kev, var=e_vars) + pars = np.array([0, scale]) + cov = np.array([[0, 0], [0, scale_cov]]) errs = np.diag(np.sqrt(cov)) else: - dmudEs = np.zeros(len(mus)) - for n in range(len(E_scale_pars) - 1): - dmudEs += E_scale_pars[n] * mus ** (len(E_scale_pars) - 2 - n) - E_weights = dmudEs * mu_vars - poly_pars = np.polyfit(mus, Es_keV, deg=deg, w=1 / E_weights) + d_mu_d_es = np.zeros(len(mus)) + for n in range(len(energy_scale_pars) - 1): + d_mu_d_es += energy_scale_pars[n + 1] * mus ** (n + 1) + e_weights = np.sqrt(d_mu_d_es * mu_vars) + poly_pars = ( + Polynomial.fit(mus, energies_kev, deg=deg, w=1 / e_weights).convert().coef + ) if fixed is not None: for idx, val in fixed.items(): if val is True or val is None: pass else: poly_pars[idx] = val - c = cost.LeastSquares(mus, Es_keV, E_weights, poly_wrapper) + c = cost.LeastSquares(mus, energies_kev, e_weights, poly_wrapper) m = Minuit(c, *poly_pars) if fixed is not None: for idx in list(fixed): @@ -957,322 +2507,7 @@ def hpge_fit_E_cal_func(mus, mu_vars, Es_keV, E_scale_pars, deg=0, fixed=None): return pars, errs, cov -def hpge_E_calibration( - E_uncal, - peaks_keV, - guess_keV, - deg=0, - uncal_is_int=False, - range_keV=None, - funcs=pgf.gauss_step_cdf, - gof_funcs=None, - method="unbinned", - gof_func=None, - n_events=None, - simplex=False, - allowed_p_val=0.05, - tail_weight=100, - verbose=True, -): - """Calibrate HPGe data to a set of known peaks - - Parameters - ---------- - E_uncal : array - unbinned energy data to be calibrated - peaks_keV : array - list of peak energies to be fit to. Each must be in the data - guess_keV : float - a rough initial guess at the conversion factor from E_uncal to keV. Must - be positive - deg : non-negative int - degree of the polynomial for the E_cal function E_keV = poly(E_uncal). - deg = 0 corresponds to a simple scaling E_keV = scale * E_uncal. 
- Otherwise follows the convention in np.polyfit - uncal_is_int : bool - if True, attempts will be made to avoid picket-fencing when binning - E_uncal - range_keV : float, tuple, array of floats, or array of tuples of floats - ranges around which the peak fitting is performed - if tuple(s) are supplied, they provide the left and right ranges - funcs - DOCME - gof_funcs : function or array of functions - functions to use for calculation goodness of fit if unspecified will use same func as fit - method : str - default is unbinned fit can specify to use binned fit method instead - gof_func - DOCME - n_events : int - number of events to use for unbinned fit - simplex : bool - DOCME - allowed_p_val - lower limit on p_val of fit - verbose : bool - print debug statements - - Returns - ------- - pars, cov : array, 2D array - array of calibration function parameters and their covariances. The form - of the function is E_keV = poly(E_uncal). Assumes poly() is - overwhelmingly dominated by the linear term. pars follows convention in - np.polyfit unless deg=0, in which case it is the (lone) scale factor - results : dict with the following elements - 'detected_peaks_locs', 'detected_peaks_keV' : array, array - array of rough uncalibrated/calibrated energies at which the fit peaks were - found in the initial peak search - 'pt_pars', 'pt_cov' : list of (array), list of (2D array) - arrays of gaussian parameters / covariances fit to the peak tops in - the first refinement - 'pt_cal_pars', 'pt_cal_cov' : array, 2D array - array of calibration parameters E_uncal = poly(E_keV) for fit to - means of gausses fit to tops of each peak - 'pk_pars', 'pk_cov', 'pk_binws', 'pk_ranges' : list of (array), list of (2D array), list, list of (array) - the best fit parameters, covariances, bin width and energy range for the local fit to each peak - 'pk_cal_pars', 'pk_cal_cov' : array, 2D array - array of calibration parameters E_uncal = poly(E_keV) for fit to - means from full peak fits - 'fwhms', 'dfwhms' : array, array - the numeric fwhms and their uncertainties for each peak. 
- """ - results = {} - - if not isinstance(range_keV, list): - range_keV = [range_keV for peak in peaks_keV] - - if not hasattr(funcs, "__len__"): - funcs = [funcs for peak in peaks_keV] - - # sanity checks - E_uncal = np.asarray(E_uncal) - peaks_keV = np.asarray(peaks_keV) # peaks_keV = np.sort(peaks_keV) - deg = int(deg) - if guess_keV <= 0: - log.error(f"hpge_E_cal warning: invalid guess_keV = {guess_keV}") - return None, None, results - if deg < 0: - log.error(f"hpge_E_cal warning: invalid deg = {deg}") - return None, None, results - - # bin the histogram in ~1 keV bins for the initial rough peak search - Euc_min = peaks_keV[0] / guess_keV * 0.6 - Euc_max = peaks_keV[-1] / guess_keV * 1.1 - dEuc = 1 / guess_keV - if uncal_is_int: - Euc_min, Euc_max, dEuc = pgh.better_int_binning( - x_lo=Euc_min, x_hi=Euc_max, dx=dEuc - ) - hist, bins, var = pgh.get_hist(E_uncal, range=(Euc_min, Euc_max), dx=dEuc) - - # Run the initial rough peak search - detected_peaks_locs, detected_peaks_keV, roughpars = hpge_find_E_peaks( - hist, bins, var, peaks_keV, n_sigma=5, deg=deg - ) - log.info(f"{len(detected_peaks_locs)} peaks found:") - log.info(f"\t Energy | Position ") - for i, (Li, Ei) in enumerate(zip(detected_peaks_locs, detected_peaks_keV)): - log.info(f"\t{i}".ljust(4) + str(Ei).ljust(9) + f"| {Li:g}".ljust(5)) - - # re-bin the histogram in ~0.2 keV bins with updated E scale par for peak-top fits - Euc_min, Euc_max = ( - (np.poly1d(roughpars) - i).roots - for i in (peaks_keV[0] * 0.9, peaks_keV[-1] * 1.1) - ) - Euc_min = Euc_min[np.logical_and(Euc_min >= 0, Euc_min <= max(Euc_max))][0] - Euc_max = Euc_max[ - np.logical_and(Euc_max >= Euc_min, Euc_max <= np.nanmax(E_uncal) * 1.1) - ][0] - dEuc = 0.2 / roughpars[-2] - - if uncal_is_int: - Euc_min, Euc_max, dEuc = pgh.better_int_binning( - x_lo=Euc_min, x_hi=Euc_max, dx=dEuc - ) - hist, bins, var = pgh.get_hist(E_uncal, range=(Euc_min, Euc_max), dx=dEuc) - - # run peak getter after rebinning - got_peaks_locs, got_peaks_keV, roughpars = hpge_get_E_peaks( - hist, bins, var, roughpars, peaks_keV, n_sigma=3 - ) - results["got_peaks_locs"] = got_peaks_locs - results["got_peaks_keV"] = got_peaks_keV - - log.info(f"{len(got_peaks_locs)} peaks obtained:") - log.info(f"\t Energy | Position ") - for i, (Li, Ei) in enumerate(zip(got_peaks_locs, got_peaks_keV)): - log.info(f"\t{i}".ljust(4) + str(Ei).ljust(9) + f"| {Li:g}".ljust(5)) - - # Drop non-gotten peaks - idx = [i for i, E in enumerate(peaks_keV) if E in got_peaks_keV] - range_keV = [range_keV[i] for i in idx] - funcs = [funcs[i] for i in idx] - gof_funcs = [gof_funcs[i] for i in idx] - - # Drop peaks to not be fitted - tmp = zip( - *[ - a - for a in zip(got_peaks_locs, got_peaks_keV, range_keV, funcs) - if a[2] and a[3] - ] - ) - got_peaks_locs, got_peaks_keV, range_keV, funcs = list(map(list, tmp)) - got_peaks_locs = np.asarray(got_peaks_locs) - got_peaks_keV = np.asarray(got_peaks_keV) - - # Now do a series of full fits to the peak shapes - - # First calculate range around peaks to fit - if range_keV is None: - # Need to do initial fit - pt_pars, pt_covs = hpge_fit_E_peak_tops( - hist, bins, var, got_peaks_locs, n_to_fit=7 - ) - # Drop failed fits - fitidx = [i is not None for i in pt_pars] - results["got_peaks_locs"] = got_peaks_locs = got_peaks_locs[fitidx] - results["got_peaks_keV"] = got_peaks_keV = got_peaks_keV[fitidx] - pt_pars = np.asarray(pt_pars)[fitidx] - pt_covs = np.asarray(pt_covs)[fitidx] - range_uncal = np.stack(pt_pars)[:, 1].astype(float) * 20 - n_bins = 50 - elif 
np.isscalar(range_keV): - derco = np.polyder(np.poly1d(roughpars)).coefficients - der = [pgf.poly(Ei, derco) for Ei in got_peaks_keV] - range_uncal = [float(range_keV) / d for d in der] - n_bins = [int(range_keV / 0.5 / d) for d in der] - elif isinstance(range_keV, tuple): - rangeleft_keV, rangeright_keV = range_keV - derco = np.polyder(np.poly1d(roughpars)).coefficients - der = [pgf.poly(Ei, derco) for Ei in got_peaks_keV] - range_uncal = [(rangeleft_keV / d, rangeright_keV / d) for d in der] - n_bins = [int(sum(range_keV) / 0.5 / d) for d in der] - elif isinstance(range_keV, list): - derco = np.polyder(np.poly1d(roughpars)).coefficients - der = [pgf.poly(Ei, derco) for Ei in got_peaks_keV] - range_uncal = [ - (r[0] / d, r[1] / d) if isinstance(r, tuple) else r / d - for r, d in zip(range_keV, der) - ] - n_bins = [ - int(sum(r) / 0.5 / d) if isinstance(r, tuple) else int(r / 0.2 / d) - for r, d in zip(range_keV, der) - ] - - ( - pk_pars, - pk_errors, - pk_covs, - pk_binws, - pk_ranges, - pk_pvals, - valid_pks, - pk_funcs, - ) = hpge_fit_E_peaks( - E_uncal, - got_peaks_locs, - range_uncal, - n_bins=n_bins, - funcs=funcs, - method=method, - gof_funcs=gof_funcs, - n_events=n_events, - uncal_is_int=False, - simplex=simplex, - allowed_p_val=allowed_p_val, - tail_weight=tail_weight, - ) - results["pk_pars"] = pk_pars - results["pk_errors"] = pk_errors - results["pk_covs"] = pk_covs - results["pk_binws"] = pk_binws - results["pk_ranges"] = pk_ranges - results["pk_pvals"] = pk_pvals - results["pk_validities"] = valid_pks - results["pk_funcs"] = pk_funcs - # Drop failed fits - fitidx = [i == True for i in valid_pks] - fitted_peaks_keV = results["fitted_keV"] = got_peaks_keV[fitidx] - funcs = [f for i, f in zip(fitidx, funcs) if i] - pk_pars = np.asarray(pk_pars, dtype=object)[fitidx] # ragged - pk_errors = np.asarray(pk_errors, dtype=object)[fitidx] - pk_covs = np.asarray(pk_covs, dtype=object)[fitidx] - pk_binws = np.asarray(pk_binws)[fitidx] - pk_ranges = np.asarray(pk_ranges)[fitidx] - pk_pvals = np.asarray(pk_pvals)[fitidx] - pk_funcs = np.asarray(pk_funcs)[fitidx] - log.info(f"{sum(fitidx)} peaks fitted:") - for i, (Ei, parsi, errorsi, covsi, func_i) in enumerate( - zip(fitted_peaks_keV, pk_pars, pk_errors, pk_covs, pk_funcs) - ): - varnames = func_i.__code__.co_varnames[1 : len(pk_pars[-1]) + 1] - parsi = np.asarray(parsi, dtype=float) - errorsi = np.asarray(errorsi, dtype=float) - covsi = np.asarray(covsi, dtype=float) - # parsigsi = np.sqrt(covsi.diagonal()) - log.info(f"\tEnergy: {str(Ei)}") - log.info(f"\t\tParameter | Value +/- Sigma ") - for vari, pari, errorsi in zip(varnames, parsi, errorsi): - log.info( - f'\t\t{str(vari).ljust(10)} | {("%4.2f" % pari).rjust(8)} +/- {("%4.2f" % errorsi).ljust(8)}' - ) - # fwhm?? 
- - # Do a second calibration to the results of the full peak fits - mus = [ - pgf.get_mu_func(func_i, pars_i, errors=errors_i) - for func_i, pars_i, errors_i in zip(pk_funcs, pk_pars, pk_errors) - ] - mus, mu_vars = zip(*mus) - mus = np.asarray(mus) - mu_vars = np.asarray(mu_vars) ** 2 - - try: - pars, errs, cov = hpge_fit_E_scale(mus, mu_vars, fitted_peaks_keV, deg=deg) - results["pk_cal_pars"] = pars - results["pk_cal_errs"] = errs - results["pk_cal_cov"] = cov - except ValueError: - log.error("Failed to fit enough peaks to get accurate calibration") - return None, None, None, results - - # Invert the E scale fit to get a calibration function - pars, errs, cov = hpge_fit_E_cal_func(mus, mu_vars, fitted_peaks_keV, pars, deg=deg) - - # Finally, calculate fwhms in keV - uncal_fwhms = [ - pgf.get_fwhm_func(func_i, pars_i, cov=covs_i) - for func_i, pars_i, covs_i in zip(pk_funcs, pk_pars, pk_covs) - ] - uncal_fwhms, uncal_fwhm_errs = zip(*uncal_fwhms) - uncal_fwhms = np.asarray(uncal_fwhms) - uncal_fwhm_errs = np.asarray(uncal_fwhm_errs) - derco = np.polyder(np.poly1d(pars)).coefficients - der = [pgf.poly(Ei, derco) for Ei in fitted_peaks_keV] - - cal_fwhms = uncal_fwhms * der - cal_fwhms_errs = uncal_fwhm_errs * der - results["pk_fwhms"] = np.asarray( - [(u * d, e * d) for u, e, d in zip(uncal_fwhms, uncal_fwhm_errs, der)] - ) - - log.info(f"{len(cal_fwhms)} FWHMs found:") - log.info(f"\t Energy | FWHM ") - for i, (Ei, fwhm, fwhme) in enumerate( - zip(fitted_peaks_keV, cal_fwhms, cal_fwhms_errs) - ): - log.info( - f"\t{i}".ljust(4) - + str(Ei).ljust(9) - + f"| {fwhm:.2f}+-{fwhme:.2f} keV".ljust(5) - ) - - return pars, cov, results - - -def poly_match(xx, yy, deg=-1, rtol=1e-5, atol=1e-8): +def poly_match(xx, yy, deg=-1, rtol=1e-5, atol=1e-8, fixed=None): """Find the polynomial function best matching pol(xx) = yy Finds the poly fit of xx to yy that obtains the most matches between pol(xx) @@ -1343,20 +2578,31 @@ def poly_match(xx, yy, deg=-1, rtol=1e-5, atol=1e-8): # simple shift if deg == -1: - pars_i = np.array([1, (np.sum(yy_i) - np.sum(xx_i)) / len(yy_i)]) - polxx = xx_i + pars_i[1] + pars_i = np.array([(np.sum(yy_i) - np.sum(xx_i)) / len(yy_i), 1]) + polxx = xx_i + pars_i[0] # simple scaling elif deg == 0: - pars_i = np.array([np.sum(yy_i * xx_i) / np.sum(xx_i * xx_i), 0]) - polxx = pars_i[0] * xx_i + pars_i = np.array([0, np.sum(yy_i * xx_i) / np.sum(xx_i * xx_i)]) + polxx = pars_i[1] * xx_i # generic poly of degree >= 1 else: - pars_i = np.polyfit(xx_i, yy_i, deg) + poly_pars = Polynomial.fit(xx_i, yy_i, deg=deg).convert().coef + c = cost.LeastSquares(xx_i, yy_i, np.full_like(yy_i, 1), poly_wrapper) + if fixed is not None: + for idx, val in fixed.items(): + if val is True or val is None: + pass + else: + poly_pars[idx] = val + m = Minuit(c, *poly_pars) + if fixed is not None: + for idx in list(fixed): + m.fixed[idx] = True + pars_i = np.array(m.values) polxx = np.zeros(len(yy_i)) - xxn = np.ones(len(yy_i)) - polxx = pgf.poly(xx_i, pars_i) + polxx = pgf.nb_poly(xx_i, pars_i) # by here we have the best polxx. Search for matches and store pars_i if # its the best so far @@ -1418,371 +2664,3 @@ def poly_match(xx, yy, deg=-1, rtol=1e-5, atol=1e-8): gof = np.inf return pars, best_ixtup, best_iytup - - -def get_i_local_extrema(data, delta): - """Get lists of indices of the local maxima and minima of data - - The "local" extrema are those maxima / minima that have heights / depths of - at least delta. 
- Converted from MATLAB script at: http://billauer.co.il/peakdet.html - - Parameters - ---------- - data : array-like - the array of data within which extrema will be found - delta : scalar - the absolute level by which data must vary (in one direction) about an - extremum in order for it to be tagged - - Returns - ------- - imaxes, imins : 2-tuple ( array, array ) - A 2-tuple containing arrays of variable length that hold the indices of - the identified local maxima (first tuple element) and minima (second - tuple element) - """ - - # prepare output - imaxes, imins = [], [] - - # sanity checks - data = np.asarray(data) - if not np.isscalar(delta): - log.error("get_i_local_extrema: Input argument delta must be a scalar") - return np.array(imaxes), np.array(imins) - if delta <= 0: - log.error(f"get_i_local_extrema: delta ({delta}) must be positive") - return np.array(imaxes), np.array(imins) - - # now loop over data - imax, imin = 0, 0 - find_max = True - for i in range(len(data)): - if data[i] > data[imax]: - imax = i - if data[i] < data[imin]: - imin = i - - if find_max: - # if the sample is less than the current max by more than delta, - # declare the previous one a maximum, then set this as the new "min" - if data[i] < data[imax] - delta: - imaxes.append(imax) - imin = i - find_max = False - else: - # if the sample is more than the current min by more than delta, - # declare the previous one a minimum, then set this as the new "max" - if data[i] > data[imin] + delta: - imins.append(imin) - imax = i - find_max = True - - return np.array(imaxes), np.array(imins) - - -def get_i_local_maxima(data, delta): - return get_i_local_extrema(data, delta)[0] - - -def get_i_local_minima(data, delta): - return get_i_local_extrema(data, delta)[1] - - -def get_most_prominent_peaks( - energySeries, xlo, xhi, xpb, max_num_peaks=np.inf, test=False -): - """find the most prominent peaks in a spectrum by looking for spikes in - derivative of spectrum energySeries: array of measured energies - max_num_peaks = maximum number of most prominent peaks to find return a - histogram around the most prominent peak in a spectrum of a given - percentage of width - """ - nb = int((xhi - xlo) / xpb) - hist, bin_edges = np.histogram(energySeries, range=(xlo, xhi), bins=nb) - bin_centers = pgh.get_bin_centers(bin_edges) - - # median filter along the spectrum, do this as a "baseline subtraction" - hist_med = medfilt(hist, 21) - hist = hist - hist_med - - # identify peaks with a scipy function (could be improved ...) - peak_idxs = find_peaks_cwt(hist, np.arange(1, 6, 0.1), min_snr=5) - peak_energies = bin_centers[peak_idxs] - - # pick the num_peaks most prominent peaks - if max_num_peaks < len(peak_energies): - peak_vals = hist[peak_idxs] - sort_idxs = np.argsort(peak_vals) - peak_idxs_max = peak_idxs[sort_idxs[-max_num_peaks:]] - peak_energies = np.sort(bin_centers[peak_idxs_max]) - - if test: - plt.plot(bin_centers, hist, ls="steps", lw=1, c="b") - for e in peak_energies: - plt.axvline(e, color="r", lw=1, alpha=0.6) - plt.xlabel("Energy [uncal]", ha="right", x=1) - plt.ylabel("Filtered Spectrum", ha="right", y=1) - plt.tight_layout() - plt.show() - exit() - - return peak_energies - - -def match_peaks(data_pks, cal_pks): - """ - Match uncalibrated peaks with literature energy values. 
- """ - from itertools import combinations - - from scipy.stats import linregress - - n_pks = len(cal_pks) if len(cal_pks) < len(data_pks) else len(data_pks) - - cal_sets = combinations(range(len(cal_pks)), n_pks) - data_sets = combinations(range(len(data_pks)), n_pks) - - best_err, best_m, best_b = np.inf, None, None - for i, cal_set in enumerate(cal_sets): - cal = cal_pks[list(cal_set)] # lit energies for this set - - for data_set in data_sets: - data = data_pks[list(data_set)] # uncal energies for this set - - m, b, _, _, _ = linregress(data, y=cal) - err = np.sum((cal - (m * data + b)) ** 2) - - if err < best_err: - best_err, best_m, best_b = err, m, b - - print(i, best_err) - print("cal:", cal) - print("data:", data) - plt.scatter(data, cal, label=f"min.err:{err:.2e}") - xs = np.linspace(data[0], data[-1], 10) - plt.plot( - xs, best_m * xs + best_b, c="r", label=f"y = {best_m:.2f} x + {best_b:.2f}" - ) - plt.xlabel("Energy (uncal)", ha="right", x=1) - plt.ylabel("Energy (keV)", ha="right", y=1) - plt.legend() - plt.tight_layout() - plt.show() - exit() - - return best_m, best_b - - -def calibrate_tl208(energy_series, cal_peaks=None, plotFigure=None): - """ - energy_series: array of energies we want to calibrate - cal_peaks: array of peaks to fit - 1.) we find the 2614 peak by looking for the tallest peak at >0.1 the max adc value - 2.) fit that peak to get a rough guess at a calibration to find other peaks with - 3.) fit each peak in peak_energies - 4.) do a linear fit to the peak centroids to find a calibration - """ - - if cal_peaks is None: - cal_peaks = np.array( - [238.632, 510.770, 583.191, 727.330, 860.564, 2614.553] - ) # get_calibration_energies(peak_energies) - else: - cal_peaks = np.array(cal_peaks) - - if len(energy_series) < 100: - return 1, 0 - - # get 10 most prominent ~high e peaks - max_adc = np.amax(energy_series) - energy_hi = energy_series # [ (energy_series > np.percentile(energy_series, 20)) & (energy_series < np.percentile(energy_series, 99.9))] - - peak_energies, peak_e_err = get_most_prominent_peaks( - energy_hi, - ) - rough_kev_per_adc, rough_kev_offset = match_peaks(peak_energies, cal_peaks) - e_cal_rough = rough_kev_per_adc * energy_series + rough_kev_offset - - # return rough_kev_per_adc, rough_kev_offset - # print(energy_series) - # plt.ion() - # plt.figure() - # # for peak in cal_peaks: - # # plt.axvline(peak, c="r", ls=":") - # # energy_series.hist() - # # for peak in peak_energies: - # # plt.axvline(peak, c="r", ls=":") - # # - # plt.hist(energy_series) - # # plt.hist(e_cal_rough[e_cal_rough>100], bins=2700) - # val = input("do i exist?") - # exit() - - ############################################### - # Do a real fit to every peak in peak_energies - ############################################### - max_adc = np.amax(energy_series) - - peak_num = len(cal_peaks) - centers = np.zeros(peak_num) - fit_result_map = {} - bin_size = 0.2 # keV - - if plotFigure is not None: - plot_map = {} - - for i, energy in enumerate(cal_peaks): - window_width = 10 # keV - window_width_in_adc = (window_width) / rough_kev_per_adc - energy_in_adc = (energy - rough_kev_offset) / rough_kev_per_adc - bin_size_adc = (bin_size) / rough_kev_per_adc - - peak_vals = energy_series[ - (energy_series > energy_in_adc - window_width_in_adc) - & (energy_series < energy_in_adc + window_width_in_adc) - ] - - peak_hist, bins = np.histogram( - peak_vals, - bins=np.arange( - energy_in_adc - window_width_in_adc, - energy_in_adc + window_width_in_adc + bin_size_adc, - bin_size_adc, - ), - ) - 
bin_centers = pgh.get_bin_centers(bins) - # plt.ion() - # plt.figure() - # plt.plot(bin_centers,peak_hist, color="k", ls="steps") - - # inp = input("q to quit...") - # if inp == "q": exit() - - try: - guess_e, guess_sigma, guess_area = get_gaussian_guess( - peak_hist, bin_centers - ) - except IndexError: - print(f"\n\nIt looks like there may not be a peak at {energy} keV") - print("Here is a plot of the area I'm searching for a peak...") - plt.ion() - plt.figure(figsize=(12, 6)) - plt.subplot(121) - plt.plot(bin_centers, peak_hist, color="k", ls="steps") - plt.subplot(122) - plt.hist(e_cal_rough, bins=2700, histtype="step") - input("-->press any key to continue...") - sys.exit() - - plt.plot( - bin_centers, gauss(bin_centers, guess_e, guess_sigma, guess_area), color="b" - ) - - # inp = input("q to quit...") - # if inp == "q": exit() - - bounds = ( - [0.9 * guess_e, 0.5 * guess_sigma, 0, 0, 0, 0, 0], - [ - 1.1 * guess_e, - 2 * guess_sigma, - 0.1, - 0.75, - window_width_in_adc, - 10, - 5 * guess_area, - ], - ) - params = fit_binned( - radford_peak, - peak_hist, - bin_centers, - [guess_e, guess_sigma, 1e-3, 0.7, 5, 0, guess_area], - ) # bounds=bounds) - - plt.plot(bin_centers, radford_peak(bin_centers, *params), color="r") - - # inp = input("q to quit...") - # if inp == "q": exit() - - fit_result_map[energy] = params - centers[i] = params[0] - - if plotFigure is not None: - plot_map[energy] = (bin_centers, peak_hist) - - # Do a linear fit to find the calibration - linear_cal = np.polyfit(centers, cal_peaks, deg=1) - - if plotFigure is not None: - plt.figure(plotFigure.number) - plt.clf() - - grid = gs.GridSpec(peak_num, 3) - ax_line = plt.subplot(grid[:, 1]) - ax_spec = plt.subplot(grid[:, 2]) - - for i, energy in enumerate(cal_peaks): - ax_peak = plt.subplot(grid[i, 0]) - bin_centers, peak_hist = plot_map[energy] - params = fit_result_map[energy] - ax_peak.plot( - bin_centers * rough_kev_per_adc + rough_kev_offset, - peak_hist, - ls="steps-mid", - color="k", - ) - fit = radford_peak(bin_centers, *params) - ax_peak.plot( - bin_centers * rough_kev_per_adc + rough_kev_offset, fit, color="b" - ) - - ax_peak.set_xlabel("Energy [keV]") - - ax_line.scatter( - centers, - cal_peaks, - ) - - x = np.arange(0, max_adc, 1) - ax_line.plot(x, linear_cal[0] * x + linear_cal[1]) - ax_line.set_xlabel("ADC") - ax_line.set_ylabel("Energy [keV]") - - energies_cal = energy_series * linear_cal[0] + linear_cal[1] - peak_hist, bins = np.histogram(energies_cal, bins=np.arange(0, 2700)) - ax_spec.semilogy(pgh.get_bin_centers(bins), peak_hist, ls="steps-mid") - ax_spec.set_xlabel("Energy [keV]") - - return linear_cal - - -def get_calibration_energies(cal_type): - if cal_type == "th228": - return np.array( - [ - 238, - 277, - 300, - 452, - 510.77, - 583.191, - 727, - 763, - 785, - 860.564, - 1620, - 2614.533, - ], - dtype="double", - ) - - elif cal_type == "uwmjlab": - # return np.array([239, 295, 351, 510, 583, 609, 911, 969, 1120, - # 1258, 1378, 1401, 1460, 1588, 1764, 2204, 2615], - # dtype="double") - return np.array([239, 911, 1460, 1764, 2615], dtype="double") - else: - raise ValueError diff --git a/src/pygama/pargen/energy_optimisation.py b/src/pygama/pargen/energy_optimisation.py index 905d126f0..41f3d9d83 100644 --- a/src/pygama/pargen/energy_optimisation.py +++ b/src/pygama/pargen/energy_optimisation.py @@ -5,217 +5,56 @@ to provide the best energy resolution at Qbb """ -import json import logging -import os -import pathlib -import pickle as pkl -import sys -from collections import namedtuple import lgdo.lh5 
as lh5 -import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np -import pandas as pd -from iminuit import Minuit, cost, util -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib.colors import LogNorm -from scipy.optimize import curve_fit, minimize -from scipy.stats import chisquare, norm -from sklearn.exceptions import ConvergenceWarning -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF, ConstantKernel -from sklearn.utils._testing import ignore_warnings +import pygama.math.distributions as pgd import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -import pygama.pargen.cuts as cts -import pygama.pargen.dsp_optimize as opt import pygama.pargen.energy_cal as pgc +from pygama.pargen.utils import convert_to_minuit, return_nans log = logging.getLogger(__name__) sto = lh5.LH5Store() -def run_optimisation( - file, - opt_config, - dsp_config, - cuts, - fom, - db_dict=None, - n_events=8000, - wf_field="waveform", - **fom_kwargs, -): - """ - Runs optimisation on .lh5 file - - Parameters - ---------- - file: string - path to raw .lh5 file - opt_config: str - path to JSON dictionary to configure optimisation - dsp_config: str - path to JSON dictionary specifying dsp configuration - fom: function - When given the output lh5 table of a DSP iteration, the - fom_function must return a scalar figure-of-merit value upon which the - optimization will be based. Should accept verbosity as a second argument - db_dict: dict - Dictionary specifying any values to put in processing chain e.g. pz constant - n_events : int - Number of events to run over - """ - grid = set_par_space(opt_config) - waveforms = sto.read(f"/raw/{wf_field}", file, idx=cuts, n_rows=n_events)[0] - baseline = sto.read("/raw/baseline", file, idx=cuts, n_rows=n_events)[0] - tb_data = lh5.Table(col_dict={f"{wf_field}": waveforms, "baseline": baseline}) - return opt.run_grid(tb_data, dsp_config, grid, fom, db_dict, **fom_kwargs) - - -def run_optimisation_multiprocessed( - file, - opt_config, - dsp_config, - cuts, - lh5_path, - fom=None, - db_dict=None, - processes=5, - n_events=8000, - **fom_kwargs, -): - """ - Runs optimisation on .lh5 file, this version multiprocesses the grid points, it also can handle multiple grids being passed - as long as they are the same dimensions. - - Parameters - ---------- - file: string - path to raw .lh5 file - opt_config: str - path to JSON dictionary to configure optimisation - dsp_config: str - path to JSON dictionary specifying dsp configuration - fom: function - When given the output lh5 table of a DSP iteration, the - fom_function must return a scalar figure-of-merit value upon which the - optimization will be based. Should accept verbosity as a second argument - n_events : int - Number of events to run over - db_dict: dict - Dictionary specifying any values to put in processing chain e.g. 
pz constant - processes : int - Number of separate processes to run for the multiprocessing - """ - - def form_dict(in_dict, length): - keys = list(in_dict.keys()) - out_list = [] - for i in range(length): - out_list.append({keys[0]: 0}) - for key in keys: - if isinstance(in_dict[key], list): - if len(in_dict[key]) == length: - for i in range(length): - out_list[i][key] = in_dict[key][i] - else: - for i in range(length): - out_list[i][key] = in_dict[key] - else: - for i in range(length): - out_list[i][key] = in_dict[key] - return out_list - - if not isinstance(opt_config, list): - opt_config = [opt_config] - grid = [] - for i, opt_conf in enumerate(opt_config): - grid.append(set_par_space(opt_conf)) - if fom_kwargs: - if "fom_kwargs" in fom_kwargs: - fom_kwargs = fom_kwargs["fom_kwargs"] - fom_kwargs = form_dict(fom_kwargs, len(grid)) - sto = lh5.LH5Store() - waveforms = sto.read(f"{lh5_path}/{wf_field}", file, idx=cuts, n_rows=n_events)[0] - baseline = sto.read(f"{lh5_path}/baseline", file, idx=cuts, n_rows=n_events)[0] - tb_data = lh5.Table(col_dict={f"{wf_field}": waveforms, "baseline": baseline}) - return opt.run_grid_multiprocess_parallel( - tb_data, - dsp_config, - grid, - fom, - db_dict=db_dict, - processes=processes, - fom_kwargs=fom_kwargs, - ) - - -def set_par_space(opt_config): +def simple_guess(energy, func, fit_range=None, bin_width=None): """ - Generates grid for optimizer from dictionary of form {param : {start: , end: , spacing: }} - """ - par_space = opt.ParGrid() - for name in opt_config.keys(): - p_values = opt_config[name] - for param in p_values.keys(): - str_vals = set_values(p_values[param]) - par_space.add_dimension(name, param, str_vals) - return par_space - - -def set_values(par_values): - """ - Finds values for grid + Simple guess for peak fitting """ - string_values = np.arange( - par_values["start"], par_values["end"], par_values["spacing"] - ) - try: - string_values = [f'{val:.4f}*{par_values["unit"]}' for val in string_values] - except: - string_values = [f"{val:.4f}" for val in string_values] - return string_values + if fit_range is None: + fit_range = (np.nanmin(energy), np.nanmax(energy)) + energy = energy[(energy >= fit_range[0]) & (energy <= fit_range[1])] + if bin_width is None: + init_bin_width = ( + 2 + * (np.nanpercentile(energy, 75) - np.nanpercentile(energy, 25)) + * len(energy) ** (-1 / 3) + ) + init_hist, init_bins, _ = pgh.get_hist( + energy, dx=init_bin_width, range=fit_range + ) + try: + _, init_sigma, _ = pgh.get_gaussian_guess(init_hist, init_bins) + except IndexError: + init_hist, init_bins, _ = pgh.get_hist( + energy, dx=init_bin_width / 2, range=fit_range + ) + try: + _, init_sigma, _ = pgh.get_gaussian_guess(init_hist, init_bins) + except IndexError: + init_sigma = np.nanstd(energy) + bin_width = (init_sigma) * len(energy) ** (-1 / 3) -def simple_guess(hist, bins, var, func_i, fit_range): - """ - Simple guess for peak fitting - """ - if func_i == pgf.extended_radford_pdf: - bin_cs = (bins[1:] + bins[:-1]) / 2 - _, sigma, amp = pgh.get_gaussian_guess(hist, bins) - i_0 = np.nanargmax(hist) - mu = bin_cs[i_0] - height = hist[i_0] - bg0 = np.mean(hist[-10:]) - step = np.mean(hist[:10]) - bg0 - htail = 1.0 / 5 - tau = 0.5 * sigma + hist, bins, var = pgh.get_hist(energy, dx=bin_width, range=fit_range) - hstep = step / (bg0 + np.mean(hist[:10])) - dx = np.diff(bins)[0] - n_bins_range = int((4 * sigma) // dx) - nsig_guess = np.sum(hist[i_0 - n_bins_range : i_0 + n_bins_range]) - nbkg_guess = np.sum(hist) - nsig_guess - parguess = [ - nsig_guess, 
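
The rewritten `simple_guess` no longer hard-codes a 1-ADC bin width: it derives a first-pass width from the interquartile range, `2·IQR·n^(-1/3)` (a Freedman–Diaconis-style rule), and only then refines it with the fitted sigma. A minimal, self-contained sketch of that starting rule on synthetic data (all values illustrative, not part of pygama):

```python
import numpy as np

rng = np.random.default_rng(0)
energy = rng.normal(2614.5, 1.2, size=5000)  # synthetic peak data, keV (illustrative)

# Freedman-Diaconis-style starting bin width, as used by the new simple_guess
iqr = np.nanpercentile(energy, 75) - np.nanpercentile(energy, 25)
init_bin_width = 2 * iqr * len(energy) ** (-1 / 3)

hist, edges = np.histogram(
    energy, bins=np.arange(energy.min(), energy.max() + init_bin_width, init_bin_width)
)
print(f"initial bin width: {init_bin_width:.3f} keV over {len(hist)} bins")
```
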
- mu, - sigma, - htail, - tau, - nbkg_guess, - hstep, - fit_range[0], - fit_range[1], - 0, - ] # - return parguess + # make binning dynamic based on max, % of events/ n of events? + hist, bins, var = pgh.get_hist(energy, range=fit_range, dx=bin_width) - elif func_i == pgf.extended_gauss_step_pdf: + if func == pgd.hpge_peak or func == pgd.gauss_on_step: mu, sigma, amp = pgh.get_gaussian_guess(hist, bins) i_0 = np.argmax(hist) bg = np.mean(hist[-10:]) @@ -223,195 +62,44 @@ def simple_guess(hist, bins, var, func_i, fit_range): hstep = step / (bg + np.mean(hist[:10])) dx = np.diff(bins)[0] n_bins_range = int((4 * sigma) // dx) - nsig_guess = np.sum(hist[i_0 - n_bins_range : i_0 + n_bins_range]) - nbkg_guess = np.sum(hist) - nsig_guess - return [nsig_guess, mu, sigma, nbkg_guess, hstep, fit_range[0], fit_range[1], 0] - - -def unbinned_energy_fit( - energy, - func, - gof_func, - gof_range, - fit_range=(np.inf, np.inf), - guess=None, - tol=None, - verbose=False, - display=0, -): - """ - Unbinned fit to energy. This is different to the default fitting as - it will try different fitting methods and choose the best. This is necessary for the lower statistics. - """ - - bin_width = 1 - lower_bound = (np.nanmin(energy) // bin_width) * bin_width - upper_bound = ((np.nanmax(energy) // bin_width) + 1) * bin_width - hist1, bins, var = pgh.get_hist( - energy, dx=bin_width, range=(lower_bound, upper_bound) - ) - bin_cs1 = (bins[:-1] + bins[1:]) / 2 - if guess is not None: - x0 = [*guess[:-2], fit_range[0], fit_range[1], False] - else: - if func == pgf.extended_radford_pdf: - x0 = simple_guess(hist1, bins, var, pgf.extended_gauss_step_pdf, fit_range) - if verbose: - print(x0) - c = cost.ExtendedUnbinnedNLL(energy, pgf.extended_gauss_step_pdf) - m = Minuit(c, *x0) - m.fixed[-3:] = True - m.simplex().migrad() - m.hesse() - if guess is not None: - x0_rad = [*guess[:-2], fit_range[0], fit_range[1], False] - else: - x0_rad = simple_guess(hist1, bins, var, func, fit_range) - x0 = m.values[:3] - x0 += x0_rad[3:5] - x0 += m.values[3:] - else: - x0 = simple_guess(hist1, bins, var, func, fit_range) - if verbose: - print(x0) - c = cost.ExtendedUnbinnedNLL(energy, func) - m = Minuit(c, *x0) - if tol is not None: - m.tol = tol - m.fixed[-3:] = True - m.migrad() - m.hesse() - - hist, bins, var = pgh.get_hist(energy, dx=1, range=gof_range) - bin_cs = (bins[:-1] + bins[1:]) / 2 - m_fit = func(bin_cs1, *m.values)[1] - - valid1 = ( - m.valid - # & m.accurate - & (~np.isnan(m.errors).any()) - & (~(np.array(m.errors[:-3]) == 0).all()) - ) - - cs = pgf.goodness_of_fit( - hist, bins, None, gof_func, m.values[:-3], method="Pearson" - ) - cs = cs[0] / cs[1] - m2 = Minuit(c, *x0) - if tol is not None: - m2.tol = tol - m2.fixed[-3:] = True - m2.simplex().migrad() - m2.hesse() - m2_fit = func(bin_cs1, *m2.values)[1] - valid2 = ( - m2.valid - # & m2.accurate - & (~np.isnan(m.errors).any()) - & (~(np.array(m2.errors[:-3]) == 0).all()) - ) - - cs2 = pgf.goodness_of_fit( - hist, bins, None, gof_func, m2.values[:-3], method="Pearson" - ) - cs2 = cs2[0] / cs2[1] - - frac_errors1 = np.sum(np.abs(np.array(m.errors)[:-3] / np.array(m.values)[:-3])) - frac_errors2 = np.sum(np.abs(np.array(m2.errors)[:-3] / np.array(m2.values)[:-3])) - - if verbose: - print(m) - print(m2) - print(frac_errors1, frac_errors2) - - if display > 1: - m_fit = gof_func(bin_cs1, *m.values) - m2_fit = gof_func(bin_cs1, *m2.values) - plt.figure() - plt.plot(bin_cs1, hist1, label=f"hist") - plt.plot(bin_cs1, func(bin_cs1, *x0)[1], label=f"Guess") - plt.plot(bin_cs1, m_fit, 
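
The refactored `get_peak_fwhm_with_dt_corr` keeps the same linear charge-trapping correction before histogramming and fitting: the correction term is the product `alpha * dt * E`, i.e. `E_corr = E * (1 + alpha * dt)`. A short numpy sketch with illustrative numbers (the array contents and the alpha value are assumptions, only the formula comes from the diff):

```python
import numpy as np

rng = np.random.default_rng(1)
energies = rng.normal(20000.0, 15.0, size=1000)  # uncalibrated peak energies (ADC), illustrative
dt = rng.uniform(0.0, 800.0, size=1000)          # drift-time parameter, e.g. "dt_eff" (illustrative)

alpha = 1.25e-06                                 # one trial charge-trapping constant
correction = alpha * dt * energies               # same product as in the diff
ct_energy = energies + correction                # E_corr = E * (1 + alpha * dt)

print(f"mean correction: {correction.mean():.2f} ADC")
```
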
label=f"Fit 1: {cs}") - plt.plot(bin_cs1, m2_fit, label=f"Fit 2: {cs2}") - plt.legend() - plt.show() - - if valid1 == False and valid2 == False: - log.debug("Extra simplex needed") - m = Minuit(c, *x0) - if tol is not None: - m.tol = tol - m.fixed[-3:] = True - m.limits = pgc.get_hpge_E_bounds(func) - m.simplex().simplex().migrad() - m.hesse() - if verbose: - print(m) - cs = pgf.goodness_of_fit( - hist, bins, None, gof_func, m.values[:-3], method="Pearson" - ) - cs = cs[0] / cs[1] - valid3 = ( - m.valid - # & m.accurate - & (~np.isnan(m.errors).any()) - & (~(np.array(m.errors[:-3]) == 0).all()) - ) - if valid3 is False: - try: - m.minos() - valid3 = ( - m.valid - & (~np.isnan(m.errors).any()) - & (~(np.array(m.errors[:-3]) == 0).all()) - ) - except: - raise RuntimeError - - pars = np.array(m.values)[:-1] - errs = np.array(m.errors)[:-1] - cov = np.array(m.covariance)[:-1, :-1] - csqr = cs - - elif valid2 == False or cs * 1.05 < cs2: - pars = np.array(m.values)[:-1] - errs = np.array(m.errors)[:-3] - cov = np.array(m.covariance)[:-1, :-1] - csqr = cs - - elif valid1 == False or cs2 * 1.05 < cs: - pars = np.array(m2.values)[:-1] - errs = np.array(m2.errors)[:-3] - cov = np.array(m2.covariance)[:-1, :-1] - csqr = cs2 - - elif frac_errors1 < frac_errors2: - pars = np.array(m.values)[:-1] - errs = np.array(m.errors)[:-3] - cov = np.array(m.covariance)[:-1, :-1] - csqr = cs + nsig = np.sum(hist[i_0 - n_bins_range : i_0 + n_bins_range]) + nbkg = np.sum(hist) - nsig + + parguess = { + "n_sig": nsig, + "mu": mu, + "sigma": sigma, + "n_bkg": nbkg, + "hstep": hstep, + "x_lo": fit_range[0], + "x_hi": fit_range[1], + } - elif frac_errors1 > frac_errors2: - pars = np.array(m2.values)[:-1] - errs = np.array(m2.errors)[:-3] - cov = np.array(m2.covariance)[:-1, :-1] - csqr = cs2 + if func == pgd.hpge_peak: + htail = 1.0 / 5 + tau = 0.5 * sigma + parguess["htail"] = htail + parguess["tau"] = tau else: - raise RuntimeError + log.error(f"simple_guess not implemented for {func.__name__}") + return return_nans(func) - return pars, errs, cov, csqr + return convert_to_minuit(parguess, func).values def get_peak_fwhm_with_dt_corr( - Energies, + energies, alpha, dt, func, - gof_func, peak, kev_width, guess=None, kev=False, + frac_max=0.5, + bin_width=1, + allow_tail_drop=False, display=0, ): """ @@ -420,11 +108,10 @@ def get_peak_fwhm_with_dt_corr( """ correction = np.multiply( - np.multiply(alpha, dt, dtype="float64"), Energies, dtype="float64" + np.multiply(alpha, dt, dtype="float64"), energies, dtype="float64" ) - ct_energy = np.add(correction, Energies) + ct_energy = np.add(correction, energies) - bin_width = 1 lower_bound = (np.nanmin(ct_energy) // bin_width) * bin_width upper_bound = ((np.nanmax(ct_energy) // bin_width) + 1) * bin_width hist, bins, var = pgh.get_hist( @@ -437,101 +124,63 @@ def get_peak_fwhm_with_dt_corr( upper_bound = mu + ((kev_width[1] - 2) * adc_to_kev) win_idxs = (ct_energy > lower_bound) & (ct_energy < upper_bound) fit_range = (lower_bound, upper_bound) - if peak > 1500: - gof_range = (mu - (7 * adc_to_kev), mu + (7 * adc_to_kev)) - else: - gof_range = (mu - (5 * adc_to_kev), mu + (5 * adc_to_kev)) - # if kev==True: - # else: - # tol=0.01 tol = None try: + ( + energy_pars, + energy_err, + cov, + chisqr, + func, + _, + _, + _, + ) = pgc.unbinned_staged_energy_fit( + ct_energy[win_idxs], + func=func, + fit_range=fit_range, + guess_func=simple_guess, + tol=tol, + guess=guess, + allow_tail_drop=allow_tail_drop, + bin_width=bin_width, + display=display, + ) if display > 0: - energy_pars, 
energy_err, cov, chisqr = unbinned_energy_fit( - ct_energy[win_idxs], - func, - gof_func, - gof_range, - fit_range, - tol=tol, - guess=guess, - verbose=True, - display=display, - ) - print(energy_pars) - print(energy_err) - print(cov) plt.figure() xs = np.arange(lower_bound, upper_bound, bin_width) - hist, bins, var = pgh.get_hist( + fit_hist, fit_bins, _ = pgh.get_hist( ct_energy, dx=bin_width, range=(lower_bound, upper_bound) ) - plt.plot((bins[1:] + bins[:-1]) / 2, hist) - plt.plot(xs, gof_func(xs, *energy_pars)) + plt.step(pgh.get_bin_centers(fit_bins), fit_hist) + plt.plot(xs, func.get_pdf(xs, *energy_pars)) plt.show() - else: - energy_pars, energy_err, cov, chisqr = unbinned_energy_fit( - ct_energy[win_idxs], - func, - gof_func, - gof_range, - fit_range, - guess=guess, - tol=tol, - ) - if func == pgf.extended_radford_pdf: - if energy_pars[3] < 1e-6 and energy_err[3] < 1e-6: - fwhm = energy_pars[2] * 2 * np.sqrt(2 * np.log(2)) - fwhm_err = np.sqrt(cov[2][2]) * 2 * np.sqrt(2 * np.log(2)) - else: - fwhm = pgf.radford_fwhm(energy_pars[2], energy_pars[3], energy_pars[4]) - elif func == pgf.extended_gauss_step_pdf: - fwhm = energy_pars[2] * 2 * np.sqrt(2 * np.log(2)) - fwhm_err = np.sqrt(cov[2][2]) * 2 * np.sqrt(2 * np.log(2)) + fwhm = func.get_fwfm(energy_pars, frac_max=frac_max) xs = np.arange(lower_bound, upper_bound, 0.1) - y = func(xs, *energy_pars)[1] + y = func.get_pdf(xs, *energy_pars) max_val = np.amax(y) - fwhm_o_max = fwhm / max_val rng = np.random.default_rng(1) # generate set of bootstrapped parameters par_b = rng.multivariate_normal(energy_pars, cov, size=100) - y_max = np.array([func(xs, *p)[1] for p in par_b]) + y_max = np.array([func.get_pdf(xs, *p) for p in par_b]) maxs = np.nanmax(y_max, axis=1) - yerr_boot = np.nanstd(y_max, axis=0) - - if func == pgf.extended_radford_pdf and not ( - energy_pars[3] < 1e-6 and energy_err[3] < 1e-6 - ): - y_b = np.zeros(len(par_b)) - for i, p in enumerate(par_b): - try: - y_b[i] = pgf.radford_fwhm(p[2], p[3], p[4]) # - except: - y_b[i] = np.nan - fwhm_err = np.nanstd(y_b, axis=0) - if fwhm_err == 0: - fwhm, fwhm_err = pgf.radford_fwhm( - energy_pars[2], - energy_pars[3], - energy_pars[4], - cov=cov[:, :-2][:-2, :], - ) - fwhm_o_max_err = np.nanstd(y_b / maxs, axis=0) - else: - max_err = np.nanstd(maxs) - fwhm_o_max_err = fwhm_o_max * np.sqrt( - (np.array(fwhm_err) / np.array(fwhm)) ** 2 - + (np.array(max_err) / np.array(max_val)) ** 2 - ) + y_b = np.zeros(len(par_b)) + for i, p in enumerate(par_b): + try: + y_b[i] = func.get_fwfm(p, frac_max=frac_max) + except Exception: + y_b[i] = np.nan + fwhm_err = np.nanstd(y_b, axis=0) + fwhm_o_max_err = np.nanstd(y_b / maxs, axis=0) if display > 1: plt.figure() - plt.plot((bins[1:] + bins[:-1]) / 2, hist) + plt.step(pgh.get_bin_centers(bins), hist) for i in range(100): plt.plot(xs, y_max[i, :]) plt.show() @@ -541,19 +190,20 @@ def get_peak_fwhm_with_dt_corr( hist, bins, var = pgh.get_hist( ct_energy, dx=bin_width, range=(lower_bound, upper_bound) ) - plt.plot((bins[1:] + bins[:-1]) / 2, hist) - plt.plot(xs, gof_func(xs, *energy_pars)) + plt.step(pgh.get_bin_centers(bins), hist) + plt.plot(xs, y, color="orange") + yerr_boot = np.nanstd(y_max, axis=0) plt.fill_between( xs, y - yerr_boot, y + yerr_boot, facecolor="C1", alpha=0.5 ) plt.show() - except: - return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, None + except Exception: + return np.nan, np.nan, np.nan, np.nan, (np.nan, np.nan), np.nan, np.nan, None - if kev == True: - fwhm *= peak / energy_pars[1] - fwhm_err *= peak / energy_pars[1] 
+ if kev is True: + fwhm *= peak / energy_pars["mu"] + fwhm_err *= peak / energy_pars["mu"] return ( fwhm, @@ -561,197 +211,137 @@ def get_peak_fwhm_with_dt_corr( fwhm_err, fwhm_o_max_err, chisqr, - energy_pars[0], - energy_err[0], + energy_pars["n_sig"], + energy_err["n_sig"], energy_pars, ) -def fom_FWHM_with_dt_corr_fit(tb_in, kwarg_dict, ctc_parameter, idxs=None, display=0): +def fom_fwhm_with_alpha_fit( + tb_in, kwarg_dict, ctc_parameter, nsteps=11, idxs=None, frac_max=0.2, display=0 +): """ FOM for sweeping over ctc values to find the best value, returns the best found fwhm with its error, the corresponding alpha value and the number of events in the fitted peak, also the reduced chisquare of the """ parameter = kwarg_dict["parameter"] func = kwarg_dict["func"] - gof_func = kwarg_dict["gof_func"] - Energies = tb_in[parameter].nda - Energies = Energies.astype("float64") + energies = tb_in[parameter].nda + energies = energies.astype("float64") peak = kwarg_dict["peak"] kev_width = kwarg_dict["kev_width"] + bin_width = kwarg_dict.get("bin_width", 1) min_alpha = 0 max_alpha = 3.50e-06 - astep = 1.250e-07 - if ctc_parameter == "QDrift": - dt = tb_in["dt_eff"].nda - elif ctc_parameter == "dt": - dt = np.subtract(tb_in["tp_99"].nda, tb_in["tp_0_est"].nda, dtype="float64") - elif ctc_parameter == "rt": - dt = np.subtract(tb_in["tp_99"].nda, tb_in["tp_01"].nda, dtype="float64") - + alphas = np.linspace(min_alpha, max_alpha, nsteps, dtype="float64") + try: + dt = tb_in[ctc_parameter].nda + except KeyError: + dt = tb_in.eval(ctc_parameter) if idxs is not None: - Energies = Energies[idxs] + energies = energies[idxs] dt = dt[idxs] - - if np.isnan(Energies).any(): - return { - "fwhm": np.nan, - "fwhm_err": np.nan, - "alpha": np.nan, - "alpha_err": np.nan, - "chisquare": np.nan, - "n_sig": np.nan, - "n_sig_err": np.nan, - } - if np.isnan(dt).any(): - return { - "fwhm": np.nan, - "fwhm_err": np.nan, - "alpha": np.nan, - "alpha_err": np.nan, - "chisquare": np.nan, - "n_sig": np.nan, - "n_sig_err": np.nan, - } - - alphas = np.array( - [ - 0.000e00, - 1.250e-07, - 2.500e-07, - 3.750e-07, - 5.000e-07, - 6.250e-07, - 7.500e-07, - 8.750e-07, - 1.000e-06, - 1.125e-06, - 1.250e-06, - 1.375e-06, - 1.500e-06, - 1.625e-06, - 1.750e-06, - 1.875e-06, - 2.000e-06, - 2.125e-06, - 2.250e-06, - 2.375e-06, - 2.500e-06, - 2.625e-06, - 2.750e-06, - 2.875e-06, - 3.000e-06, - 3.125e-06, - 3.250e-06, - 3.375e-06, - 3.500e-06, - ], - dtype="float64", - ) - fwhms = np.array([]) - final_alphas = np.array([]) - fwhm_errs = np.array([]) - guess = None - best_fwhm = np.inf - for alpha in alphas: - ( - _, - fwhm_o_max, - _, - fwhm_o_max_err, - _, - _, - _, - fit_pars, - ) = get_peak_fwhm_with_dt_corr( - Energies, alpha, dt, func, gof_func, peak, kev_width, guess=guess - ) - if not np.isnan(fwhm_o_max): - fwhms = np.append(fwhms, fwhm_o_max) - final_alphas = np.append(final_alphas, alpha) - fwhm_errs = np.append(fwhm_errs, fwhm_o_max_err) - guess = fit_pars - if fwhms[-1] < best_fwhm: - best_fwhm = fwhms[-1] - best_fit = fit_pars - log.info(f"alpha: {alpha}, fwhm/max:{fwhm_o_max}+-{fwhm_o_max_err}") - - # Make sure fit isn't based on only a few points - if len(fwhms) < 10: - log.debug("less than 10 fits successful") - return { - "fwhm": np.nan, - "fwhm_err": np.nan, - "alpha": np.nan, - "alpha_err": np.nan, - "chisquare": np.nan, - "n_sig": np.nan, - "n_sig_err": np.nan, - } - - ids = (fwhm_errs < 2 * np.nanpercentile(fwhm_errs, 50)) & (fwhm_errs > 0) - # Fit alpha curve to get best alpha - try: - alphas = np.arange( - 
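
`fom_fwhm_with_alpha_fit` replaces the hard-coded 29-entry alpha table with a configurable `np.linspace(min_alpha, max_alpha, nsteps)` grid and stops early once the FWHM/max curve is clearly rising again. A toy sketch of the grid and the early-break test, using a fake per-alpha result in place of the real peak fit (the noisy curve is purely illustrative):

```python
import numpy as np

# configurable charge-trapping sweep grid, replacing the fixed 29-point table
min_alpha, max_alpha, nsteps = 0.0, 3.5e-06, 11
alphas = np.linspace(min_alpha, max_alpha, nsteps, dtype="float64")

# stand-in for the per-alpha peak fit: a noisy curve with a minimum near 1.8e-6
rng = np.random.default_rng(2)
fwhm_o_max = 3.0 + 1e11 * (alphas - 1.8e-06) ** 2 + rng.normal(0.0, 0.01, nsteps)

best = np.nanargmin(fwhm_o_max)
rising = (np.diff(fwhm_o_max)[-3:] > 0).all()  # early-break criterion from the diff
print(f"best grid point: alpha = {alphas[best]:.2e}; last three points rising: {rising}")
```
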
final_alphas[ids][0], final_alphas[ids][-1], astep / 20, dtype="float64" - ) - alpha_fit, cov = np.polyfit( - final_alphas[ids], fwhms[ids], w=1 / fwhm_errs[ids], deg=4, cov=True - ) - fit_vals = np.polynomial.polynomial.polyval(alphas, alpha_fit[::-1]) - alpha = alphas[np.nanargmin(fit_vals)] + if np.isnan(energies).any(): + log.debug("nan in energies") + raise RuntimeError + if np.isnan(dt).any(): + log.debug("nan in dts") + raise RuntimeError + fwhms = np.array([]) + final_alphas = np.array([]) + fwhm_errs = np.array([]) + best_fwhm = np.inf + early_break = False + for alpha in alphas: + ( + _, + fwhm_o_max, + _, + fwhm_o_max_err, + _, + _, + _, + fit_pars, + ) = get_peak_fwhm_with_dt_corr( + energies, + alpha, + dt, + func, + peak, + kev_width, + guess=None, + frac_max=0.5, + bin_width=bin_width, + allow_tail_drop=False, + ) + if not np.isnan(fwhm_o_max): + fwhms = np.append(fwhms, fwhm_o_max) + final_alphas = np.append(final_alphas, alpha) + fwhm_errs = np.append(fwhm_errs, fwhm_o_max_err) + if fwhms[-1] < best_fwhm: + best_fwhm = fwhms[-1] + log.info(f"alpha: {alpha}, fwhm/max:{fwhm_o_max:.4f}+-{fwhm_o_max_err:.4f}") + + ids = (fwhm_errs < 2 * np.nanpercentile(fwhm_errs, 50)) & ( + fwhm_errs > 1e-10 + ) + if len(fwhms[ids]) > 5: + if (np.diff(fwhms[ids])[-3:] > 0).all(): + early_break = True + break + + # Make sure fit isn't based on only a few points + if len(fwhms) < nsteps * 0.2 and early_break is False: + log.debug("less than 20% fits successful") + raise RuntimeError + + ids = (fwhm_errs < 2 * np.nanpercentile(fwhm_errs, 50)) & (fwhm_errs > 1e-10) + # Fit alpha curve to get best alpha - rng = np.random.default_rng(1) - alpha_pars_b = rng.multivariate_normal(alpha_fit, cov, size=1000) - fits = np.array( - [ - np.polynomial.polynomial.polyval(alphas, pars[::-1]) - for pars in alpha_pars_b - ] - ) - min_alphas = np.array([alphas[np.nanargmin(fit)] for fit in fits]) - alpha_err = np.nanstd(min_alphas) - if display > 0: - plt.figure() - yerr_boot = np.std(fits, axis=0) - plt.errorbar(final_alphas, fwhms, yerr=fwhm_errs, linestyle=" ") - plt.plot(alphas, fit_vals) - plt.fill_between( - alphas, - fit_vals - yerr_boot, - fit_vals + yerr_boot, - facecolor="C1", - alpha=0.5, + try: + alphas = np.linspace( + final_alphas[ids][0], + final_alphas[ids][-1], + nsteps * 20, + dtype="float64", ) - plt.show() + alpha_fit, cov = np.polyfit( + final_alphas[ids], fwhms[ids], w=1 / fwhm_errs[ids], deg=4, cov=True + ) + fit_vals = np.polynomial.polynomial.polyval(alphas, alpha_fit[::-1]) + alpha = alphas[np.nanargmin(fit_vals)] - except: - log.debug("alpha fit failed") - return { - "fwhm": np.nan, - "fwhm_err": np.nan, - "alpha": np.nan, - "alpha_err": np.nan, - "chisquare": np.nan, - "n_sig": np.nan, - "n_sig_err": np.nan, - } + rng = np.random.default_rng(1) + alpha_pars_b = rng.multivariate_normal(alpha_fit, cov, size=1000) + fits = np.array( + [ + np.polynomial.polynomial.polyval(alphas, pars[::-1]) + for pars in alpha_pars_b + ] + ) + min_alphas = np.array([alphas[np.nanargmin(fit)] for fit in fits]) + alpha_err = np.nanstd(min_alphas) + if display > 0: + plt.figure() + yerr_boot = np.nanstd(fits, axis=0) + plt.errorbar(final_alphas, fwhms, yerr=fwhm_errs, linestyle=" ") + plt.plot(alphas, fit_vals) + plt.fill_between( + alphas, + fit_vals - yerr_boot, + fit_vals + yerr_boot, + facecolor="C1", + alpha=0.5, + ) + plt.show() - if np.isnan(fit_vals).all(): - log.debug("alpha fit all nan") - return { - "fwhm": np.nan, - "fwhm_err": np.nan, - "alpha": np.nan, - "alpha_err": np.nan, - "chisquare": 
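
The optimal alpha is not simply the best grid point: the accepted points are fit with an error-weighted degree-4 polynomial, the minimum of that curve is taken as alpha, and its uncertainty comes from re-minimising curves drawn from the polynomial-fit covariance. A compact sketch of the same procedure on synthetic points (the alpha axis is expressed in units of 1e-6 purely to keep this toy fit well conditioned; all data values are illustrative):

```python
import numpy as np

rng = np.random.default_rng(1)

# alpha grid in units of 1e-6, with a synthetic FWHM/max curve and uniform errors
final_alphas = np.linspace(0.0, 3.5, 11)
fwhms = 3.0 + 0.2 * (final_alphas - 1.6) ** 2 + rng.normal(0.0, 0.02, final_alphas.size)
fwhm_errs = np.full(final_alphas.size, 0.02)

# weighted quartic fit, dense evaluation grid, minimum of the fitted curve
alpha_fit, cov = np.polyfit(final_alphas, fwhms, deg=4, w=1 / fwhm_errs, cov=True)
alphas = np.linspace(final_alphas[0], final_alphas[-1], 220)
fit_vals = np.polynomial.polynomial.polyval(alphas, alpha_fit[::-1])
alpha = alphas[np.nanargmin(fit_vals)]

# uncertainty on the optimum: re-minimise curves drawn from the fit covariance
alpha_pars_b = rng.multivariate_normal(alpha_fit, cov, size=1000)
fits = np.array(
    [np.polynomial.polynomial.polyval(alphas, pars[::-1]) for pars in alpha_pars_b]
)
min_alphas = np.array([alphas[np.nanargmin(fit)] for fit in fits])
alpha_err = np.nanstd(min_alphas)
print(f"alpha = {alpha:.3f} +- {alpha_err:.3f} (x 1e-6)")
```
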
np.nan, - "n_sig": np.nan, - "n_sig_err": np.nan, - } + except Exception: + log.debug("alpha fit failed") - else: - # Return fwhm of optimal alpha in kev with error + if np.isnan(fit_vals).all(): + log.debug("alpha fit all nan") + raise RuntimeError ( final_fwhm, _, @@ -762,40 +352,22 @@ def fom_FWHM_with_dt_corr_fit(tb_in, kwarg_dict, ctc_parameter, idxs=None, displ n_sig_err, _, ) = get_peak_fwhm_with_dt_corr( - Energies, + energies, alpha, dt, func, - gof_func, peak, kev_width, - guess=best_fit, + guess=None, kev=True, + frac_max=frac_max, + allow_tail_drop=True, + bin_width=bin_width, display=display, ) - if np.isnan(final_fwhm) or np.isnan(final_err): - ( - final_fwhm, - _, - final_err, - _, - csqr, - n_sig, - n_sig_err, - _, - ) = get_peak_fwhm_with_dt_corr( - Energies, - alpha, - dt, - func, - gof_func, - peak, - kev_width, - kev=True, - display=display, - ) if np.isnan(final_fwhm) or np.isnan(final_err): log.debug(f"final fit failed, alpha was {alpha}") + raise RuntimeError return { "fwhm": final_fwhm, "fwhm_err": final_err, @@ -805,418 +377,124 @@ def fom_FWHM_with_dt_corr_fit(tb_in, kwarg_dict, ctc_parameter, idxs=None, displ "n_sig": n_sig, "n_sig_err": n_sig_err, } + except Exception: + return { + "fwhm": np.nan, + "fwhm_err": np.nan, + "alpha": 0, + "alpha_err": np.nan, + "chisquare": (np.nan, np.nan), + "n_sig": np.nan, + "n_sig_err": np.nan, + } -def fom_all_fit(tb_in, kwarg_dict): - """ - FOM to run over different ctc parameters - """ - ctc_parameters = ["QDrift"] #'dt', - output_dict = {} - for param in ctc_parameters: - out = fom_FWHM_with_dt_corr_fit(tb_in, kwarg_dict, param) - output_dict[param] = out - return output_dict - - -def fom_FWHM_fit(tb_in, kwarg_dict): +def fom_fwhm_no_alpha_sweep( + tb_in, kwarg_dict, ctc_param=None, alpha=0, idxs=None, frac_max=0.5, display=0 +): """ FOM with no ctc sweep, used for optimising ftp. 
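
On success the FOM returns a plain dictionary; on any failure it now falls back to a NaN-filled dictionary of the same shape instead of raising. A hypothetical consumer of that result, assuming only the keys shown in the diff (the numeric values are made up):

```python
import numpy as np

# shape of the dictionary returned by fom_fwhm_with_alpha_fit (values illustrative)
out = {
    "fwhm": 2.85,
    "fwhm_err": 0.04,
    "alpha": 1.6e-06,
    "alpha_err": 2.0e-07,
    "chisquare": (152.3, 140),
    "n_sig": 9800,
    "n_sig_err": 110,
}

if np.isnan(out["fwhm"]):
    print("fit failed for this grid point")
else:
    print(
        f"FWHM = {out['fwhm']:.2f} +- {out['fwhm_err']:.2f} keV "
        f"at alpha = {out['alpha']:.2e}"
    )
```
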
""" parameter = kwarg_dict["parameter"] func = kwarg_dict["func"] - gof_func = kwarg_dict["gof_func"] - Energies = tb_in[parameter].nda - Energies = Energies.astype("float64") + energies = tb_in[parameter].nda + energies = energies.astype("float64") peak = kwarg_dict["peak"] kev_width = kwarg_dict["kev_width"] - try: - alpha = kwarg_dict["alpha"] - if isinstance(alpha, dict): - alpha = alpha[parameter] - except KeyError: - alpha = 0 - try: - ctc_param = kwarg_dict["ctc_param"] - dt = tb_in[ctc_param].nda - except KeyError: + alpha = kwarg_dict.get("alpha", alpha) + bin_width = kwarg_dict.get("bin_width", 1) + if isinstance(alpha, dict): + alpha = alpha[parameter] + if "ctc_param" in kwarg_dict or ctc_param is not None: + ctc_param = kwarg_dict.get("ctc_param", ctc_param) + try: + dt = tb_in[ctc_param].nda + except KeyError: + dt = tb_in.eval(ctc_param) + dt = tb_in[ctc_param].nda + else: dt = 0 - if np.isnan(Energies).any(): + if idxs is not None: + energies = energies[idxs] + dt = dt[idxs] + + if np.isnan(energies).any(): return { + "fwhm": np.nan, "fwhm_o_max": np.nan, - "max_o_fwhm": np.nan, + "fwhm_err": np.nan, + "fwhm_o_max_err": np.nan, "chisquare": np.nan, "n_sig": np.nan, "n_sig_err": np.nan, } - ( - _, + fwhm, final_fwhm_o_max, - _, + fwhm_err, final_fwhm_o_max_err, csqr, n_sig, n_sig_err, + fit_pars, ) = get_peak_fwhm_with_dt_corr( - Energies, alpha, dt, func, gof_func, peak=peak, kev_width=kev_width, kev=True + energies, + alpha, + dt, + func, + peak=peak, + kev_width=kev_width, + frac_max=frac_max, + kev=True, + bin_width=bin_width, + display=display, ) return { + "fwhm": fwhm, "fwhm_o_max": final_fwhm_o_max, - "max_o_fwhm": final_fwhm_o_max_err, + "fwhm_err": fwhm_err, + "fwhm_o_max_err": final_fwhm_o_max_err, "chisquare": csqr, "n_sig": n_sig, "n_sig_err": n_sig_err, } -def get_wf_indexes(sorted_indexs, n_events): - out_list = [] - if isinstance(n_events, list): - for i in range(len(n_events)): - new_list = [] - for idx, entry in enumerate(sorted_indexs): - if (entry >= np.sum(n_events[:i])) and ( - entry < np.sum(n_events[: i + 1]) - ): - new_list.append(idx) - out_list.append(new_list) - else: - for i in range(int(len(sorted_indexs) / n_events)): - new_list = [] - for idx, entry in enumerate(sorted_indexs): - if (entry >= i * n_events) and (entry < (i + 1) * n_events): - new_list.append(idx) - out_list.append(new_list) - return out_list - - -def index_data(data, indexes, wf_field="waveform"): - new_baselines = lh5.Array(data["baseline"].nda[indexes]) - new_waveform_values = data[wf_field]["values"].nda[indexes] - new_waveform_dts = data[wf_field]["dt"].nda[indexes] - new_waveform_t0 = data[wf_field]["t0"].nda[indexes] - new_waveform = lh5.WaveformTable( - None, new_waveform_t0, "ns", new_waveform_dts, "ns", new_waveform_values - ) - new_data = lh5.Table(col_dict={wf_field: new_waveform, "baseline": new_baselines}) - return new_data - - -def event_selection( - raw_files, - lh5_path, - dsp_config, - db_dict, - peaks_keV, - peak_idxs, - kev_widths, - cut_parameters={"bl_mean": 4, "bl_std": 4, "pz_std": 4}, - pulser_mask=None, - energy_parameter="trapTmax", - wf_field: str = "waveform", - n_events=10000, - threshold=1000, -): - if not isinstance(peak_idxs, list): - peak_idxs = [peak_idxs] - if not isinstance(kev_widths, list): - kev_widths = [kev_widths] - - df = sto.read(lh5_path, raw_files, field_mask=["daqenergy", "timestamp"])[ - 0 - ].view_as("pd") - - if pulser_mask is None: - pulser_props = cts.find_pulser_properties(df, energy="daqenergy") - if len(pulser_props) > 0: - 
final_mask = None - for entry in pulser_props: - e_cut = (df.daqenergy.values < entry[0] + entry[1]) & ( - df.daqenergy.values > entry[0] - entry[1] - ) - if final_mask is None: - final_mask = e_cut - else: - final_mask = final_mask | e_cut - ids = final_mask - log.debug(f"pulser found: {pulser_props}") - else: - log.debug("no_pulser") - ids = np.zeros(len(df.daqenergy.values), dtype=bool) - # Get events around peak using raw file values - else: - ids = pulser_mask - initial_mask = (df.daqenergy.values > threshold) & (~ids) - rough_energy = df.daqenergy.values[initial_mask] - initial_idxs = np.where(initial_mask)[0] - - guess_keV = 2620 / np.nanpercentile(rough_energy, 99) - Euc_min = threshold / guess_keV * 0.6 - Euc_max = 2620 / guess_keV * 1.1 - dEuc = 1 # / guess_keV - hist, bins, var = pgh.get_hist(rough_energy, range=(Euc_min, Euc_max), dx=dEuc) - detected_peaks_locs, detected_peaks_keV, roughpars = pgc.hpge_find_E_peaks( - hist, - bins, - var, - np.array([238.632, 583.191, 727.330, 860.564, 1620.5, 2103.53, 2614.553]), - ) - log.debug(f"detected {detected_peaks_keV} keV peaks at {detected_peaks_locs}") - - masks = [] - for peak_idx in peak_idxs: - peak = peaks_keV[peak_idx] - kev_width = kev_widths[peak_idx] - try: - if peak not in detected_peaks_keV: - raise ValueError - detected_peak_idx = np.where(detected_peaks_keV == peak)[0] - peak_loc = detected_peaks_locs[detected_peak_idx] - log.info(f"{peak} peak found at {peak_loc}") - rough_adc_to_kev = roughpars[0] - e_lower_lim = peak_loc - (1.1 * kev_width[0]) / rough_adc_to_kev - e_upper_lim = peak_loc + (1.1 * kev_width[1]) / rough_adc_to_kev - except: - log.debug(f"{peak} peak not found attempting to use rough parameters") - peak_loc = (peak - roughpars[1]) / roughpars[0] - rough_adc_to_kev = roughpars[0] - e_lower_lim = peak_loc - (1.5 * kev_width[0]) / rough_adc_to_kev - e_upper_lim = peak_loc + (1.5 * kev_width[1]) / rough_adc_to_kev - log.debug(f"lower_lim:{e_lower_lim}, upper_lim:{e_upper_lim}") - e_mask = (rough_energy > e_lower_lim) & (rough_energy < e_upper_lim) - e_idxs = initial_idxs[e_mask][: int(2.5 * n_events)] - masks.append(e_idxs) - log.debug(f"{len(e_idxs)} events found in energy range for {peak}") - - idx_list_lens = [len(masks[peak_idx]) for peak_idx in peak_idxs] - - sort_index = np.argsort(np.concatenate(masks)) - idx_list = get_wf_indexes(sort_index, idx_list_lens) - idxs = np.array(sorted(np.concatenate(masks))) - - input_data = sto.read(f"{lh5_path}", raw_files, idx=idxs, n_rows=len(idxs))[0] - - if isinstance(dsp_config, str): - with open(dsp_config) as r: - dsp_config = json.load(r) - - dsp_config["outputs"] = cts.get_keys( - dsp_config["outputs"], list(cut_parameters) - ) + [energy_parameter] - - log.debug("Processing data") - tb_data = opt.run_one_dsp(input_data, dsp_config, db_dict=db_dict) - - cut_dict = cts.generate_cuts(tb_data, cut_parameters) - log.debug(f"Cuts are: {cut_dict}") - log.debug("Loaded Cuts") - ct_mask = cts.get_cut_indexes(tb_data, cut_dict) - - final_events = [] - out_events = [] - for peak_idx in peak_idxs: - peak = peaks_keV[peak_idx] - kev_width = kev_widths[peak_idx] - - peak_ids = np.array(idx_list[peak_idx]) - peak_ct_mask = ct_mask[peak_ids] - peak_ids = peak_ids[peak_ct_mask] - - energy = tb_data[energy_parameter].nda[peak_ids] - - hist, bins, var = pgh.get_hist( - energy, range=(int(threshold), int(np.nanmax(energy))), dx=1 - ) - peak_loc = pgh.get_bin_centers(bins)[np.nanargmax(hist)] - rough_adc_to_kev = peak / peak_loc - - e_lower_lim = peak_loc - (1.5 * kev_width[0]) / 
rough_adc_to_kev - e_upper_lim = peak_loc + (1.5 * kev_width[1]) / rough_adc_to_kev - - e_ranges = (int(peak_loc - e_lower_lim), int(e_upper_lim - peak_loc)) - ( - params, - errors, - covs, - bins, - ranges, - p_val, - valid_pks, - pk_funcs, - ) = pgc.hpge_fit_E_peaks( - energy, - [peak_loc], - [e_ranges], - n_bins=(np.nanmax(energy) - np.nanmin(energy)) // 1, - uncal_is_int=True, - ) - if params[0] is None or np.isnan(params[0]).any(): - log.debug("Fit failed, using max guess") - hist, bins, var = pgh.get_hist( - energy, range=(int(e_lower_lim), int(e_upper_lim)), dx=1 - ) - params = [[0, pgh.get_bin_centers(bins)[np.nanargmax(hist)], 0, 0, 0, 0]] - updated_adc_to_kev = peak / params[0][1] - e_lower_lim = params[0][1] - (kev_width[0]) / updated_adc_to_kev - e_upper_lim = params[0][1] + (kev_width[1]) / updated_adc_to_kev - log.info(f"lower lim is :{e_lower_lim}, upper lim is {e_upper_lim}") - final_mask = (energy > e_lower_lim) & (energy < e_upper_lim) - final_events.append(peak_ids[final_mask][:n_events]) - out_events.append(idxs[final_events[-1]]) - log.info(f"{len(peak_ids[final_mask][:n_events])} passed selections for {peak}") - if len(peak_ids[final_mask]) < 0.5 * n_events: - log.warning("Less than half number of specified events found") - elif len(peak_ids[final_mask]) < 0.1 * n_events: - log.error("Less than 10% number of specified events found") - out_events = np.unique(np.concatenate(out_events)) - sort_index = np.argsort(np.concatenate(final_events)) - idx_list = get_wf_indexes(sort_index, [len(mask) for mask in final_events]) - return out_events, idx_list - - -def fwhm_slope(x, m0, m1, m2): - """ - Fit the energy resolution curve - """ - return np.sqrt(m0 + m1 * x + m2 * (x**2)) - - -def interpolate_energy(peak_energies, points, err_points, energy): - nan_mask = np.isnan(points) | (points < 0) - if len(points[~nan_mask]) < 3: - return np.nan, np.nan, np.nan - else: - param_guess = [2, 0.001, 0.000001] # - # param_bounds = (0, [10., 1. 
])# - try: - fit_pars, fit_covs = curve_fit( - fwhm_slope, - peak_energies[~nan_mask], - points[~nan_mask], - sigma=err_points[~nan_mask], - p0=param_guess, - absolute_sigma=True, - ) # bounds=param_bounds, - fit_qbb = fwhm_slope(energy, *fit_pars) - - xs = np.arange(peak_energies[0], peak_energies[-1], 0.1) - - rng = np.random.default_rng(1) - - # generate set of bootstrapped parameters - par_b = rng.multivariate_normal(fit_pars, fit_covs, size=1000) - qbb_vals = np.array([fwhm_slope(energy, *p) for p in par_b]) - qbb_err = np.nanstd(qbb_vals) - except: - return np.nan, np.nan, np.nan - - if nan_mask[-1] == True or nan_mask[-2] == True: - qbb_err = np.nan - if qbb_err / fit_qbb > 0.1: - qbb_err = np.nan - - return fit_qbb, qbb_err, fit_pars - - -def fom_FWHM(tb_in, kwarg_dict, ctc_parameter, alpha, idxs=None, display=0): - """ - FOM for sweeping over ctc values to find the best value, returns the best found fwhm - """ - parameter = kwarg_dict["parameter"] - func = kwarg_dict["func"] - cs_func = kwarg_dict["gof_func"] - Energies = tb_in[parameter].nda - Energies = Energies.astype("float64") - peak = kwarg_dict["peak"] - kev_width = kwarg_dict["kev_width"] - - if ctc_parameter == "QDrift": - dt = tb_in["dt_eff"].nda - elif ctc_parameter == "dt": - dt = np.subtract(tb_in["tp_99"].nda, tb_in["tp_0_est"].nda, dtype="float64") - elif ctc_parameter == "rt": - dt = np.subtract(tb_in["tp_99"].nda, tb_in["tp_01"].nda, dtype="float64") - if np.isnan(Energies).any() or np.isnan(dt).any(): - if np.isnan(Energies).any(): - log.debug(f"nan energy values for peak {peak}") - else: - log.debug(f"nan dt values for peak {peak}") - return { - "fwhm": np.nan, - "fwhm_err": np.nan, - "alpha": np.nan, - "chisquare": np.nan, - "n_sig": np.nan, - "n_sig_err": np.nan, - } - - if idxs is not None: - Energies = Energies[idxs] - dt = dt[idxs] - - # Return fwhm of optimal alpha in kev with error - try: - ( - final_fwhm, - _, - final_err, - _, - csqr, - n_sig, - n_sig_err, - _, - ) = get_peak_fwhm_with_dt_corr( - Energies, - alpha, - dt, - func, - cs_func, - peak, - kev_width, - kev=True, - display=display, - ) - except: - final_fwhm = np.nan - final_err = np.nan - csqr = np.nan - n_sig = np.nan - n_sig_err = np.nan - return { - "fwhm": final_fwhm, - "fwhm_err": final_err, - "alpha": alpha, - "chisquare": csqr, - "n_sig": n_sig, - "n_sig_err": n_sig_err, - } - - -def single_peak_fom(data, kwarg_dict): - peaks = kwarg_dict["peaks_keV"] +def fom_single_peak_alpha_sweep(data, kwarg_dict, display=0): idx_list = kwarg_dict["idx_list"] ctc_param = kwarg_dict["ctc_param"] peak_dicts = kwarg_dict["peak_dicts"] - - out_dict = fom_FWHM_with_dt_corr_fit( - data, peak_dicts[0], ctc_param, idxs=idx_list[0], display=0 + frac_max = kwarg_dict.get("frac_max", 0.2) + out_dict = fom_fwhm_with_alpha_fit( + data, + peak_dicts[0], + ctc_param, + idxs=idx_list[0], + frac_max=frac_max, + display=display, ) - out_dict["y_val"] = out_dict["fwhm"] - out_dict["y_err"] = out_dict["fwhm_err"] return out_dict -def new_fom(data, kwarg_dict): - peaks = kwarg_dict["peaks_keV"] +def fom_interpolate_energy_res_with_single_peak_alpha_sweep( + data, kwarg_dict, display=0 +): + peaks = kwarg_dict["peaks_kev"] idx_list = kwarg_dict["idx_list"] ctc_param = kwarg_dict["ctc_param"] - peak_dicts = kwarg_dict["peak_dicts"] - - out_dict = fom_FWHM_with_dt_corr_fit( - data, peak_dicts[-1], ctc_param, idxs=idx_list[-1], display=0 + interp_energy = kwarg_dict.get("interp_energy", {"Qbb": 2039}) + fwhm_func = kwarg_dict.get("fwhm_func", pgc.FWHMLinear) + frac_max = 
kwarg_dict.get("frac_max", 0.2) + + out_dict = fom_fwhm_with_alpha_fit( + data, + peak_dicts[-1], + ctc_param, + idxs=idx_list[-1], + frac_max=frac_max, + display=display, ) alpha = out_dict["alpha"] log.info(alpha) @@ -1224,14 +502,16 @@ def new_fom(data, kwarg_dict): fwhm_errs = [] n_sig = [] n_sig_err = [] - for i, peak in enumerate(peaks[:-1]): - out_peak_dict = fom_FWHM( - data, peak_dicts[i], ctc_param, alpha, idxs=idx_list[i], display=0 + for i, _ in enumerate(peaks[:-1]): + out_peak_dict = fom_fwhm_no_alpha_sweep( + data, + peak_dicts[i], + ctc_param, + alpha=alpha, + idxs=idx_list[i], + frac_max=frac_max, + display=display, ) - # n_sig_minimum = peak_dicts[i]["n_sig_minimum"] - # if peak_dict["n_sig"] 0.1: + interp_res_err = np.nan + + log.info(f"{list(interp_energy)[0]} fwhm is {interp_res} keV +- {interp_res_err}") return { - "y_val": qbb, - "y_err": qbb_err, - "qbb_fwhm": qbb, - "qbb_fwhm_err": qbb_err, + f"{list(interp_energy)[0]}_fwhm": interp_res, + f"{list(interp_energy)[0]}_fwhm_err": interp_res_err, "alpha": alpha, "peaks": peaks.tolist(), "fwhms": fwhms, "fwhm_errs": fwhm_errs, - "n_events": n_sig, + "n_sig": n_sig, "n_sig_err": n_sig_err, } - - -OptimiserDimension = namedtuple( - "OptimiserDimension", "name parameter min_val max_val rounding unit" -) - - -class BayesianOptimizer: - np.random.seed(55) - lambda_param = 0.01 - eta_param = 0 - # FIXME: the following throws a TypeError - # kernel=ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(1, length_scale_bounds="fixed") #+ WhiteKernel(noise_level=0.0111) - - def __init__(self, acq_func, batch_size, kernel=None): - self.dims = [] - self.current_iter = 0 - - self.batch_size = batch_size - self.iters = 0 - - self.gauss_pr = GaussianProcessRegressor(kernel=kernel) - self.best_samples_ = pd.DataFrame(columns=["x", "y", "ei"]) - self.distances_ = [] - - if acq_func == "ei": - self.acq_function = self._get_expected_improvement - elif acq_func == "ucb": - self.acq_function = self._get_ucb - elif acq_func == "lcb": - self.acq_function = self._get_lcb - - def add_dimension(self, name, parameter, min_val, max_val, rounding=2, unit=None): - self.dims.append( - OptimiserDimension(name, parameter, min_val, max_val, rounding, unit) - ) - - def get_n_dimensions(self): - return len(self.dims) - - def add_initial_values(self, x_init, y_init, yerr_init): - self.x_init = x_init - self.y_init = y_init - self.yerr_init = yerr_init - - def _get_expected_improvement(self, x_new): - mean_y_new, sigma_y_new = self.gauss_pr.predict( - np.array([x_new]), return_std=True - ) - - mean_y = self.gauss_pr.predict(self.x_init) - min_mean_y = np.min(mean_y) - z = (mean_y_new[0] - min_mean_y - 1) / (sigma_y_new[0] + 1e-9) - exp_imp = (mean_y_new[0] - min_mean_y - 1) * norm.cdf(z) + sigma_y_new[ - 0 - ] * norm.pdf(z) - return exp_imp - - def _get_ucb(self, x_new): - mean_y_new, sigma_y_new = self.gauss_pr.predict( - np.array([x_new]), return_std=True - ) - return mean_y_new[0] + self.lambda_param * sigma_y_new[0] - - def _get_lcb(self, x_new): - mean_y_new, sigma_y_new = self.gauss_pr.predict( - np.array([x_new]), return_std=True - ) - return mean_y_new[0] - self.lambda_param * sigma_y_new[0] - - def _get_next_probable_point(self): - min_ei = float(sys.maxsize) - x_optimal = None - # Trial with an array of random data points - rands = np.random.uniform( - np.array([dim.min_val for dim in self.dims]), - np.array([dim.max_val for dim in self.dims]), - (self.batch_size, self.get_n_dimensions()), - ) - for x_start in rands: - response = minimize( - 
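
After sweeping alpha on the highest-energy peak and re-fitting the remaining peaks at that fixed alpha, the resolution curve is interpolated to the energy of interest (Qbb = 2039 keV by default) with the configured `fwhm_func` (`pgc.FWHMLinear` by default). A generic sketch of the same interpolation done directly with scipy, using the common FWHM(E) = sqrt(a + b·E) parametrisation; that exact functional form, and all numbers below, are assumptions for illustration only:

```python
import numpy as np
from scipy.optimize import curve_fit

def fwhm_model(energy, a, b):
    # assumed "linear" resolution model: FWHM^2 grows linearly with energy
    return np.sqrt(a + b * energy)

peaks_kev = np.array([583.2, 727.3, 860.6, 1620.5, 2614.5])
fwhms = np.array([1.9, 2.0, 2.1, 2.5, 2.9])          # keV, illustrative
fwhm_errs = np.array([0.05, 0.05, 0.05, 0.08, 0.04])

pars, cov = curve_fit(
    fwhm_model, peaks_kev, fwhms, sigma=fwhm_errs, p0=[1.0, 1e-3], absolute_sigma=True
)
qbb = 2039.0
interp_res = fwhm_model(qbb, *pars)

# propagate the fit uncertainty via bootstrapped parameter draws
rng = np.random.default_rng(1)
draws = rng.multivariate_normal(pars, cov, size=1000)
interp_res_err = np.nanstd([fwhm_model(qbb, *p) for p in draws])
print(f"Qbb FWHM = {interp_res:.2f} +- {interp_res_err:.2f} keV")
```
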
fun=self.acq_function, - x0=x_start, - bounds=[(dim.min_val, dim.max_val) for dim in self.dims], - method="L-BFGS-B", - ) - if response.fun < min_ei: - min_ei = response.fun - x_optimal = [ - y.round(dim.rounding) for y, dim in zip(response.x, self.dims) - ] - if x_optimal in self.x_init and self.iters < 5: - if self.iters < 5: - self.iters += 1 - x_optimal, min_ei = self._get_next_probable_point() - else: - perturb = np.random.uniform( - np.array([(dim.max_val - dim.min_val) / 100 for dim in self.dims]), - np.array([(dim.max_val - dim.min_val) / 10 for dim in self.dims]), - (1, len(self.dims)), - ) - x_optimal += perturb - x_optimal = [ - y.round(dim.rounding) for y, dim in zip(x_optimal[0], self.dims) - ] - for i, y in enumerate(x_optimal): - if y > self.dims[i].max_val: - x_optimal[i] = self.dims[i].max_val - elif y < self.dims[i].min_val: - x_optimal[i] = self.dims[i].min_val - - return x_optimal, min_ei - - def _extend_prior_with_posterior_data(self, x, y, yerr): - self.x_init = np.append(self.x_init, np.array([x]), axis=0) - self.y_init = np.append(self.y_init, np.array(y), axis=0) - self.yerr_init = np.append(self.yerr_init, np.array(yerr), axis=0) - - def get_first_point(self): - y_min_ind = np.nanargmin(self.y_init) - self.y_min = self.y_init[y_min_ind] - self.optimal_x = self.x_init[y_min_ind] - self.optimal_ei = None - return self.optimal_x, self.optimal_ei - - @ignore_warnings(category=ConvergenceWarning) - def iterate_values(self): - nan_idxs = np.isnan(self.y_init) - self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) - x_next, ei = self._get_next_probable_point() - return x_next, ei - - def update_db_dict(self, db_dict): - if self.current_iter == 0: - x_new, ei = self.get_first_point() - x_new, ei = self.iterate_values() - self.current_x = x_new - self.current_ei = ei - for i, val in enumerate(x_new): - name, parameter, min_val, max_val, rounding, unit = self.dims[i] - if unit is not None: - value_str = f"{val}*{unit}" - else: - value_str = f"{val}" - if name not in db_dict.keys(): - db_dict[name] = {parameter: value_str} - else: - db_dict[name][parameter] = value_str - self.current_iter += 1 - return db_dict - - def update(self, results): - y_val = results["y_val"] - y_err = results["y_err"] - self._extend_prior_with_posterior_data( - self.current_x, np.array([y_val]), np.array([y_err]) - ) - - if np.isnan(y_val) | np.isnan(y_err): - pass - else: - if y_val < self.y_min: - self.y_min = y_val - self.optimal_x = self.current_x - self.optimal_ei = self.current_ei - self.optimal_results = results - - if self.current_iter == 1: - self.prev_x = self.current_x - else: - self.distances_.append( - np.linalg.norm(np.array(self.prev_x) - np.array(self.current_x)) - ) - self.prev_x = self.current_x - - self.best_samples_ = pd.concat( - [ - self.best_samples_, - pd.DataFrame( - {"x": self.optimal_x, "y": self.y_min, "ei": self.optimal_ei} - ), - ], - ignore_index=True, - ) - - def get_best_vals(self): - out_dict = {} - for i, val in enumerate(self.optimal_x): - name, parameter, min_val, max_val, rounding, unit = self.dims[i] - value_str = f"{val}*{unit}" - if name not in out_dict.keys(): - out_dict[name] = {parameter: value_str} - else: - out_dict[name][parameter] = value_str - return out_dict - - @ignore_warnings(category=ConvergenceWarning) - def plot(self, init_samples=None): - nan_idxs = np.isnan(self.y_init) - fail_idxs = np.isnan(self.yerr_init) - self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) - if (len(self.dims) != 2) and 
(len(self.dims) != 1): - raise Exception("Acquisition Function Plotting not implemented for dim!=2") - elif len(self.dims) == 1: - points = np.arange(self.dims[0].min_val, self.dims[0].max_val, 0.1) - ys = np.zeros_like(points) - ys_err = np.zeros_like(points) - for i, point in enumerate(points): - ys[i], ys_err[i] = self.gauss_pr.predict( - np.array([point]).reshape(1, -1), return_std=True - ) - fig = plt.figure() - - plt.scatter(np.array(self.x_init), np.array(self.y_init), label="Samples") - plt.scatter( - np.array(self.x_init)[fail_idxs], - np.array(self.y_init)[fail_idxs], - color="green", - label="Failed samples", - ) - plt.fill_between(points, ys - ys_err, ys + ys_err, alpha=0.1) - if init_samples is not None: - init_ys = np.array( - [ - np.where(init_sample == self.x_init)[0][0] - for init_sample in init_samples - ] - ) - plt.scatter( - np.array(init_samples)[:, 0], - np.array(self.y_init)[init_ys], - color="red", - label="Init Samples", - ) - plt.scatter(self.optimal_x[0], self.y_min, color="orange", label="Optimal") - - plt.xlabel( - f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" - ) - plt.ylabel(f"Kernel Value") - plt.legend() - elif len(self.dims) == 2: - x, y = np.mgrid[ - self.dims[0].min_val : self.dims[0].max_val : 0.1, - self.dims[1].min_val : self.dims[1].max_val : 0.1, - ] - points = np.vstack((x.flatten(), y.flatten())).T - out_grid = np.zeros( - ( - int((self.dims[0].max_val - self.dims[0].min_val) * 10), - int((self.dims[1].max_val - self.dims[1].min_val) * 10), - ) - ) - - j = 0 - for i, _ in np.ndenumerate(out_grid): - out_grid[i] = self.gauss_pr.predict( - points[j].reshape(1, -1), return_std=False - ) - j += 1 - - fig = plt.figure() - plt.imshow( - out_grid, - norm=LogNorm(), - origin="lower", - aspect="auto", - extent=(0, out_grid.shape[1], 0, out_grid.shape[0]), - ) - plt.scatter( - np.array(self.x_init - self.dims[1].min_val)[:, 1] * 10, - np.array(self.x_init - self.dims[0].min_val)[:, 0] * 10, - ) - if init_samples is not None: - plt.scatter( - (init_samples[:, 1] - self.dims[1].min_val) * 10, - (init_samples[:, 0] - self.dims[0].min_val) * 10, - color="red", - ) - plt.scatter( - (self.optimal_x[1] - self.dims[1].min_val) * 10, - (self.optimal_x[0] - self.dims[0].min_val) * 10, - color="orange", - ) - ticks, labels = plt.xticks() - labels = np.linspace(self.dims[1].min_val, self.dims[1].max_val, 5) - ticks = np.linspace(0, out_grid.shape[1], 5) - plt.xticks(ticks=ticks, labels=labels, rotation=45) - ticks, labels = plt.yticks() - labels = np.linspace(self.dims[0].min_val, self.dims[0].max_val, 5) - ticks = np.linspace(0, out_grid.shape[0], 5) - plt.yticks(ticks=ticks, labels=labels, rotation=45) - plt.xlabel( - f"{self.dims[1].name}-{self.dims[1].parameter}({self.dims[1].unit})" - ) - plt.ylabel( - f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" - ) - plt.title(f"{self.dims[0].name} Kernel Prediction") - plt.tight_layout() - plt.close() - return fig - - @ignore_warnings(category=ConvergenceWarning) - def plot_acq(self, init_samples=None): - nan_idxs = np.isnan(self.y_init) - self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) - if (len(self.dims) != 2) and (len(self.dims) != 1): - raise Exception("Acquisition Function Plotting not implemented for dim!=2") - elif len(self.dims) == 1: - points = np.arange(self.dims[0].min_val, self.dims[0].max_val, 0.1) - ys = np.zeros_like(points) - for i, point in enumerate(points): - ys[i] = self.acq_function(np.array([point]).reshape(1, -1)[0]) - fig = 
plt.figure() - plt.plot(points, ys) - plt.scatter(np.array(self.x_init), np.array(self.y_init), label="Samples") - if init_samples is not None: - init_ys = np.array( - [ - np.where(init_sample == self.x_init)[0][0] - for init_sample in init_samples - ] - ) - plt.scatter( - np.array(init_samples)[:, 0], - np.array(self.y_init)[init_ys], - color="red", - label="Init Samples", - ) - plt.scatter(self.optimal_x[0], self.y_min, color="orange", label="Optimal") - - plt.xlabel( - f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" - ) - plt.ylabel(f"Acquisition Function Value") - plt.legend() - - elif len(self.dims) == 2: - x, y = np.mgrid[ - self.dims[0].min_val : self.dims[0].max_val : 0.1, - self.dims[1].min_val : self.dims[1].max_val : 0.1, - ] - points = np.vstack((x.flatten(), y.flatten())).T - out_grid = np.zeros( - ( - int((self.dims[0].max_val - self.dims[0].min_val) * 10), - int((self.dims[1].max_val - self.dims[1].min_val) * 10), - ) - ) - - j = 0 - for i, _ in np.ndenumerate(out_grid): - out_grid[i] = self.acq_function(points[j]) - j += 1 - - fig = plt.figure() - plt.imshow( - out_grid, - norm=LogNorm(), - origin="lower", - aspect="auto", - extent=(0, out_grid.shape[1], 0, out_grid.shape[0]), - ) - plt.scatter( - np.array(self.x_init - self.dims[1].min_val)[:, 1] * 10, - np.array(self.x_init - self.dims[0].min_val)[:, 0] * 10, - ) - if init_samples is not None: - plt.scatter( - (init_samples[:, 1] - self.dims[1].min_val) * 10, - (init_samples[:, 0] - self.dims[0].min_val) * 10, - color="red", - ) - plt.scatter( - (self.optimal_x[1] - self.dims[1].min_val) * 10, - (self.optimal_x[0] - self.dims[0].min_val) * 10, - color="orange", - ) - ticks, labels = plt.xticks() - labels = np.linspace(self.dims[1].min_val, self.dims[1].max_val, 5) - ticks = np.linspace(0, out_grid.shape[1], 5) - plt.xticks(ticks=ticks, labels=labels, rotation=45) - ticks, labels = plt.yticks() - labels = np.linspace(self.dims[0].min_val, self.dims[0].max_val, 5) - ticks = np.linspace(0, out_grid.shape[0], 5) - plt.yticks(ticks=ticks, labels=labels, rotation=45) - plt.xlabel( - f"{self.dims[1].name}-{self.dims[1].parameter}({self.dims[1].unit})" - ) - plt.ylabel( - f"{self.dims[0].name}-{self.dims[0].parameter}({self.dims[0].unit})" - ) - plt.title(f"{self.dims[0].name} Acquisition Space") - plt.tight_layout() - plt.close() - return fig - - -def run_optimisation( - tb_data, - dsp_config, - fom_function, - optimisers, - fom_kwargs=None, - db_dict=None, - nan_val=10, - n_iter=10, -): - if not isinstance(optimisers, list): - optimisers = [optimisers] - if not isinstance(fom_kwargs, list): - fom_kwargs = [fom_kwargs] - if not isinstance(fom_function, list): - fom_function = [fom_function] - - for j in range(n_iter): - for optimiser in optimisers: - db_dict = optimiser.update_db_dict(db_dict) - - log.info(f"Iteration number: {j+1}") - log.info(f"Processing with {db_dict}") - - tb_out = opt.run_one_dsp(tb_data, dsp_config, db_dict=db_dict) - - res = np.ndarray(shape=len(optimisers), dtype="O") - - for i in range(len(optimisers)): - if fom_kwargs[i] is not None: - if len(fom_function) > 1: - res[i] = fom_function[i](tb_out, fom_kwargs[i]) - else: - res[i] = fom_function[0](tb_out, fom_kwargs[i]) - else: - if len(fom_function) > 1: - res[i] = fom_function[i](tb_out) - else: - res[i] = fom_function[0](tb_out) - - log.info(f"Results of iteration {j+1} are {res}") - - for i, optimiser in enumerate(optimisers): - if np.isnan(res[i]["y_val"]): - if isinstance(nan_val, list): - res[i]["y_val"] = nan_val[i] - else: - 
res[i]["y_val"] = nan_val - - optimiser.update(res[i]) - - out_param_dict = {} - out_results_list = [] - for optimiser in optimisers: - param_dict = optimiser.get_best_vals() - out_param_dict.update(param_dict) - results_dict = optimiser.optimal_results - if np.isnan(results_dict["y_val"]): - log.error(f"Energy optimisation failed for {optimiser.dims[0][0]}") - out_results_list.append(results_dict) - - return out_param_dict, out_results_list - - -def get_ctc_grid(grids, ctc_param): - """ - Reshapes optimizer grids to be in easier form - """ - error_grids = [] - dt_grids = [] - alpha_grids = [] - alpha_error_grids = [] - nevents_grids = [] - for grid in grids: - shape = grid.shape - dt_grid = np.ndarray(shape=shape) - alpha_grid = np.ndarray(shape=shape) - error_grid = np.ndarray(shape=shape) - alpha_error_grid = np.ndarray(shape=shape) - nevents_grid = np.ndarray(shape=shape) - for i in range(shape[0]): - for j in range(shape[1]): - dt_grid[i, j] = grid[i, j][ctc_param]["fwhm"] - error_grid[i, j] = grid[i, j][ctc_param]["fwhm_err"] - nevents_grid[i, j] = grid[i, j][ctc_param]["n_sig"] - try: - alpha_grid[i, j] = grid[i, j][ctc_param]["alpha"] - except: - pass - try: - alpha_error_grid[i, j] = grid[i, j][ctc_param]["alpha_err"] - except: - pass - dt_grids.append(dt_grid) - alpha_grids.append(alpha_grid) - error_grids.append(error_grid) - alpha_error_grids.append(alpha_error_grid) - nevents_grids.append(nevents_grid) - return dt_grids, error_grids, alpha_grids, alpha_error_grids, nevents_grids - - -def interpolate_energy_old(peak_energies, grids, error_grids, energy, nevents_grids): - """ - Interpolates fwhm vs energy for every grid point - """ - - grid_no = len(grids) - grid_shape = grids[0].shape - out_grid = np.empty(grid_shape) - out_grid_err = np.empty(grid_shape) - n_event_lim = np.array( - [0.98 * np.nanpercentile(nevents_grid, 50) for nevents_grid in nevents_grids] - ) - for index, x in np.ndenumerate(grids[0]): - points = np.array([grids[i][index] for i in range(len(grids))]) - err_points = np.array([error_grids[i][index] for i in range(len(grids))]) - n_sigs = np.array([nevents_grids[i][index] for i in range(len(grids))]) - nan_mask = ( - np.isnan(points) - | (points < 0) - | (0.1 * points < err_points) - | (n_sigs < n_event_lim) - ) - try: - if len(points[nan_mask]) > 2: - raise ValueError - elif nan_mask[-1] == True or nan_mask[-2] == True: - raise ValueError - param_guess = [0.2, 0.001, 0.000001] - param_bounds = param_bounds = (0, [1, np.inf, np.inf]) # ,0.1 - fit_pars, fit_covs = curve_fit( - fwhm_slope, - peak_energies[~nan_mask], - points[~nan_mask], - sigma=err_points[~nan_mask], - p0=param_guess, - bounds=param_bounds, - absolute_sigma=True, - ) # - fit_qbb = fwhm_slope(energy, *fit_pars) - sderrs = np.sqrt(np.diag(fit_covs)) - qbb_err = fwhm_slope(energy, *(fit_pars + sderrs)) - fwhm_slope( - energy, *fit_pars - ) - out_grid[index] = fit_qbb - out_grid_err[index] = qbb_err - except: - out_grid[index] = np.nan - out_grid_err[index] = np.nan - return out_grid, out_grid_err - - -def find_lowest_grid_point_save(grid, err_grid, opt_dict): - """ - Finds the lowest grid point, if more than one with same value returns shortest filter. 
- """ - opt_name = list(opt_dict.keys())[0] - print(opt_name) - keys = list(opt_dict[opt_name].keys()) - param_list = [] - shape = [] - db_dict = {} - for key in keys: - param_dict = opt_dict[opt_name][key] - grid_axis = np.arange( - param_dict["start"], param_dict["end"], param_dict["spacing"] - ) - unit = param_dict.get("unit") - param_list.append(grid_axis) - shape.append(len(grid_axis)) - - total_lengths = np.zeros(shape) - - for index, x in np.ndenumerate(total_lengths): - for i, param in enumerate(param_list): - total_lengths[index] += param[index[i]] - min_val = np.nanmin(grid) - lowest_ixs = np.where(grid == min_val) - try: - fwhm_dict = {"fwhm": min_val, "fwhm_err": err_grid[lowest_ixs][0]} - except: - print(lowest_ixs) - if len(lowest_ixs[0]) == 1: - for i, key in enumerate(keys): - if i == 0: - if unit is not None: - db_dict[opt_name] = { - key: f"{param_list[i][lowest_ixs[i]][0]}*{unit}" - } - else: - db_dict[opt_name] = {key: f"{param_list[i][lowest_ixs[i]][0]}"} - else: - if unit is not None: - db_dict[opt_name][key] = f"{param_list[i][lowest_ixs[i]][0]}*{unit}" - else: - db_dict[opt_name][key] = f"{param_list[i][lowest_ixs[i]][0]}" - else: - shortest_length = np.argmin(total_lengths[lowest_ixs]) - final_idxs = [lowest_ix[shortest_length] for lowest_ix in lowest_ixs] - for i, key in enumerate(keys): - if unit is not None: - db_dict[opt_name] = {key: f"{param_list[i][lowest_ixs[i]][0]}*{unit}"} - else: - db_dict[opt_name] = {key: f"{param_list[i][lowest_ixs[i]][0]}"} - return lowest_ixs, fwhm_dict, db_dict - - -def interpolate_grid(energies, grids, int_energy, deg, nevents_grids): - """ - Interpolates energy vs parameter for every grid point using polynomial. - """ - grid_no = len(grids) - grid_shape = grids[0].shape - out_grid = np.empty(grid_shape) - n_event_lim = np.array( - [0.98 * np.nanpercentile(nevents_grid, 50) for nevents_grid in nevents_grids] - ) - for index, x in np.ndenumerate(grids[0]): - points = np.array([grids[i][index] for i in range(len(grids))]) - n_sigs = np.array([nevents_grids[i][index] for i in range(len(grids))]) - nan_mask = np.isnan(points) | (points < 0) | (n_sigs < n_event_lim) - try: - if len(points[~nan_mask]) < 3: - raise IndexError - fit_point = np.polynomial.polynomial.polyfit( - energies[~nan_mask], points[~nan_mask], deg=deg - ) - out_grid[index] = np.polynomial.polynomial.polyval(int_energy, fit_point) - except: - out_grid[index] = np.nan - return out_grid - - -def get_best_vals(peak_grids, peak_energies, param, opt_dict, save_path=None): - """ - Finds best filter parameters - """ - dt_grids, error_grids, alpha_grids, alpha_error_grids, nevents_grids = get_ctc_grid( - peak_grids, param - ) - qbb_grid, qbb_errs = interpolate_energy( - peak_energies, dt_grids, error_grids, 2039.061, nevents_grids - ) - qbb_alphas = interpolate_grid( - peak_energies[2:], alpha_grids[2:], 2039.061, 1, nevents_grids[2:] - ) - ixs, fwhm_dict, db_dict = find_lowest_grid_point_save(qbb_grid, qbb_errs, opt_dict) - out_grid = {"fwhm": qbb_grid, "fwhm_err": qbb_errs, "alphas": qbb_alphas} - - if isinstance(save_path, str): - mpl.use("pdf") - e_param = list(opt_dict.keys())[0] - opt_dict = opt_dict[e_param] - - detector = save_path.split("/")[-1] - save_path = os.path.join(save_path, f"{e_param}-{param}.pdf") - pathlib.Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True) - - with PdfPages(save_path) as pdf: - keys = list(opt_dict.keys()) - print(keys) - x_dict = opt_dict[keys[1]] - xvals = np.arange(x_dict["start"], x_dict["end"], x_dict["spacing"]) - 
xs = ( - np.arange(0, len(xvals), 1), - np.arange(x_dict["start"], x_dict["end"], x_dict["spacing"]), - ) - y_dict = opt_dict[keys[0]] - yvals = np.arange(y_dict["start"], y_dict["end"], y_dict["spacing"]) - ys = ( - np.arange(0, len(yvals), 1), - np.arange(y_dict["start"], y_dict["end"], y_dict["spacing"]), - ) - for i, x in enumerate(xs[1]): - xs[1][i] = round(x, 1) - for i, y in enumerate(ys[1]): - ys[1][i] = round(y, 1) - print(ixs) - points = np.array( - [dt_grids[i][ixs[0][0], ixs[1][0]] for i in range(len(dt_grids))] - ) - err_points = np.array( - [error_grids[i][ixs[0][0], ixs[1][0]] for i in range(len(error_grids))] - ) - alpha_points = np.array( - [alpha_grids[i][ixs[0][0], ixs[1][0]] for i in range(len(alpha_grids))] - ) - alpha_error_points = np.array( - [ - alpha_error_grids[i][ixs[0][0], ixs[1][0]] - for i in range(len(alpha_error_grids)) - ] - ) - param_guess = [0.2, 0.001, 0.000001] - param_bounds = (0, [1, np.inf, np.inf]) # ,0.1 - nan_mask = np.isnan(points) - nan_mask = nan_mask | (points < 0) | (0.1 * points < err_points) - fit_pars, fit_covs = curve_fit( - fwhm_slope, - peak_energies[~nan_mask], - points[~nan_mask], - sigma=err_points[~nan_mask], - p0=param_guess, - bounds=param_bounds, - absolute_sigma=True, - ) # - energy_x = np.arange(200, 2600, 10) - plt.rcParams["figure.figsize"] = (12, 18) - plt.rcParams["font.size"] = 12 - plt.figure() - for i, dt_grid in enumerate(dt_grids): - plt.subplot(3, 2, i + 1) - v_min = np.nanmin(np.abs(dt_grid)) - if v_min == 0: - for j in range(10): - v_min = np.nanpercentile(np.abs(dt_grid), j + 1) - if v_min > 0.1: - break - plt.imshow( - dt_grid, - norm=LogNorm(vmin=v_min, vmax=np.nanpercentile(dt_grid, 98)), - cmap="viridis", - ) - - plt.xticks(xs[0], xs[1]) - plt.yticks(ys[0], ys[1]) - - plt.xlabel(f"{keys[1]} (us)") - plt.ylabel(f"{keys[0]} (us)") - plt.title(f"{peak_energies[i]:.1f} kev") - plt.xticks(rotation=45) - cbar = plt.colorbar() - cbar.set_label("FWHM (keV)") - plt.tight_layout() - plt.suptitle(f"{detector}-{e_param}-{param}") - pdf.savefig() - plt.close() - - plt.figure() - - plt.imshow( - qbb_grid, - norm=LogNorm( - vmin=np.nanmin(qbb_grid), vmax=np.nanpercentile(dt_grid, 98) - ), - cmap="viridis", - ) - plt.xlabel(f"{keys[1]} (us)") - plt.ylabel(f"{keys[0]} (us)") - plt.title(f"Qbb") - plt.xticks(rotation=45) - cbar = plt.colorbar() - cbar.set_label("FWHM (keV)") - plt.tight_layout() - plt.suptitle(f"{detector}-{e_param}-{param}") - pdf.savefig() - plt.close() - - fig, (ax1, ax2) = plt.subplots(2, 1, constrained_layout=True, sharex=True) - ax1.errorbar(peak_energies, points, yerr=err_points, fmt=" ") - ax1.plot(energy_x, fwhm_slope(energy_x, *fit_pars)) - ax1.errorbar( - [2039], qbb_grid[ixs[0], ixs[1]], yerr=qbb_errs[ixs[0], ixs[1]], fmt=" " - ) - ax1.set_ylabel("FWHM energy resolution (keV)", ha="right", y=1) - ax2.scatter( - peak_energies, - (points - fwhm_slope(peak_energies, *fit_pars)) / err_points, - lw=1, - c="b", - ) - ax2.set_xlabel("Energy (keV)", ha="right", x=1) - ax2.set_ylabel("Standardised Residuals", ha="right", y=1) - fig.suptitle(f"{detector}-{e_param}-{param}") - pdf.savefig() - plt.close() - - try: - alphas = qbb_alphas[ixs[0], ixs[1]][0] - if isinstance(save_path, str): - alpha_fit = np.polynomial.polynomial.polyfit( - peak_energies[2:], alpha_points[2:], deg=1 - ) - fig, (ax1, ax2) = plt.subplots( - 2, 1, constrained_layout=True, sharex=True - ) - ax1.errorbar( - peak_energies[:], - alpha_points[:], - yerr=alpha_error_points[:], - linestyle=" ", - ) - ax1.plot( - peak_energies[2:], - 
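For orientation, the FWHM-versus-energy interpolation above amounts to fitting a resolution curve through the per-peak FWHM values and evaluating it at Qbb (2039.061 keV). A minimal sketch with made-up FWHM values, assuming fwhm_slope has the usual sqrt(a + b*E + c*E**2) form:

    import numpy as np
    from scipy.optimize import curve_fit

    def fwhm_slope(energy, a, b, c):
        # assumed form of the fwhm_slope function used in the fits above
        return np.sqrt(a + b * energy + c * energy**2)

    peak_energies = np.array([583.2, 727.3, 860.6, 1592.5, 2103.5, 2614.5])
    fwhms = np.array([1.50, 1.65, 1.78, 2.37, 2.70, 3.00])   # made-up FWHM values (keV)
    fwhm_errs = np.full_like(fwhms, 0.05)                    # made-up uncertainties

    pars, covs = curve_fit(
        fwhm_slope, peak_energies, fwhms, sigma=fwhm_errs,
        p0=[0.2, 0.001, 0.000001], bounds=(0, [1, np.inf, np.inf]),
        absolute_sigma=True,
    )
    fwhm_qbb = fwhm_slope(2039.061, *pars)   # interpolated resolution at Qbb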
np.polynomial.polynomial.polyval(peak_energies[2:], alpha_fit), - ) - ax1.scatter([2039], qbb_alphas[ixs[0], ixs[1]]) - ax1.set_ylabel("Charge Trapping Value", ha="right", y=1) - ax2.scatter( - peak_energies[2:], - ( - alpha_points[2:] - - np.polynomial.polynomial.polyval( - peak_energies[2:], alpha_fit - ) - ) - / alpha_points[2:], - lw=1, - c="b", - ) - ax2.set_xlabel("Energy (keV)", ha="right", x=1) - ax2.set_ylabel("Residuals (%)", ha="right", y=1) - fig.suptitle(f"{detector}-{param}") - pdf.savefig() - plt.close() - except: - alphas = np.nan - else: - try: - alphas = qbb_alphas[ixs[0], ixs[1]][0] - except: - alphas = np.nan - return alphas, fwhm_dict, db_dict, out_grid - - -def get_filter_params( - grids, matched_configs, peak_energies, parameters, save_path=None -): - """ - Finds best parameters for filter - """ - - full_db_dict = {} - full_fwhm_dict = {} - full_grids = {} - - for param in parameters: - opt_dict = matched_configs[param] - peak_grids = grids[param] - ctc_params = list(peak_grids[0][0, 0].keys()) - ctc_dict = {} - - for ctc_param in ctc_params: - if ctc_param == "QDrift": - alpha, fwhm, db_dict, output_grid = get_best_vals( - peak_grids, peak_energies, ctc_param, opt_dict, save_path=save_path - ) - opt_name = list(opt_dict.keys())[0] - db_dict[opt_name].update({"alpha": alpha}) - - else: - alpha, fwhm, _, output_grid = get_best_vals( - peak_grids, peak_energies, ctc_param, opt_dict, save_path=save_path - ) - try: - full_grids[param][ctc_param] = output_grid - except: - full_grids[param] = {ctc_param: output_grid} - fwhm.update({"alpha": alpha}) - ctc_dict[ctc_param] = fwhm - full_fwhm_dict[param] = ctc_dict - full_db_dict.update(db_dict) - return full_db_dict, full_fwhm_dict, full_grids diff --git a/src/pygama/pargen/extract_tau.py b/src/pygama/pargen/extract_tau.py index 61e833994..281c92b46 100644 --- a/src/pygama/pargen/extract_tau.py +++ b/src/pygama/pargen/extract_tau.py @@ -4,156 +4,188 @@ from __future__ import annotations -import json import logging -import os -import pathlib -import pickle as pkl -from collections import OrderedDict -import matplotlib as mpl - -mpl.use("agg") import lgdo import lgdo.lh5 as lh5 import matplotlib.pyplot as plt import numpy as np +import pygama.math.binned_fitting as pgf import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -import pygama.pargen.cuts as cts import pygama.pargen.dsp_optimize as opt import pygama.pargen.energy_optimisation as om +from pygama.pargen.data_cleaning import get_mode_stdev log = logging.getLogger(__name__) sto = lh5.LH5Store() -def load_data( - raw_file: list[str], - lh5_path: str, - pulser_mask=None, - n_events: int = 10000, - threshold: int = 5000, - wf_field: str = "waveform", -) -> lgdo.Table: - df = sto.read(lh5_path, raw_file, field_mask=["daqenergy", "timestamp"])[0].view_as( - "pd" - ) - - if pulser_mask is None: - pulser_props = cts.find_pulser_properties(df, energy="daqenergy") - if len(pulser_props) > 0: - final_mask = None - for entry in pulser_props: - e_cut = (df.daqenergy.values < entry[0] + entry[1]) & ( - df.daqenergy.values > entry[0] - entry[1] - ) - if final_mask is None: - final_mask = e_cut - else: - final_mask = final_mask | e_cut - ids = final_mask - log.debug(f"pulser found: {pulser_props}") +class ExtractTau: + def __init__(self, dsp_config, wf_field, debug_mode=False): + self.dsp_config = dsp_config + self.wf_field = wf_field + self.output_dict = {} + self.results_dict = {} + self.debug_mode = debug_mode + + def get_decay_constant( + self, slopes: 
np.array, wfs: lgdo.WaveformTable, display: int = 0 + ) -> dict: + """ + Finds the decay constant from the modal value of the tail slope after cuts + and saves it to the specified json. Updates self.output_dict with tau value + + Parameters + ---------- + - slopes: numpy array of tail slopes + - wfs: WaveformTable object containing waveform data + - display: integer indicating the level of display (0: no display, 1: plot histogram, 2: show histogram) + + Returns + ------- + - out_plot_dict: dictionary containing the plot figure (only returned if display > 0) + """ + + mode, stdev = get_mode_stdev(slopes) + tau = round(-1 / (mode), 1) + err = round((-1 / (mode + (stdev / np.sqrt(len(slopes))))) - tau, 1) + + sampling_rate = wfs["dt"].nda[0] + units = wfs["dt"].attrs["units"] + tau = f"{tau*sampling_rate}*{units}" + + if "pz" in self.output_dict: + self.output_dict["pz"].update({"tau": tau, "tau_err": err}) else: - log.debug("no_pulser") - ids = np.zeros(len(df.daqenergy.values), dtype=bool) - else: - ids = pulser_mask - - cuts = np.where((df.daqenergy.values > threshold) & (~ids))[0] - - waveforms = sto.read(f"{lh5_path}/{wf_field}", raw_file, idx=cuts, n_rows=n_events)[ - 0 - ] - baseline = sto.read(f"{lh5_path}/baseline", raw_file, idx=cuts, n_rows=n_events)[0] - tb_data = lh5.Table(col_dict={f"{wf_field}": waveforms, "baseline": baseline}) - return tb_data - - -def get_decay_constant( - slopes: np.array, wfs: lgdo.WaveformTable, display: int = 0 -) -> dict: - """ - Finds the decay constant from the modal value of the tail slope after cuts - and saves it to the specified json. - - Parameters - ---------- - slopes : array - tail slope array - - dict_file : str - path to json file to save decay constant value to. - It will be saved as a dictionary of form {'pz': {'tau': decay_constant}} - - Returns - ------- - tau_dict : dict - """ - tau_dict = {} + self.output_dict["pz"] = {"tau": tau, "tau_err": err} - pz = tau_dict.get("pz") - - counts, bins, var = pgh.get_hist(slopes, bins=100000, range=(-0.01, 0)) - bin_centres = pgh.get_bin_centers(bins) - high_bin = bin_centres[np.argmax(counts)] - try: - pars, cov = pgf.gauss_mode_width_max( - counts, - bins, - n_bins=10, - cost_func="Least Squares", - inflate_errors=False, - gof_method="var", + self.results_dict.update( + {"single_decay_constant": {"slope_pars": {"mode": mode, "stdev": stdev}}} ) - if np.abs(np.abs(pars[0] - high_bin) / high_bin) > 0.05: - raise ValueError - high_bin = pars[0] - except: - pass - tau = round(-1 / (high_bin), 1) + if display <= 0: + return + else: + out_plot_dict = {} + + return out_plot_dict + + def get_dpz_consts(self, grid_out, opt_dict): + std_grid = np.ndarray(shape=grid_out.shape) + for i in range(grid_out.shape[0]): + for j in range(grid_out.shape[1]): + std_grid[i, j] = grid_out[i, j]["y_val"] + min_point = np.where(std_grid == np.amin(std_grid)) + + opt_name = list(opt_dict.keys())[0] + keys = list(opt_dict[opt_name].keys()) + param_list = [] + shape = [] + db_dict = {} + for key in keys: + param_dict = opt_dict[opt_name][key] + grid_axis = np.arange( + param_dict["start"], param_dict["end"], param_dict["spacing"] + ) + unit = param_dict.get("unit") + param_list.append(grid_axis) + shape.append(len(grid_axis)) + for i, key in enumerate(keys): + unit = opt_dict[opt_name][key].get("unit") + + if unit is not None: + try: + db_dict[opt_name].update( + {key: f"{param_list[i][min_point[i]][0]}*{unit}"} + ) + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + 
db_dict[opt_name] = { + key: f"{param_list[i][min_point[i]][0]}*{unit}" + } + else: + try: + db_dict[opt_name].update({key: f"{param_list[i][min_point[i]][0]}"}) + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + db_dict[opt_name] = {key: f"{param_list[i][min_point[i]][0]}"} + return db_dict + + def calculate_dpz(self, tb_data, opt_dict): + log.debug("Calculating double pz constants") + pspace = om.set_par_space(opt_dict) + grid_out = opt.run_grid( + tb_data, self.dsp_config, pspace, fom_dpz, self.output_dict, fom_kwargs=None + ) + out_dict = self.get_dpz_consts(grid_out, opt_dict) + if "pz" in self.output_dict: + self.output_dict["pz"].update(out_dict["pz"]) + else: + self.output_dict["pz"] = out_dict["pz"] - sampling_rate = wfs["dt"].nda[0] - units = wfs["dt"].attrs["units"] - tau = f"{tau*sampling_rate}*{units}" + def plot_waveforms_after_correction( + self, tb_data, wf_field, norm_param=None, display=0 + ): + tb_out = opt.run_one_dsp(tb_data, self.dsp_config, db_dict=self.output_dict) + wfs = tb_out[wf_field]["values"].nda + wf_idxs = np.random.choice(len(wfs), 100) + if norm_param is not None: + means = tb_out[norm_param].nda[wf_idxs] + wfs = np.divide(wfs[wf_idxs], np.reshape(means, (len(wf_idxs), 1))) + else: + wfs = wfs[wf_idxs] + fig = plt.figure() + for wf in wfs: + plt.plot(np.arange(0, len(wf), 1), wf) + plt.axhline(1, color="black") + plt.axhline(0, color="black") + plt.xlabel("Samples") + plt.ylabel("ADU") + plot_dict = {"waveforms": fig} + if display > 1: + plt.show() + else: + plt.close() + return plot_dict - tau_dict["pz"] = {"tau": tau} - if display > 0: - out_plot_dict = {} + def plot_slopes(self, slopes, display=0): + high_bin = self.results_dict["single_decay_constant"]["slope_pars"]["mode"] + sigma = self.results_dict["single_decay_constant"]["slope_pars"]["stdev"] plt.rcParams["figure.figsize"] = (10, 6) plt.rcParams["font.size"] = 8 fig, ax = plt.subplots() - bins = np.linspace(-0.01, 0, 100000) # change if needed + bins = np.arange( + np.nanpercentile(slopes, 1), + np.nanpercentile(slopes, 99), + np.nanpercentile(slopes, 51) - np.nanpercentile(slopes, 50), + ) counts, bins, bars = ax.hist(slopes, bins=bins, histtype="step") - plot_max = np.argmax(counts) - in_min = plot_max - 20 - if in_min < 0: - in_min = 0 - in_max = plot_max + 21 - if in_max >= len(bins): - in_min = len(bins) - 1 + ax.axvline(high_bin, color="red") + in_min = high_bin - 4 * sigma + in_max = high_bin + 4 * sigma plt.xlabel("Slope") plt.ylabel("Counts") - plt.yscale("log") - axins = ax.inset_axes([0.5, 0.45, 0.47, 0.47]) + axins = ax.inset_axes([0.6, 0.6, 0.4, 0.4]) axins.hist( - slopes[(slopes > bins[in_min]) & (slopes < bins[in_max])], - bins=200, + slopes[(slopes > in_min) & (slopes < in_max)], + bins=50, histtype="step", ) axins.axvline(high_bin, color="red") - axins.set_xlim(bins[in_min], bins[in_max]) - ax.set_xticks(ax.get_xticks()) - ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45) - out_plot_dict["slope"] = fig + axins.set_xlim(in_min, in_max) + ax.set_xlim(np.nanpercentile(slopes, 1), np.nanpercentile(slopes, 99)) + out_plot_dict = {"slope": fig} if display > 1: plt.show() else: plt.close() - return tau_dict, out_plot_dict - else: - return tau_dict + return out_plot_dict def fom_dpz(tb_data, verbosity=0, rand_arg=None): @@ -174,123 +206,7 @@ def fom_dpz(tb_data, verbosity=0, rand_arg=None): mu = pars[0] - except: + except Exception: mu = start_bins[max_idx] return {"y_val": np.abs(mu)} - - -def get_dpz_consts(grid_out, 
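The tau extraction in ExtractTau.get_decay_constant above reduces to taking the modal tail slope, converting it to a decay constant in samples, and scaling by the waveform sampling period. A rough sketch with made-up slopes, a crude stand-in for get_mode_stdev, and an assumed 16 ns sampling period:

    import numpy as np

    slopes = np.random.normal(-2.0e-4, 1.0e-5, 5000)   # made-up tail slopes per waveform sample
    mode, stdev = np.median(slopes), np.std(slopes)     # crude stand-in for get_mode_stdev
    tau_samples = round(-1 / mode, 1)                   # decay constant in samples
    err = round(-1 / (mode + stdev / np.sqrt(len(slopes))) - tau_samples, 1)
    tau = f"{tau_samples * 16}*ns"                      # scaled by the assumed 16 ns sampling period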
opt_dict): - std_grid = np.ndarray(shape=grid_out.shape) - for i in range(grid_out.shape[0]): - for j in range(grid_out.shape[1]): - std_grid[i, j] = grid_out[i, j]["y_val"] - min_val = np.amin(std_grid) - min_point = np.where(std_grid == np.amin(std_grid)) - - opt_name = list(opt_dict.keys())[0] - keys = list(opt_dict[opt_name].keys()) - param_list = [] - shape = [] - db_dict = {} - for key in keys: - param_dict = opt_dict[opt_name][key] - grid_axis = np.arange( - param_dict["start"], param_dict["end"], param_dict["spacing"] - ) - unit = param_dict.get("unit") - param_list.append(grid_axis) - shape.append(len(grid_axis)) - for i, key in enumerate(keys): - unit = opt_dict[opt_name][key].get("unit") - - if unit is not None: - try: - db_dict[opt_name].update( - {key: f"{param_list[i][min_point[i]][0]}*{unit}"} - ) - except: - db_dict[opt_name] = {key: f"{param_list[i][min_point[i]][0]}*{unit}"} - else: - try: - db_dict[opt_name].update({key: f"{param_list[i][min_point[i]][0]}"}) - except: - db_dict[opt_name] = {key: f"{param_list[i][min_point[i]][0]}"} - return db_dict - - -def dsp_preprocess_decay_const( - tb_data, - dsp_config: dict, - double_pz: bool = False, - display: int = 0, - opt_dict: dict = None, - wf_field: str = "waveform", - wf_plot: str = "wf_pz", - norm_param: str = "pz_mean", - cut_parameters: dict = {"bl_mean": 4, "bl_std": 4, "bl_slope": 4}, -) -> dict: - """ - This function calculates the pole zero constant for the input data - - Parameters - ---------- - f_raw : str - The raw file to run the macro on - dsp_config: str - Path to the dsp config file, this is a stripped down version which just includes cuts and slope of decay tail - channel: str - Name of channel to process, should be name of lh5 group in raw files - - Returns - ------- - tau_dict : dict - """ - - tb_out = opt.run_one_dsp(tb_data, dsp_config) - log.debug("Processed Data") - cut_dict = cts.generate_cuts(tb_out, parameters=cut_parameters) - log.debug("Generated Cuts:", cut_dict) - idxs = cts.get_cut_indexes(tb_out, cut_dict) - log.debug("Applied cuts") - log.debug(f"{len(idxs)} events passed cuts") - slopes = tb_out["tail_slope"].nda - log.debug("Calculating pz constant") - if display > 0: - tau_dict, plot_dict = get_decay_constant( - slopes[idxs], tb_data[wf_field], display=display - ) - else: - tau_dict = get_decay_constant(slopes[idxs], tb_data[wf_field]) - if double_pz == True: - log.debug("Calculating double pz constants") - pspace = om.set_par_space(opt_dict) - grid_out = opt.run_grid( - tb_data, dsp_config, pspace, fom_dpz, tau_dict, fom_kwargs=None - ) - out_dict = get_dpz_consts(grid_out, opt_dict) - tau_dict["pz"].update(out_dict["pz"]) - if display > 0: - tb_out = opt.run_one_dsp(tb_data, dsp_config, db_dict=tau_dict) - wfs = tb_out[wf_plot]["values"].nda[idxs] - wf_idxs = np.random.choice(len(wfs), 100) - if norm_param is not None: - means = tb_out[norm_param].nda[idxs][wf_idxs] - wfs = np.divide(wfs[wf_idxs], np.reshape(means, (len(wf_idxs), 1))) - else: - wfs = wfs[wf_idxs] - fig2 = plt.figure() - for wf in wfs: - plt.plot(np.arange(0, len(wf), 1), wf) - plt.axhline(1, color="black") - plt.axhline(0, color="black") - plt.xlabel("Samples") - plt.ylabel("ADU") - plot_dict["waveforms"] = fig2 - if display > 1: - plt.show() - else: - plt.close() - return tau_dict, plot_dict - else: - return tau_dict diff --git a/src/pygama/pargen/lq_cal.py b/src/pygama/pargen/lq_cal.py index aa3c15806..6457de04d 100644 --- a/src/pygama/pargen/lq_cal.py +++ b/src/pygama/pargen/lq_cal.py @@ -1,33 +1,21 @@ from 
__future__ import annotations -import json import logging -import os -import pathlib import re from datetime import datetime -from typing import Callable -import matplotlib as mpl - -mpl.use("agg") - -import lgdo.lh5_store as lh5 -import matplotlib.cm as cmx import matplotlib.colors as mcolors import matplotlib.dates as mdates import matplotlib.pyplot as plt import numpy as np import pandas as pd -from iminuit import Minuit, cost, util -from matplotlib.backends.backend_pdf import PdfPages +from iminuit import Minuit, cost from matplotlib.colors import LogNorm from scipy.stats import linregress import pygama.math.histogram as pgh -import pygama.math.peak_fitting as pgf -import pygama.pargen.AoE_cal as aoe -from pygama.pargen.utils import * +import pygama.pargen.AoE_cal as AoE +from pygama.math.distributions import gaussian log = logging.getLogger(__name__) @@ -111,7 +99,7 @@ def binned_lq_fit( lq_param: str, cal_energy_param: str, peak: float, - cdf=pgf.gauss_cdf, + cdf=gaussian, sidebands: bool = True, ): """Function for fitting a distribution of LQ values within a specified @@ -159,8 +147,8 @@ def binned_lq_fit( mu = bin_centers[np.argmax(hist)] _, sigma, _ = pgh.get_gaussian_guess(hist, bins) - c1 = cost.BinnedNLL(hist, bins, pgf.gauss_cdf, verbose=0) - m1 = Minuit(c1, mu, sigma) + c1 = cost.BinnedNLL(hist, bins, gaussian.get_cdf, verbose=0) + m1 = Minuit(c1, mu=mu, sigma=sigma) m1.simplex().migrad() m1.hesse() @@ -220,8 +208,7 @@ def fit_time_means(tstamps, means, reses): return out_dict -class cal_lq: - +class LQCal: """A class for calibrating the LQ parameter and determining the LQ cut value""" def __init__( @@ -229,9 +216,9 @@ def __init__( cal_dicts: dict, cal_energy_param: str, eres_func: callable, - cdf: callable = pgf.gauss_cdf, + cdf: callable = gaussian, selection_string: str = "is_valid_cal&is_not_pulser", - plot_options: dict = {}, + debug_mode=False, ): """ Parameters @@ -257,7 +244,7 @@ def __init__( self.eres_func = eres_func self.cdf = cdf self.selection_string = selection_string - self.plot_options = plot_options + self.debug_mode = debug_mode def update_cal_dicts(self, update_dict): if re.match(r"(\d{8})T(\d{6})Z", list(self.cal_dicts)[0]): @@ -277,17 +264,9 @@ def lq_timecorr(self, df, lq_param, output_name="LQ_Timecorr", display=0): """ log.info("Starting LQ time correction") - self.timecorr_df = pd.DataFrame( - columns=["run_timestamp", "mean", "mean_err", "res", "res_err"] - ) + self.timecorr_df = pd.DataFrame() try: if "run_timestamp" in df: - tstamps = sorted(np.unique(df["run_timestamp"])) - means = [] - errors = [] - reses = [] - res_errs = [] - final_tstamps = [] for tstamp, time_df in df.groupby("run_timestamp", sort=True): try: pars, errs, _, _ = binned_lq_fit( @@ -318,7 +297,12 @@ def lq_timecorr(self, df, lq_param, output_name="LQ_Timecorr", display=0): ), ] ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) + self.timecorr_df = pd.concat( [ self.timecorr_df, @@ -335,6 +319,7 @@ def lq_timecorr(self, df, lq_param, output_name="LQ_Timecorr", display=0): ), ] ) + self.timecorr_df.set_index("run_timestamp", inplace=True) time_dict = fit_time_means( np.array(self.timecorr_df.index), @@ -373,6 +358,7 @@ def lq_timecorr(self, df, lq_param, output_name="LQ_Timecorr", display=0): pd.DataFrame( [ { + "run_timestamp": np.nan, "mean": pars["mu"], "mean_err": errs["mu"], "res": pars["sigma"] / pars["mu"], @@ -386,13 +372,18 @@ def lq_timecorr(self, df, lq_param, output_name="LQ_Timecorr", display=0): ), 
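The binned fit used by binned_lq_fit above follows the standard iminuit pattern: histogram the LQ values, build a BinnedNLL cost from a Gaussian CDF, and run simplex/migrad/hesse. A self-contained sketch with made-up data, using scipy's normal CDF as a stand-in for pygama's gaussian.get_cdf and simplified starting guesses:

    import numpy as np
    from iminuit import Minuit, cost
    from scipy.stats import norm

    def gauss_cdf(x, mu, sigma):
        # stand-in for pygama.math.distributions.gaussian.get_cdf
        return norm.cdf(x, mu, sigma)

    lq_values = np.random.normal(0.0, 0.1, 2000)    # made-up LQ values in a peak window
    hist, bins = np.histogram(lq_values, bins=100)

    c1 = cost.BinnedNLL(hist, bins, gauss_cdf)
    m1 = Minuit(c1, mu=np.mean(lq_values), sigma=np.std(lq_values))
    m1.simplex().migrad()
    m1.hesse()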
] ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) self.timecorr_df = pd.concat( [ self.timecorr_df, pd.DataFrame( [ { + "run_timestamp": np.nan, "mean": np.nan, "mean_err": np.nan, "res": np.nan, @@ -412,7 +403,11 @@ def lq_timecorr(self, df, lq_param, output_name="LQ_Timecorr", display=0): } ) log.info("LQ time correction finished") - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("LQ time correction failed") self.update_cal_dicts( { @@ -434,7 +429,6 @@ def drift_time_correction( log.info("Starting LQ drift time correction") try: - dt_dict = {} pars = binned_lq_fit(df, lq_param, self.cal_energy_param, peak=1592.5)[0] mean = pars[0] sigma = pars[1] @@ -456,7 +450,11 @@ def drift_time_correction( df[lq_param] - df["dt_eff"] * self.dt_fit_pars[0] - self.dt_fit_pars[1] ) - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("LQ drift time correction failed") self.dt_fit_pars = (np.nan, np.nan) @@ -491,7 +489,11 @@ def get_cut_lq_dep(self, df: pd.DataFrame(), lq_param: str, cal_energy_param: st df["LQ_Cut"] = df[lq_param] < self.cut_val - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) log.error("LQ cut determination failed") self.cut_val = np.nan @@ -504,24 +506,6 @@ def get_cut_lq_dep(self, df: pd.DataFrame(), lq_param: str, cal_energy_param: st } ) - def get_results_dict(self): - return { - "cal_energy_param": self.cal_energy_param, - "rt_correction": self.dt_fit_pars, - "cdf": self.cdf.__name__, - "1590-1596keV": self.timecorr_df.to_dict("index"), - "cut_value": self.cut_val, - "sfs": self.low_side_sf.to_dict("index"), - } - - def fill_plot_dict(self, data, plot_dict={}): - for key, item in self.plot_options.items(): - if item["options"] is not None: - plot_dict[key] = item["function"](self, data, **item["options"]) - else: - plot_dict[key] = item["function"](self, data) - return plot_dict - def calibrate(self, df, initial_lq_param): """Run the LQ calibration and calculate the cut value""" @@ -540,7 +524,7 @@ def calibrate(self, df, initial_lq_param): final_lq_param = "LQ_Classifier" peaks_of_interest = [1592.5, 1620.5, 2039, 2103.53, 2614.50] - self.low_side_sf = pd.DataFrame(columns=["peak", "sf", "sf_err"]) + self.low_side_sf = pd.DataFrame() fit_widths = [(40, 25), (25, 40), (0, 0), (25, 40), (50, 50)] self.low_side_peak_dfs = {} @@ -556,13 +540,14 @@ def calibrate(self, df, initial_lq_param): f"({self.cal_energy_param}>{peak-emin})&({self.cal_energy_param}<{peak+emax})" ) - cut_df, sf, sf_err = aoe.compton_sf_sweep( + cut_df, sf, sf_err = AoE.compton_sf_sweep( peak_df[self.cal_energy_param].to_numpy(), peak_df[final_lq_param].to_numpy(), self.cut_val, peak, fwhm, cut_range=(0, 0.6), + n_samples=10, mode="less", ) self.low_side_sf = pd.concat( @@ -574,16 +559,19 @@ def calibrate(self, df, initial_lq_param): self.low_side_peak_dfs[peak] = cut_df else: emin, emax = fit_widths[i] + fit_range = (peak - emin, peak + emax) peak_df = select_df.query( - f"({self.cal_energy_param}>{peak-emin})&({self.cal_energy_param}<{peak+emax})" + f"({self.cal_energy_param}>{fit_range[0]})&({self.cal_energy_param}<{fit_range[1]})" ) - cut_df, sf, sf_err = aoe.get_sf_sweep( + cut_df, sf, sf_err = AoE.get_sf_sweep( peak_df[self.cal_energy_param].to_numpy(), peak_df[final_lq_param].to_numpy(), self.cut_val, peak, fwhm, + 
fit_range=fit_range, cut_range=(0, 0.6), + n_samples=10, mode="less", ) self.low_side_sf = pd.concat( @@ -594,7 +582,11 @@ def calibrate(self, df, initial_lq_param): ) self.low_side_peak_dfs[peak] = cut_df log.info(f"{peak}keV: {sf:2.1f} +/- {sf_err:2.1f} %") - except: + except BaseException as e: + if e == KeyboardInterrupt: + raise (e) + elif self.debug_mode: + raise (e) self.low_side_sf = pd.concat( [ self.low_side_sf, @@ -606,7 +598,7 @@ def calibrate(self, df, initial_lq_param): def plot_lq_mean_time( - lq_class, data, lq_param="LQ_Timecorr", figsize=[12, 8], fontsize=12 + lq_class, data, lq_param="LQ_Timecorr", figsize=(12, 8), fontsize=12 ) -> plt.figure: """Plots the mean LQ value calculated for each given timestamp""" @@ -647,18 +639,18 @@ def plot_lq_mean_time( color="yellow", alpha=0.2, ) - # except: + # except Exception: # pass ax.set_xlabel("time") ax.set_ylabel("LQ mean") - myFmt = mdates.DateFormatter("%b %d") - ax.xaxis.set_major_formatter(myFmt) + myfmt = mdates.DateFormatter("%b %d") + ax.xaxis.set_major_formatter(myfmt) plt.close() return fig def plot_drift_time_correction( - lq_class, data, lq_param="LQ_Timecorr", figsize=[12, 8], fontsize=12 + lq_class, data, lq_param="LQ_Timecorr", figsize=(12, 8), fontsize=12 ) -> plt.figure: """Plots a 2D histogram of LQ versus effective drift time in a 6 keV window around the DEP. Additionally plots the fit results for the @@ -696,7 +688,7 @@ def plot_drift_time_correction( plt.title("LQ versus Drift Time for DEP") - except: + except Exception: pass plt.tight_layout() @@ -704,7 +696,7 @@ def plot_drift_time_correction( return fig -def plot_lq_cut_fit(lq_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: +def plot_lq_cut_fit(lq_class, data, figsize=(12, 8), fontsize=12) -> plt.figure: """Plots the final histogram of LQ values for events in the DEP, and the fit results used for determining the cut value""" @@ -723,7 +715,7 @@ def plot_lq_cut_fit(lq_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: dx = np.diff(bins) ax1.plot( xs, - pgf.gauss_pdf(xs, fit_pars[0], fit_pars[1], ls) * dx, + gaussian.pdf_norm(xs, fit_pars[0], fit_pars[1]) * dx * ls, label="Gaussian Fit", ) @@ -733,13 +725,13 @@ def plot_lq_cut_fit(lq_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: bin_centers = (bins[:-1] + bins[1:]) / 2 reses = ( - hist - (pgf.gauss_pdf(bin_centers, fit_pars[0], fit_pars[1], ls) * dx) - ) / (pgf.gauss_pdf(bin_centers, fit_pars[0], fit_pars[1], ls) * dx) + hist - (gaussian.pdf_norm(bin_centers, fit_pars[0], fit_pars[1]) * dx * ls) + ) / (gaussian.pdf_norm(bin_centers, fit_pars[0], fit_pars[1]) * dx * ls) ax2.plot(bin_centers, reses, marker="s", linestyle="") ax2.set_xlabel("LQ") ax2.set_ylabel("residuals") - except: + except Exception: pass plt.tight_layout() @@ -748,7 +740,7 @@ def plot_lq_cut_fit(lq_class, data, figsize=[12, 8], fontsize=12) -> plt.figure: def plot_survival_fraction_curves( - lq_class, data, figsize=[12, 8], fontsize=12 + lq_class, data, figsize=(12, 8), fontsize=12 ) -> plt.figure: """Plots the survival fraction curves as a function of LQ cut values for every peak of interest""" @@ -772,11 +764,11 @@ def plot_survival_fraction_curves( survival_df.index, survival_df["sf"], yerr=survival_df["sf_err"], - label=f'{aoe.get_peak_label(peak)} {peak} keV: {lq_class.low_side_sf.loc[peak]["sf"]:2.1f} +/- {lq_class.low_side_sf.loc[peak]["sf_err"]:2.1f} %', + label=f'{AoE.get_peak_label(peak)} {peak} keV: {lq_class.low_side_sf.loc[peak]["sf"]:2.1f} +/- {lq_class.low_side_sf.loc[peak]["sf_err"]:2.1f} %', ) 
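At a single cut value, and ignoring the background subtraction and uncertainties handled by AoE.compton_sf_sweep and AoE.get_sf_sweep, the low-side survival fractions tabulated in calibrate() above are just the percentage of events whose classifier falls below the cut (mode="less"). A toy example with made-up classifier values:

    import numpy as np

    lq_classifier = np.random.normal(0.0, 0.2, 10_000)   # made-up LQ_Classifier values in a peak window
    cut_val = 0.3
    sf = 100 * np.count_nonzero(lq_classifier < cut_val) / len(lq_classifier)
    print(f"survival fraction: {sf:2.1f} %")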
- except: + except Exception: pass - except: + except Exception: pass vals, labels = plt.yticks() plt.yticks(vals, [f"{x:,.0f} %" for x in vals]) @@ -789,7 +781,7 @@ def plot_survival_fraction_curves( def plot_sf_vs_energy( - lq_class, data, xrange=(900, 3000), n_bins=701, figsize=[12, 8], fontsize=12 + lq_class, data, xrange=(900, 3000), n_bins=701, figsize=(12, 8), fontsize=12 ) -> plt.figure: """Plots the survival fraction as a function of energy""" @@ -812,7 +804,7 @@ def plot_sf_vs_energy( survival_fracs = counts_pass / (counts + 10**-99) plt.step(pgh.get_bin_centers(bins_pass), 100 * survival_fracs) - except: + except Exception: pass plt.ylim([0, 100]) vals, labels = plt.yticks() @@ -830,7 +822,7 @@ def plot_spectra( n_bins=2101, xrange_inset=(1580, 1640), n_bins_inset=200, - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ) -> plt.figure: """Plots a 2D histogram of the LQ classifier vs calibrated energy""" @@ -903,7 +895,7 @@ def plot_spectra( bins=bins, histtype="step", ) - except: + except Exception: pass ax.set_xlim(xrange) ax.set_yscale("log") @@ -922,7 +914,7 @@ def plot_classifier( yrange=(-2, 8), xn_bins=700, yn_bins=500, - figsize=[12, 8], + figsize=(12, 8), fontsize=12, ) -> plt.figure: plt.rcParams["figure.figsize"] = figsize @@ -939,7 +931,7 @@ def plot_classifier( ], norm=LogNorm(), ) - except: + except Exception: pass plt.xlabel("energy (keV)") plt.ylabel(lq_param) diff --git a/src/pygama/pargen/mse_psd.py b/src/pygama/pargen/mse_psd.py deleted file mode 100644 index 11a78f65b..000000000 --- a/src/pygama/pargen/mse_psd.py +++ /dev/null @@ -1,350 +0,0 @@ -""" -- get_avse_cut (does AvsE) -- get_ae_cut (does A/E) -""" - -import matplotlib.gridspec as gs -import matplotlib.pyplot as plt -import numpy as np -from matplotlib.colors import LogNorm - -from pygama.math.histogram import get_bin_centers -from pygama.math.peak_fitting import * - - -def get_avse_cut(e_cal, current, plotFigure=None): - # DEP range and nearby BG for BG subtraction - dep_idxs = (e_cal > 1585) & ( - e_cal < 1595 - ) # (asymmetric around 1592 to account for CT) - bg_idxs = (e_cal > 1560) & (e_cal < 1570) - - # SEP range and nearby BG for BG subtraction - sep_idxs = (e_cal > 2090) & (e_cal < 2115) - bg_sep_idxs = (e_cal > 2060) & (e_cal < 2080) - - # Bin the data in 2D, 25 keV bins - # xedges = np.arange(200,2300,25) - # e_cent = get_bin_centers(xedges) - # y_max = np.zeros_like(e_cent) - # plt.ion() - # plt.figure() - - compton_shoulder = 2614 * (1 - 1.0 / (1 + 2 * 2614 / 511)) - - # peaks = [238,510,583,727,860,1078,1512,1592,1806,compton_shoulder, 2614] - # peaks = np.array(peaks) - # e_cent = peaks - # y_max = np.zeros_like(e_cent) - - xedges = np.arange(200, 2300, 25) - e_cent = get_bin_centers(xedges) - y_max = np.zeros_like(e_cent) - - # plt.ion() - # plt.figure() - for i, peak in enumerate(e_cent): - plt.clf() - # e_bin_idxs = (e_cal > peak-10) & (e_cal < peak+10 ) - e_bin_idxs = (e_cal > xedges[i]) & (e_cal < xedges[i + 1]) - a_ebin = current[e_bin_idxs] - a_5 = np.percentile(a_ebin, 25) - a_95 = np.percentile(a_ebin, 99) - - h, a_bins = np.histogram(a_ebin, bins=np.linspace(a_5, a_95, 500)) - - a_bins_cent = get_bin_centers(a_bins) - a_mode = a_bins_cent[np.argmax(h)] - y_max[i] = a_mode - - p0 = get_gaussian_guess(h, a_bins_cent) - fit_idxs = a_bins_cent > p0[0] - 5 * p0[1] - p = fit_binned(gauss, h[fit_idxs], a_bins_cent[fit_idxs], p0) - y_max[i] = p[0] - - # plt.plot(a_bins_cent,h,ls="steps") - # plt.axvline(a_mode, c="r") - # plt.title("Energy: {} keV".format(e_cent[i])) - # - # fit = 
gauss(a_bins_cent[fit_idxs], *p) - # plt.plot(a_bins_cent[fit_idxs], fit, c="g") - - # guess = gauss(a_bins_cent[fit_idxs], *p0) - # plt.plot(a_bins_cent[fit_idxs], guess, c="r") - - # inp = input("q to quit") - # if inp == "q": exit() - - # quadratic fit - # first fit a line: - p_lin = np.polyfit(e_cent, y_max, 1) - # find residuals: - resid = y_max - np.poly1d(p_lin)(e_cent) - resid_std = np.std(resid) - # a really big residual is a pulser peak: - puls_idx = resid > resid_std - - e_cent_cut = e_cent[~puls_idx] - y_max_cut = y_max[~puls_idx] - - avse_quad = np.polyfit(e_cent_cut, y_max_cut, 2) - avse_lin = np.polyfit(e_cent_cut, y_max_cut, 1) - - a_adjusted = current - np.poly1d(avse_quad)(e_cal) - - # Look at DEP, bg subtract the AvsE spectrum - h_dep, bins = np.histogram(a_adjusted[dep_idxs], bins=5000) - h_bg, bins = np.histogram(a_adjusted[bg_idxs], bins=bins) - bin_centers = get_bin_centers(bins) - h_bgs = h_dep - h_bg - # fit AvsE peak to gaussian to get the 90% cut - p0 = get_gaussian_guess(h_bgs, bin_centers) - p = fit_binned(gauss, h_bgs, bin_centers, p0) - fit = gauss(bin_centers, *p) - - ae_mean, ae_std = p[0], p[1] - ae_cut = p[0] - 1.28 * p[1] # cuts at 10% of CDF - - avse2, avse1, avse0 = avse_quad[:] - avse_cut = ae_cut - avse_mean = ae_mean - avse_std = ae_std - - # plt.figure() - # x = np.linspace(0,2700,5000) - # plt.scatter(e_cent_cut, y_max_cut, color="k", s=10) - # plt.scatter(e_cent[puls_idx], y_max[puls_idx], color="r", s=10) - # plt.plot(x, np.poly1d(avse_quad)(x)) - # plt.plot(x, np.poly1d(avse_lin)(x)) - # - # plt.figure() - # xedges = np.arange(1000,2700,25) - # aa_5 = np.percentile(a_adjusted,5) - # aa_95 = np.percentile(a_adjusted,99) - # yedges = np.linspace(aa_5, aa_95,1000) - # H, xedges, yedges = np.histogram2d(e_cal, a_adjusted, bins=( xedges, yedges)) - # plt.imshow(H.T, interpolation='nearest', origin='low', aspect="auto", - # extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], cmap="OrRd", norm=LogNorm()) - # plt.scatter(e_cent_cut, y_max_cut - np.poly1d(avse_quad)(e_cent_cut) , color="k", s=10) - # plt.axhline(ae_cut, color="k", ls="--") - # - # inp = input("q to quit") - # if inp == "q": exit() - - if plotFigure is not None: - #### - # Plot A/E distributions - ### - plt.figure(plotFigure.number) - plt.clf() - grid = gs.GridSpec(2, 2) - - ax_dep = plt.subplot(grid[0, 0]) - ax_sep = plt.subplot(grid[1, 0]) - ax_ae = plt.subplot(grid[:, 1]) - - # adjust a over e for mean - a_over_e = a_adjusted - ae_cut_mod = ae_cut - - h_dep, bins = np.histogram( - a_over_e[dep_idxs], bins=np.linspace(-12 * ae_std, 8 * ae_std, 100) - ) - h_bg, bins = np.histogram(a_over_e[bg_idxs], bins=bins) - bin_centers = bins[:-1] + 0.5 * (bins[1] - bins[0]) - h_bgs = h_dep - h_bg - - h_sep, bins = np.histogram(a_over_e[sep_idxs], bins=bins) - h_sepbg, bins = np.histogram(a_over_e[bg_sep_idxs], bins=bins) - h_bgs_sep = h_sep - h_sepbg - - # ax_ae.plot(bin_centers,h_bgs / np.sum(h_bgs), ls="steps-mid", color = "b", label = "DEP (BG subtracted)") - # ax_ae.plot(bin_centers, h_bgs_sep/ np.sum(h_bgs), ls="steps-mid", color = "g", label = "SEP (BG subtracted)") - - ax_ae.plot( - bin_centers, h_bgs, ls="steps-mid", color="b", label="DEP (BG subtracted)" - ) - ax_ae.plot( - bin_centers, - h_bgs_sep, - ls="steps-mid", - color="g", - label="SEP (BG subtracted)", - ) - ax_ae.axvline(ae_cut_mod, color="r", ls=":") - ax_ae.set_xlim(-12 * ae_std, 8 * ae_std) - ax_ae.legend(loc=2) - - ax_ae.set_xlabel("A/E value [arb]") - - ### - # Plot SEP/DEP before/after cut - ## - ae_cut_idxs = a_over_e > 
ae_cut_mod - e_cal_aepass = e_cal[ae_cut_idxs] - - pad = 50 - bin_size = 0.2 # keV - bins = np.arange(1592 - pad, 1592 + pad + bin_size, bin_size) - - ax_dep.hist( - e_cal[(e_cal > 1592 - pad) & (e_cal < 1592 + pad)], - histtype="step", - color="k", - label="DEP", - bins=bins, - ) - ax_dep.hist( - e_cal_aepass[(e_cal_aepass > 1592 - pad) & (e_cal_aepass < 1592 + pad)], - histtype="step", - color="b", - label="After Cut", - bins=bins, - ) - ax_dep.legend(loc=2) - ax_dep.set_xlabel("Energy [keV]") - - bins = np.arange(2103 - pad, 2103 + pad + bin_size, bin_size) - ax_sep.hist( - e_cal[(e_cal > 2103 - pad) & (e_cal < 2103 + pad)], - histtype="step", - color="k", - label="SEP", - bins=bins, - ) - ax_sep.hist( - e_cal_aepass[(e_cal_aepass > 2103 - pad) & (e_cal_aepass < 2103 + pad)], - histtype="step", - color="b", - label="After Cut", - bins=bins, - ) - ax_sep.legend(loc=2) - ax_sep.set_xlabel("Energy [keV]") - - return avse2, avse1, avse0, avse_cut, avse_mean, avse_std - - -def get_ae_cut(e_cal, current, plotFigure=None): - # try to get a rough A/E cut - - # DEP range and nearby BG for BG subtraction - dep_idxs = (e_cal > 1585) & ( - e_cal < 1595 - ) # (asymmetric around 1592 to account for CT) - bg_idxs = (e_cal > 1560) & (e_cal < 1570) - - # SEP range and nearby BG for BG subtraction - sep_idxs = (e_cal > 2090) & (e_cal < 2115) - bg_sep_idxs = (e_cal > 2060) & (e_cal < 2080) - - a_over_e = current / e_cal - - # # peaks = [2381] - # peaks = [1512, 1592,1620,1806,2381] - # ae_cents = np.zeros((len(peaks))) - - h_dep, bins = np.histogram(a_over_e[dep_idxs], bins=500) - h_bg, bins = np.histogram(a_over_e[bg_idxs], bins=bins) - bin_centers = get_bin_centers(bins) - h_bgs = h_dep - h_bg - - p0 = get_gaussian_guess(h_bgs, bin_centers) - p = fit_binned(gauss, h_bgs, bin_centers, p0) - fit = gauss(bin_centers, *p) - - ae_mean, ae_std = p[0], p[1] - ae_cut = p[0] - 1.28 * p[1] # cuts at 10% of CDF - - if plotFigure is not None: - #### - # Plot A/E distributions - ### - plt.figure(plotFigure.number) - plt.clf() - grid = gs.GridSpec(2, 2) - - ax_dep = plt.subplot(grid[0, 0]) - ax_sep = plt.subplot(grid[1, 0]) - ax_ae = plt.subplot(grid[:, 1]) - - # adjust a over e for mean - - a_over_e = (a_over_e - ae_mean) / ae_std - ae_cut_mod = (ae_cut - ae_mean) / ae_std - - h_dep, bins = np.histogram(a_over_e[dep_idxs], bins=np.linspace(-8, 6, 50)) - h_bg, bins = np.histogram(a_over_e[bg_idxs], bins=bins) - bin_centers = bins[:-1] + 0.5 * (bins[1] - bins[0]) - h_bgs = h_dep - h_bg - - h_sep, bins = np.histogram(a_over_e[sep_idxs], bins=bins) - h_sepbg, bins = np.histogram(a_over_e[bg_sep_idxs], bins=bins) - h_bgs_sep = h_sep - h_sepbg - - ax_ae.plot( - bin_centers, - h_bgs / np.sum(h_bgs), - ls="steps-mid", - color="b", - label="DEP (BG subtracted)", - ) - ax_ae.plot( - bin_centers, - h_bgs_sep / np.sum(h_bgs), - ls="steps-mid", - color="g", - label="SEP (BG subtracted)", - ) - # plt.plot(bin_centers, fit, color="g") - ax_ae.axvline(ae_cut_mod, color="r", ls=":") - ax_ae.set_xlim(-8, 5) - ax_ae.legend(loc=2) - - ax_ae.set_xlabel("A/E value [arb]") - - ### - # Plot SEP/DEP before/after cut - ## - ae_cut_idxs = a_over_e > ae_cut_mod - e_cal_aepass = e_cal[ae_cut_idxs] - - pad = 50 - bins = np.linspace(1592 - pad, 1592 + pad, 2 * pad + 1) - - ax_dep.hist( - e_cal[(e_cal > 1592 - pad) & (e_cal < 1592 + pad)], - histtype="step", - color="k", - label="DEP", - bins=bins, - ) - ax_dep.hist( - e_cal_aepass[(e_cal_aepass > 1592 - pad) & (e_cal_aepass < 1592 + pad)], - histtype="step", - color="b", - label="After Cut", - 
bins=bins, - ) - ax_dep.legend(loc=2) - ax_dep.set_xlabel("Energy [keV]") - - bins = np.linspace(2103 - pad, 2103 + pad, 2 * pad + 1) - ax_sep.hist( - e_cal[(e_cal > 2103 - pad) & (e_cal < 2103 + pad)], - histtype="step", - color="k", - label="SEP", - bins=bins, - ) - ax_sep.hist( - e_cal_aepass[(e_cal_aepass > 2103 - pad) & (e_cal_aepass < 2103 + pad)], - histtype="step", - color="b", - label="After Cut", - bins=bins, - ) - ax_sep.legend(loc=2) - ax_sep.set_xlabel("Energy [keV]") - - return ae_cut, ae_mean, ae_std diff --git a/src/pygama/pargen/noise_optimization.py b/src/pygama/pargen/noise_optimization.py index 96c2ce65b..ff3515ec9 100644 --- a/src/pygama/pargen/noise_optimization.py +++ b/src/pygama/pargen/noise_optimization.py @@ -3,35 +3,21 @@ This happens with a grid search performed on ENC peak. """ -import inspect -import json import logging -import os -import pathlib -import pickle as pkl -import sys import time -from collections import namedtuple import lgdo -import matplotlib as mpl - -mpl.use("agg") import matplotlib.pyplot as plt import numpy as np -import pandas as pd import scipy.stats -from iminuit import Minuit, cost, util -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib.colors import LogNorm -from scipy.interpolate import splev, splrep +from scipy.interpolate import BSpline, splev, splrep from scipy.optimize import minimize -import pygama.math.peak_fitting as pgf +from pygama.math.binned_fitting import goodness_of_fit +from pygama.math.distributions import gauss_on_uniform from pygama.math.histogram import get_hist -from pygama.pargen.cuts import generate_cuts, get_cut_indexes +from pygama.math.unbinned_fitting import fit_unbinned from pygama.pargen.dsp_optimize import run_one_dsp -from pygama.pargen.energy_optimisation import index_data log = logging.getLogger(__name__) @@ -42,7 +28,6 @@ def noise_optimization( par_dsp: dict, opt_dict: dict, lh5_path: str, - verbose: bool = False, display: int = 0, ) -> dict: """ @@ -82,7 +67,7 @@ def noise_optimization( ax.set_xscale("log") ax.set_yscale("log") ax.set_xlabel("frequency (MHz)") - ax.set_ylabel(f"power spectral density") + ax.set_ylabel("power spectral density") plot_dict = {} plot_dict["nopt"] = {"fft": {"frequency": freq, "psd": psd, "fig": fig}} @@ -91,13 +76,9 @@ def noise_optimization( result_dict = {} ene_pars = [par for par in opt_dict_par.keys()] log.info(f"\nRunning optimization for {ene_pars}") - if verbose: - print(f"\nRunning optimization for {ene_pars}") for i, x in enumerate(samples): x = f"{x:.1f}" log.info(f"\nCase {i}, par = {x} us") - if verbose: - print(f"\nCase {i}, par = {x} us") for ene_par in ene_pars: dict_str = opt_dict_par[ene_par]["dict_str"] filter_par = opt_dict_par[ene_par]["filter_par"] @@ -109,8 +90,6 @@ def noise_optimization( t1 = time.time() dsp_data = run_one_dsp(tb_data, dsp_proc_chain, db_dict=par_dsp) log.info(f"Time to process dsp data {time.time()-t1:.2f} s") - if verbose: - print(f"Time to process dsp data {time.time()-t1:.2f} s") for ene_par in ene_pars: dict_str = opt_dict_par[ene_par]["dict_str"] @@ -138,8 +117,6 @@ def noise_optimization( for ene_par in ene_pars: log.info(f"\nOptimization for {ene_par}") - if verbose: - print(f"\nOptimization for {ene_par}") dict_str = opt_dict_par[ene_par]["dict_str"] par_dict_res = result_dict[dict_str] sample_list = np.array([float(x) for x in result_dict[dict_str].keys()]) @@ -151,27 +128,19 @@ def noise_optimization( ) guess_par = sample_list[np.nanargmin(fom_list)] - if verbose: - print(f"guess par: {guess_par:.2f} 
us") tck = splrep(sample_list, fom_list, k=opt_dict["fit_deg"]) + tck = BSpline(tck[0], tck[1], tck[2]) - def spl_func(x_val): - return splev(x_val, tck) - - result = minimize(spl_func, guess_par) + result = minimize(splev, guess_par, args=(tck)) best_par = result.x[0] if (best_par < np.min(sample_list)) or (best_par > np.max(sample_list)): log.info( f"Par from minimization not accepted {best_par:.2f}, setting par to guess" ) - if verbose: - print( - f"Par from minimization not accepted {best_par:.2f}, setting par to guess" - ) best_par = guess_par - best_val = spl_func(best_par) + best_val = splev(best_par, tck) b_best_pars = np.zeros(opt_dict["n_bootstrap_samples"]) for i in range(opt_dict["n_bootstrap_samples"]): @@ -181,8 +150,6 @@ def spl_func(x_val): b_best_pars[i] = b_sample_list[np.nanargmin(b_fom_list)] best_par_err = np.std(b_best_pars) log.info(f"best par: {best_par:.2f} ± {best_par_err:.2f} us") - if verbose: - print(f"best par: {best_par:.2f} ± {best_par_err:.2f} us") par_dict_res["best_par"] = best_par par_dict_res["best_par_err"] = best_par_err @@ -210,8 +177,6 @@ def spl_func(x_val): ) ax.plot(bc, hist, ds="steps", label=string_res) log.info(string_res) - if verbose: - print(string_res) ax.set_xlabel("energy (ADC)") ax.set_ylabel("counts") ax.legend(loc="upper right") @@ -233,7 +198,7 @@ def spl_func(x_val): capsize=4, label="samples", ) - ax.plot(samples_val, spl_func(samples_val), "k:", label="fit") + ax.plot(samples_val, splev(samples_val, tck), "k:", label="fit") ax.errorbar( best_par, best_val, @@ -256,8 +221,6 @@ def spl_func(x_val): plot_dict["nopt"][dict_str] = par_dict_res log.info(f"Time to complete the optimization {time.time()-t0:.2f} s") - if verbose: - print(f"Time to complete the optimization {time.time()-t0:.2f} s") if display > 0: return res_dict, plot_dict else: @@ -286,12 +249,12 @@ def simple_gaussian_fit(energies, dx=1, sigma_thr=4, allowed_p_val=1e-20): fit_range = [np.percentile(energies, 0.2), np.percentile(energies, 99.8)] hist, bins, var = get_hist(energies, range=fit_range, dx=dx) - guess, bounds = simple_gaussian_guess(hist, bins, pgf.extended_gauss_pdf) + guess, bounds = simple_gaussian_guess(hist, bins, gauss_on_uniform) fit_range = [guess[0] - sigma_thr * guess[1], guess[0] + sigma_thr * guess[1]] energies_fit = energies[(energies > fit_range[0]) & (energies < fit_range[1])] - pars, errs, cov = pgf.fit_unbinned( - pgf.extended_gauss_pdf, + pars, errs, cov = fit_unbinned( + gauss_on_uniform.pdf_ext, energies_fit, guess=guess, bounds=bounds, @@ -304,20 +267,20 @@ def simple_gaussian_fit(energies, dx=1, sigma_thr=4, allowed_p_val=1e-20): hist, bins, var = get_hist(energies_fit, range=fit_range, dx=dx) gof_pars = pars gof_pars[2] *= dx - chisq, dof = pgf.goodness_of_fit( - hist, bins, None, pgf.gauss_pdf, gof_pars, method="Pearson" + chisq, dof = goodness_of_fit( + hist, bins, None, gauss_on_uniform.pdf_norm, gof_pars, method="Pearson" ) p_val = scipy.stats.chi2.sf(chisq, dof + len(gof_pars)) if ( - sum(sum(c) if c is not None else 0 for c in cov[:3, :][:, :3]) == np.inf - or sum(sum(c) if c is not None else 0 for c in cov[:3, :][:, :3]) == 0 - or np.isnan(sum(sum(c) if c is not None else 0 for c in cov[:3, :][:, :3])) + sum(sum(c) if c is not None else 0 for c in cov[2:, :][:, 2:]) == np.inf + or sum(sum(c) if c is not None else 0 for c in cov[2:, :][:, 2:]) == 0 + or np.isnan(sum(sum(c) if c is not None else 0 for c in cov[2:, :][:, 2:])) ): log.debug("fit failed, cov estimation failed") fit_failed = True - elif (np.abs(np.array(errs)[:3] / 
np.array(pars)[:3]) < 1e-7).any() or np.isnan( - np.array(errs)[:3] + elif (np.abs(np.array(errs)[2:] / np.array(pars)[2:]) < 1e-7).any() or np.isnan( + np.array(errs)[2:] ).any(): log.debug("fit failed, parameter error too low") fit_failed = True @@ -328,16 +291,16 @@ def simple_gaussian_fit(energies, dx=1, sigma_thr=4, allowed_p_val=1e-20): fit_failed = False if fit_failed: - log.debug(f"Returning values from guess") + log.debug("Returning values from guess") mu = guess[0] mu_err = 0 fwhm = guess[1] * 2 * np.sqrt(2 * np.log(2)) fwhm_err = 0 results = { - "pars": pars[:3], - "errors": errs[:3], - "covariance": cov[:3], + "pars": pars, + "errors": errs, + "covariance": cov, "mu": mu, "mu_err": mu_err, "fom": fwhm, @@ -372,18 +335,18 @@ def simple_gaussian_guess(hist, bins, func, toll=0.2): n_sig = np.sum(hist[min_idx:max_idx]) - guess = [mu, sigma, n_sig] - bounds = [ - (mu - sigma, mu + sigma), - (sigma - sigma * toll, sigma + sigma * toll), - (n_sig + n_sig * toll, n_sig + n_sig * toll), - ] - - for i, par in enumerate(inspect.getfullargspec(func)[0][1:]): - if par == "lower_range" or par == "upper_range": - guess.append(np.inf) - bounds.append(None) - elif par == "n_bkg" or par == "hstep" or par == "components": - guess.append(0) - bounds.append(None) + guess = {"mu": mu, "sigma": sigma, "n_sig": n_sig} + bounds = { + "mu": (mu - sigma, mu + sigma), + "sigma": (sigma - sigma * toll, sigma + sigma * toll), + "n_sig": (n_sig + n_sig * toll, n_sig + n_sig * toll), + } + + for par in func.required_args(): + if par == "x_lo" or par == "x_hi": + guess[par] = np.inf + bounds[par] = None + elif par == "n_bkg" or par == "hstep": + guess[par] = 0 + bounds[par] = None return guess, bounds diff --git a/src/pygama/pargen/utils.py b/src/pygama/pargen/utils.py index a1ec229ab..91002a007 100644 --- a/src/pygama/pargen/utils.py +++ b/src/pygama/pargen/utils.py @@ -5,23 +5,33 @@ import numpy as np import pandas as pd -from iminuit import Minuit, cost, util -from lgdo import Table, lh5 +from iminuit import Minuit, cost +from lgdo import lh5 log = logging.getLogger(__name__) sto = lh5.LH5Store() +def convert_to_minuit(pars, func): + try: + c = cost.UnbinnedNLL(np.array([0]), func.pdf_ext) + except AttributeError: + c = cost.UnbinnedNLL(np.array([0]), func) + if isinstance(pars, dict): + m = Minuit(c, **pars) + else: + m = Minuit(c, *pars) + return m + + def return_nans(input): if isinstance(input, FunctionType): args = input.__code__.co_varnames[: input.__code__.co_argcount][1:] - c = cost.UnbinnedNLL(np.array([0]), input) - m = Minuit(c, *[np.nan for arg in args]) + m = convert_to_minuit(np.full(len(args), np.nan), input) return m.values, m.errors, np.full((len(m.values), len(m.values)), np.nan) else: - args = input.pdf.__code__.co_varnames[: input.pdf.__code__.co_argcount][1:] - c = cost.UnbinnedNLL(np.array([0]), input.pdf) - m = Minuit(c, *[np.nan for arg in args]) + args = input.required_args() + m = convert_to_minuit(np.full(len(args), np.nan), input) return m.values, m.errors, np.full((len(m.values), len(m.values)), np.nan) @@ -42,7 +52,7 @@ def load_data( files: list, lh5_path: str, cal_dict: dict, - params=["cuspEmax"], + params: list, cal_energy_param: str = "cuspEmax_ctc_cal", threshold=None, return_selection_mask=False, @@ -51,7 +61,8 @@ def load_data( Loads in the A/E parameters needed and applies calibration constants to energy """ - out_df = pd.DataFrame(columns=params) + if isinstance(files, str): + files = [files] if isinstance(files, dict): keys = lh5.ls( @@ -120,52 +131,8 @@ def 
load_data( if col not in params: df.drop(col, inplace=True, axis=1) - log.debug(f"data loaded") + log.debug("data loaded") if return_selection_mask: return df, masks else: return df - - -def get_tcm_pulser_ids(tcm_file, channel, multiplicity_threshold): - if isinstance(channel, str): - if channel[:2] == "ch": - chan = int(channel[2:]) - else: - chan = int(channel) - else: - chan = channel - if isinstance(tcm_file, list): - mask = np.array([], dtype=bool) - for file in tcm_file: - _, file_mask = get_tcm_pulser_ids(file, chan, multiplicity_threshold) - mask = np.append(mask, file_mask) - ids = np.where(mask)[0] - else: - data = pd.DataFrame( - { - "array_id": sto.read("hardware_tcm_1/array_id", tcm_file)[0].view_as( - "np" - ), - "array_idx": sto.read("hardware_tcm_1/array_idx", tcm_file)[0].view_as( - "np" - ), - } - ) - cumulength = sto.read("hardware_tcm_1/cumulative_length", tcm_file)[0].view_as( - "np" - ) - cumulength = np.append(np.array([0]), cumulength) - n_channels = np.diff(cumulength) - evt_numbers = np.repeat(np.arange(0, len(cumulength) - 1), np.diff(cumulength)) - evt_mult = np.repeat(np.diff(cumulength), np.diff(cumulength)) - data["evt_number"] = evt_numbers - data["evt_mult"] = evt_mult - high_mult_events = np.where(n_channels > multiplicity_threshold)[0] - - ids = data.query(f"array_id=={channel} and evt_number in @high_mult_events")[ - "array_idx" - ].to_numpy() - mask = np.zeros(len(data.query(f"array_id=={channel}")), dtype="bool") - mask[ids] = True - return ids, mask diff --git a/src/pygama/utils.py b/src/pygama/utils.py index 147b60775..888ca396c 100644 --- a/src/pygama/utils.py +++ b/src/pygama/utils.py @@ -1,7 +1,16 @@ +from __future__ import annotations + +import json +import logging import os from collections.abc import MutableMapping +from pathlib import Path from typing import Any, Iterator +import yaml + +log = logging.getLogger(__name__) + def getenv_bool(name: str, default: bool = False) -> bool: """Get environment value as a boolean, returning True for 1, t and true @@ -74,3 +83,28 @@ def __repr__(self) -> str: numba_math_defaults = NumbaPygamaDefaults() numba_math_defaults_kwargs = numba_math_defaults + +__file_extensions__ = {"json": [".json"], "yaml": [".yaml", ".yml"]} + + +def load_dict(fname: str, ftype: str | None = None) -> dict: + """Load a text file as a Python dict.""" + fname = Path(fname) + + # determine file type from extension + if ftype is None: + for _ftype, exts in __file_extensions__.items(): + if fname.suffix in exts: + ftype = _ftype + + msg = f"loading {ftype} dict from: {fname}" + log.debug(msg) + + with fname.open() as f: + if ftype == "json": + return json.load(f) + if ftype == "yaml": + return yaml.safe_load(f) + + msg = f"unsupported file format {ftype}" + raise NotImplementedError(msg) diff --git a/tests/conftest.py b/tests/conftest.py index 333c67c95..2460708e4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,7 +30,7 @@ def pytest_sessionfinish(session, exitstatus): @pytest.fixture(scope="session") def lgnd_test_data(): ldata = LegendTestData() - ldata.checkout("c089a59") + ldata.checkout("89e91f6") return ldata diff --git a/tests/evt/configs/basic-evt-config.json b/tests/evt/configs/basic-evt-config.json deleted file mode 100644 index 3a8c62753..000000000 --- a/tests/evt/configs/basic-evt-config.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "channels": { - "geds_on": ["ch1084803", "ch1084804", "ch1121600"] - }, - "outputs": [ - "multiplicity", - "energy", - "energy_id", - "energy_idx", - "energy_any_above1MeV", - 
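The new pygama.utils.load_dict helper above picks the parser from the file suffix (JSON or YAML) and raises NotImplementedError for anything else. A short usage sketch with hypothetical file names:

    from pygama.utils import load_dict

    cfg = load_dict("evt_config.yaml")                # parser chosen from the .yaml suffix
    cfg = load_dict("evt_config.json", ftype="json")  # or forced explicitly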
"energy_all_above1MeV", - "energy_aux", - "energy_sum", - "is_usable_aoe", - "aoe", - "is_aoe_rejected" - ], - "operations": { - "multiplicity": { - "channels": "geds_on", - "aggregation_mode": "sum", - "expression": "hit.cuspEmax_ctc_cal > a", - "parameters": { "a": 25 }, - "initial": 0, - "lgdo_attrs": { "statement": "0bb decay is real" } - }, - "energy": { - "channels": "geds_on", - "aggregation_mode": "first_at:dsp.tp_0_est", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "hit.cuspEmax_ctc_cal", - "initial": "np.nan" - }, - "energy_id": { - "channels": "geds_on", - "aggregation_mode": "first_at:dsp.tp_0_est", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "tcm.array_id", - "initial": 0 - }, - "energy_idx": { - "channels": "geds_on", - "aggregation_mode": "first_at:dsp.tp_0_est", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "tcm.index", - "initial": 999999999999 - }, - "energy_any_above1MeV": { - "channels": "geds_on", - "aggregation_mode": "any", - "expression": "hit.cuspEmax_ctc_cal>1000", - "initial": false - }, - "energy_all_above1MeV": { - "channels": "geds_on", - "aggregation_mode": "all", - "expression": "hit.cuspEmax_ctc_cal>1000", - "initial": false - }, - "energy_aux": { - "channels": "geds_on", - "aggregation_mode": "last_at:dsp.tp_0_est", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "hit.cuspEmax_ctc_cal", - "initial": "np.nan" - }, - "energy_sum": { - "channels": "geds_on", - "aggregation_mode": "sum", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "hit.cuspEmax_ctc_cal", - "initial": 0.0 - }, - "is_usable_aoe": { - "aggregation_mode": "keep_at_ch:evt.energy_id", - "expression": "True", - "initial": false - }, - "aoe": { - "aggregation_mode": "keep_at_ch:evt.energy_id", - "expression": "hit.AoE_Classifier", - "initial": "np.nan" - }, - "is_aoe_rejected": { - "aggregation_mode": "keep_at_ch:evt.energy_id", - "expression": "~(hit.AoE_Double_Sided_Cut)", - "initial": false - } - } -} diff --git a/tests/evt/configs/basic-evt-config.yaml b/tests/evt/configs/basic-evt-config.yaml new file mode 100644 index 000000000..bf229504e --- /dev/null +++ b/tests/evt/configs/basic-evt-config.yaml @@ -0,0 +1,85 @@ +channels: + geds_on: + - ch1084803 + - ch1084804 + - ch1121600 +outputs: + - timestamp + - multiplicity + - energy + - energy_id + - energy_idx + - energy_hit_idx + - energy_any_above1MeV + - energy_all_above1MeV + - energy_sum + - is_usable_aoe + - aoe + - is_aoe_rejected +operations: + timestamp: + channels: geds_on + aggregation_mode: first_at:dsp.tp_0_est + expression: dsp.timestamp + lgdo_attrs: + units: s + multiplicity: + channels: geds_on + aggregation_mode: sum + expression: hit.cuspEmax_ctc_cal > a + parameters: + a: 25 + initial: 0 + lgdo_attrs: + statement: 0bb decay is real + energy: + channels: geds_on + aggregation_mode: gather + query: hit.cuspEmax_ctc_cal>25 + expression: hit.cuspEmax_ctc_cal + energy_id: + channels: geds_on + aggregation_mode: first_at:dsp.tp_0_est + query: hit.cuspEmax_ctc_cal>25 + expression: tcm.array_id + initial: 0 + energy_idx: + channels: geds_on + aggregation_mode: first_at:dsp.tp_0_est + query: hit.cuspEmax_ctc_cal>25 + expression: tcm.index + initial: 999999999999 + energy_hit_idx: + channels: geds_on + aggregation_mode: first_at:dsp.tp_0_est + query: hit.cuspEmax_ctc_cal>25 + expression: tcm.array_idx + initial: 999999999999 + energy_any_above1MeV: + channels: geds_on + aggregation_mode: any + expression: hit.cuspEmax_ctc_cal>1000 + initial: false + energy_all_above1MeV: + channels: geds_on + 
aggregation_mode: all + expression: hit.cuspEmax_ctc_cal>1000 + initial: false + energy_sum: + channels: geds_on + aggregation_mode: sum + query: hit.cuspEmax_ctc_cal>25 + expression: hit.cuspEmax_ctc_cal + initial: 0 + is_usable_aoe: + aggregation_mode: keep_at_ch:evt.energy_id + expression: "True" + initial: false + aoe: + aggregation_mode: keep_at_ch:evt.energy_id + expression: hit.AoE_Classifier + initial: np.nan + is_aoe_rejected: + aggregation_mode: keep_at_ch:evt.energy_id + expression: ~(hit.AoE_Double_Sided_Cut) + initial: false diff --git a/tests/evt/configs/module-test-evt-config.json b/tests/evt/configs/module-test-evt-config.json deleted file mode 100644 index 0daa94658..000000000 --- a/tests/evt/configs/module-test-evt-config.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "channels": { - "spms_on": ["ch1057600", "ch1059201", "ch1062405"], - "geds_on": ["ch1084803", "ch1084804", "ch1121600"] - }, - "outputs": [ - "energy_first", - "energy_first_id", - "t0", - "lar_energy", - "lar_multiplicity", - "is_lar_rejected", - "lar_classifier", - "lar_energy_dplms", - "lar_multiplicity_dplms", - "lar_time_shift" - ], - "operations": { - "energy_first": { - "channels": "geds_on", - "aggregation_mode": "first_at:dsp.tp_0_est", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "hit.cuspEmax_ctc_cal", - "initial": "np.nan" - }, - "energy_first_id": { - "channels": "geds_on", - "aggregation_mode": "first_at:dsp.tp_0_est", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "tcm.array_id", - "initial": 0 - }, - "t0": { - "aggregation_mode": "keep_at_ch:evt.energy_first_id", - "expression": "dsp.tp_0_est", - "initial": 0.0 - }, - "lar_energy": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": "pygama.evt.modules.spm.get_energy(0.5,evt.t0,48000,1000,5000)" - }, - "lar_multiplicity": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_majority(0.5,evt.t0,48000,1000,5000)" - }, - "is_lar_rejected": { - "expression": "(evt.lar_energy >4) | (evt.lar_multiplicity > 4) " - }, - "lar_classifier": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_etc(0.5,evt.t0,48000,100,6000,80,1,0,50)" - }, - "lar_energy_dplms": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_energy_dplms(0.5,evt.t0,48000,1000,5000)" - }, - "lar_multiplicity_dplms": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_majority_dplms(0.5,evt.t0,48000,1000,5000)" - }, - "lar_time_shift": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_time_shift(0.5,evt.t0,48000,1000,5000)" - } - } -} diff --git a/tests/evt/configs/module-test-t0-vov-evt-config.json b/tests/evt/configs/module-test-t0-vov-evt-config.json deleted file mode 100644 index cda042337..000000000 --- a/tests/evt/configs/module-test-t0-vov-evt-config.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "channels": { - "spms_on": ["ch1057600", "ch1059201", "ch1062405"], - "geds_on": ["ch1084803", "ch1084804", "ch1121600"] - }, - "outputs": [ - "energy", - "energy_id", - "t0", - "lar_energy", - "lar_multiplicity", - "is_lar_rejected", - "lar_classifier", - "lar_energy_dplms", - "lar_multiplicity_dplms", - "lar_time_shift", - "lar_tcm_index", - "lar_pulse_index" - ], - "operations": { - "energy": { - "channels": "geds_on", - "aggregation_mode": "gather", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "hit.cuspEmax_ctc_cal" - }, - 
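As an illustration of how an aggregation in the basic-evt-config.yaml above acts on the hit data, the multiplicity operation sums, per event, the boolean hit.cuspEmax_ctc_cal > a over the geds_on channels with a = 25. A toy NumPy version with made-up energies:

    import numpy as np

    # made-up hit.cuspEmax_ctc_cal values: one row per event, one column per geds_on channel
    cusp_emax_ctc_cal = np.array([[30.0, 5.0, 600.0],
                                  [10.0, 2.0, 1.0]])
    a = 25
    multiplicity = np.sum(cusp_emax_ctc_cal > a, axis=1)   # -> array([2, 0])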
"energy_id": { - "channels": "geds_on", - "aggregation_mode": "gather", - "query": "hit.cuspEmax_ctc_cal>25", - "expression": "tcm.array_id" - }, - "t0": { - "aggregation_mode": "keep_at_ch:evt.energy_id", - "expression": "dsp.tp_0_est", - "initial": 0.0 - }, - "lar_energy": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_energy(0.5,evt.t0,48000,1000,5000)" - }, - "lar_multiplicity": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_majority(0.5,evt.t0,48000,1000,5000)" - }, - "is_lar_rejected": { - "expression": "(evt.lar_energy >4) | (evt.lar_multiplicity > 4) " - }, - "lar_classifier": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_etc(0.5,evt.t0,48000,100,6000,80,1,0,50)" - }, - "lar_energy_dplms": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_energy_dplms(0.5,evt.t0,48000,1000,5000)" - }, - "lar_multiplicity_dplms": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_majority_dplms(0.5,evt.t0,48000,1000,5000)" - }, - "lar_time_shift": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_time_shift(0.5,evt.t0,48000,1000,5000)" - }, - "lar_tcm_index": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_masked_tcm_idx(0.5,evt.t0,48000,1000,5000,1)" - }, - "lar_pulse_index": { - "channels": "spms_on", - "aggregation_mode": "function", - "expression": ".modules.spm.get_masked_tcm_idx(0.5,evt.t0,48000,1000,5000,0)" - } - } -} diff --git a/tests/evt/configs/spms-module-config.yaml b/tests/evt/configs/spms-module-config.yaml new file mode 100644 index 000000000..2e9b3119a --- /dev/null +++ b/tests/evt/configs/spms-module-config.yaml @@ -0,0 +1,99 @@ +channels: + spms_on: + - ch1057600 + - ch1059201 + - ch1062405 + geds_on: + - ch1084803 + - ch1084804 + - ch1121600 +outputs: + - t0 + - _pulse_mask + - spms_amp + - rawid + - hit_idx + - rawid_wo_empty + - spms_amp_full + - spms_amp_wo_empty + - trigger_pos + - is_valid_hit + - lar_coinc_class +operations: + t0: + channels: geds_on + aggregation_mode: first_at:dsp.tp_0_est + expression: dsp.tp_0_est + query: hit.cuspEmax_ctc_cal > 25 + initial: np.nan + _pulse_mask: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.make_pulse_data_mask( + <...>, + a_thr_pe=0.1, + t_loc_ns=evt.t0, + dt_range_ns=(-30_000, 30_000), + t_loc_default_ns=48_000) + trigger_pos_full: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_pulse_data( + <...>, + observable='hit.trigger_pos', + drop_empty=False) + trigger_pos: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_pulse_data( + <...>, + observable='hit.trigger_pos', + pulse_mask=evt._pulse_mask, + drop_empty=False) + is_valid_hit: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_is_valid_hit(<...>) + spms_amp: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_pulse_data( + <...>, + observable='hit.energy_in_pe', + pulse_mask=evt._pulse_mask, + drop_empty=False) + rawid: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_tcm_data(<...>, drop_empty=False) + hit_idx: + channels: spms_on + aggregation_mode: function + expression: 
pygama.evt.modules.spms.gather_tcm_data(<...>, + tcm_field='idx', + drop_empty=False) + spms_amp_full: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_pulse_data( + <...>, + observable='hit.energy_in_pe', + drop_empty=False) + spms_amp_wo_empty: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_pulse_data( + <...>, + observable='hit.energy_in_pe', + pulse_mask=evt._pulse_mask) + rawid_wo_empty: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.gather_tcm_data( + <...>, + pulse_mask=evt._pulse_mask, + drop_empty=True) + lar_coin_class: + channels: spms_on + aggregation_mode: function + expression: pygama.evt.modules.spms.geds_coincidence_classifier(<...>, geds_t0_ns=evt.t0) diff --git a/tests/evt/configs/vov-test-evt-config.json b/tests/evt/configs/vov-test-evt-config.json index 31334101e..6de44075b 100644 --- a/tests/evt/configs/vov-test-evt-config.json +++ b/tests/evt/configs/vov-test-evt-config.json @@ -28,14 +28,15 @@ "channels": "geds_on", "aggregation_mode": "gather", "query": "hit.cuspEmax_ctc_cal>25", - "expression": "hit.cuspEmax_ctc_cal" + "expression": "hit.cuspEmax_ctc_cal", + "dtype": "float32" }, "energy_sum": { "channels": "geds_on", "aggregation_mode": "sum", "query": "hit.cuspEmax_ctc_cal>25", "expression": "hit.cuspEmax_ctc_cal", - "initial": 0.0 + "initial": 0 }, "energy_idx": { "channels": "geds_on", @@ -66,7 +67,8 @@ "aggregation_mode": "sum", "expression": "hit.cuspEmax_ctc_cal > a", "parameters": { "a": 25 }, - "initial": 0 + "initial": 0, + "dtype": "int16" }, "is_saturated": { "aggregation_mode": "keep_at_ch:evt.energy_id", diff --git a/tests/evt/modules/larveto.py b/tests/evt/modules/larveto.py new file mode 100644 index 000000000..79f580234 --- /dev/null +++ b/tests/evt/modules/larveto.py @@ -0,0 +1,14 @@ +import numpy as np +import pytest + +from pygama.evt.modules import larveto + + +def test_tc_time_pdf(): + assert isinstance(larveto.l200_tc_time_pdf(0), float) + assert isinstance( + larveto.l200_tc_time_pdf(np.array([0, -0.5, 3]) * 1e3), np.ndarray + ) + + with pytest.raises(ValueError): + assert isinstance(larveto.l200_tc_time_pdf(-10000), float) diff --git a/tests/evt/test_build_evt.py b/tests/evt/test_build_evt.py index 80a40d9a8..99bf66d6f 100644 --- a/tests/evt/test_build_evt.py +++ b/tests/evt/test_build_evt.py @@ -4,7 +4,7 @@ import awkward as ak import numpy as np import pytest -from lgdo import Array, VectorOfVectors, lh5 +from lgdo import Array, Table, VectorOfVectors, lh5 from lgdo.lh5 import LH5Store from pygama.evt import build_evt @@ -13,197 +13,224 @@ store = LH5Store() -def test_basics(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" +@pytest.fixture(scope="module") +def files_config(lgnd_test_data, tmptestdir): tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + + return { + "tcm": (lgnd_test_data.get_path(tcm_path), "hardware_tcm_1"), + "dsp": (lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), "dsp", "ch{}"), + "hit": (lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), "hit", "ch{}"), + "evt": (outfile, "evt"), + } + +def test_basics(lgnd_test_data, files_config): build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - 
f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{config_dir}/basic-evt-config.json", - f_evt=outfile, - wo_mode="o", - evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=f"{config_dir}/basic-evt-config.yaml", + wo_mode="of", ) - assert "statement" in store.read("/evt/multiplicity", outfile)[0].getattrs().keys() - assert ( - store.read("/evt/multiplicity", outfile)[0].getattrs()["statement"] - == "0bb decay is real" - ) + outfile = files_config["evt"][0] + f_tcm = files_config["tcm"][0] + + evt = lh5.read("evt", outfile) + + assert "statement" in evt.multiplicity.attrs + assert evt.multiplicity.attrs["statement"] == "0bb decay is real" + assert os.path.exists(outfile) - assert len(lh5.ls(outfile, "/evt/")) == 11 - nda = { - e: store.read(f"/evt/{e}", outfile)[0].view_as("np") - for e in ["energy", "energy_aux", "energy_sum", "multiplicity"] - } - assert ( - nda["energy"][nda["multiplicity"] == 1] - == nda["energy_aux"][nda["multiplicity"] == 1] - ).all() - assert ( - nda["energy"][nda["multiplicity"] == 1] - == nda["energy_sum"][nda["multiplicity"] == 1] - ).all() - assert ( - nda["energy_aux"][nda["multiplicity"] == 1] - == nda["energy_sum"][nda["multiplicity"] == 1] - ).all() + assert sorted(evt.keys()) == [ + "aoe", + "energy", + "energy_all_above1MeV", + "energy_any_above1MeV", + "energy_hit_idx", + "energy_id", + "energy_idx", + "energy_sum", + "is_aoe_rejected", + "is_usable_aoe", + "multiplicity", + "timestamp", + ] + + ak_evt = evt.view_as("ak") + + assert ak.all(ak_evt.energy_sum == ak.sum(ak_evt.energy, axis=-1)) eid = store.read("/evt/energy_id", outfile)[0].view_as("np") eidx = store.read("/evt/energy_idx", outfile)[0].view_as("np") eidx = eidx[eidx != 999999999999] - ids = store.read("hardware_tcm_1/array_id", lgnd_test_data.get_path(tcm_path))[ - 0 - ].view_as("np") + ids = store.read("hardware_tcm_1/array_id", f_tcm)[0].view_as("np") ids = ids[eidx] assert ak.all(ids == eid[eid != 0]) + ehidx = store.read("/evt/energy_hit_idx", outfile)[0].view_as("np") + ids = store.read("hardware_tcm_1/array_idx", f_tcm)[0].view_as("np") + ids = ids[eidx] + assert ak.all(ids == ehidx[ehidx != 999999999999]) + + +def test_field_nesting(lgnd_test_data, files_config): + config = { + "channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}, + "outputs": [ + "sub1___timestamp", + "sub2___multiplicity", + "sub2___dummy", + ], + "operations": { + "sub1___timestamp": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "expression": "dsp.timestamp", + }, + "sub2___multiplicity": { + "channels": "geds_on", + "aggregation_mode": "sum", + "expression": "hit.cuspEmax_ctc_cal > 25", + "initial": 0, + }, + "sub2___dummy": { + "channels": "geds_on", + "aggregation_mode": "sum", + "expression": "hit.cuspEmax_ctc_cal > evt.sub1___timestamp", + "initial": 0, + }, + }, + } -def test_lar_module(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{config_dir}/module-test-evt-config.json", - f_evt=outfile, - wo_mode="o", - 
evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=config, + wo_mode="of", ) - assert os.path.exists(outfile) - assert len(lh5.ls(outfile, "/evt/")) == 10 - nda = { - e: store.read(f"/evt/{e}", outfile)[0].view_as("np") - for e in ["lar_multiplicity", "lar_multiplicity_dplms", "t0", "lar_time_shift"] - } - assert np.max(nda["lar_multiplicity"]) <= 3 - assert np.max(nda["lar_multiplicity_dplms"]) <= 3 - assert ((nda["lar_time_shift"] + nda["t0"]) >= 0).all() + outfile = files_config["evt"][0] + evt = lh5.read("/evt", outfile) + assert isinstance(evt, Table) + assert isinstance(evt.sub1, Table) + assert isinstance(evt.sub2, Table) + assert isinstance(evt.sub1.timestamp, Array) -def test_lar_t0_vov_module(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) + assert sorted(evt.keys()) == ["sub1", "sub2"] + assert sorted(evt.sub1.keys()) == ["timestamp"] + assert sorted(evt.sub2.keys()) == ["dummy", "multiplicity"] + + +def test_spms_module(lgnd_test_data, files_config): build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{config_dir}/module-test-t0-vov-evt-config.json", - f_evt=outfile, - wo_mode="o", - evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=f"{config_dir}/spms-module-config.yaml", + wo_mode="of", ) - assert os.path.exists(outfile) - assert len(lh5.ls(outfile, "/evt/")) == 12 - nda = { - e: store.read(f"/evt/{e}", outfile)[0].view_as("np") - for e in ["lar_multiplicity", "lar_multiplicity_dplms", "lar_time_shift"] - } - assert np.max(nda["lar_multiplicity"]) <= 3 - assert np.max(nda["lar_multiplicity_dplms"]) <= 3 + outfile = files_config["evt"][0] - ch_idx = store.read("/evt/lar_tcm_index", outfile)[0].view_as("ak") - pls_idx = store.read("/evt/lar_pulse_index", outfile)[0].view_as("ak") - assert ak.count(ch_idx) == ak.count(pls_idx) - assert ak.all(ak.count(ch_idx, axis=-1) == ak.count(pls_idx, axis=-1)) + evt = lh5.read("/evt", outfile) + t0 = ak.fill_none(ak.nan_to_none(evt.t0.view_as("ak")), 48_000) + tr_pos = evt.trigger_pos.view_as("ak") * 16 + assert ak.all(tr_pos > t0 - 30_000) + assert ak.all(tr_pos < t0 + 30_000) -def test_vov(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) + mask = evt._pulse_mask + assert isinstance(mask, VectorOfVectors) + assert len(mask) == 10 + assert mask.ndim == 3 + + full = evt.spms_amp_full.view_as("ak") + amp = evt.spms_amp.view_as("ak") + assert ak.all(amp > 0.1) + + assert ak.all(full[mask.view_as("ak")] == amp) + + wo_empty = evt.spms_amp_wo_empty.view_as("ak") + assert ak.all(wo_empty == amp[ak.count(amp, axis=-1) > 0]) + + rawids = evt.rawid.view_as("ak") + assert rawids.ndim == 2 + assert ak.count(rawids) == 30 + + idx = evt.hit_idx.view_as("ak") + assert idx.ndim == 2 + assert ak.count(idx) == 30 + + rawids_wo_empty = evt.rawid_wo_empty.view_as("ak") + assert ak.count(rawids_wo_empty) == 7 + + vhit = evt.is_valid_hit.view_as("ak") + vhit.show() + 
assert ak.all(ak.num(vhit, axis=-1) == ak.num(full, axis=-1)) + + +def test_vov(lgnd_test_data, files_config): build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{config_dir}/vov-test-evt-config.json", - f_evt=outfile, - wo_mode="o", - evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=f"{config_dir}/vov-test-evt-config.json", + wo_mode="of", ) + outfile = files_config["evt"][0] + f_tcm = files_config["tcm"][0] + assert os.path.exists(outfile) assert len(lh5.ls(outfile, "/evt/")) == 12 + + timestamp, _ = store.read("/evt/timestamp", outfile) + assert np.all(~np.isnan(timestamp.nda)) + vov_ene, _ = store.read("/evt/energy", outfile) vov_aoe, _ = store.read("/evt/aoe", outfile) arr_ac, _ = store.read("/evt/multiplicity", outfile) vov_aoeene, _ = store.read("/evt/energy_times_aoe", outfile) vov_eneac, _ = store.read("/evt/energy_times_multiplicity", outfile) arr_ac2, _ = store.read("/evt/multiplicity_squared", outfile) + assert isinstance(vov_ene, VectorOfVectors) assert isinstance(vov_aoe, VectorOfVectors) assert isinstance(arr_ac, Array) assert isinstance(vov_aoeene, VectorOfVectors) assert isinstance(vov_eneac, VectorOfVectors) assert isinstance(arr_ac2, Array) + + assert vov_ene.dtype == "float32" + assert vov_aoe.dtype == "float64" + assert arr_ac.dtype == "int16" + assert (np.diff(vov_ene.cumulative_length.nda, prepend=[0]) == arr_ac.nda).all() vov_eid = store.read("/evt/energy_id", outfile)[0].view_as("ak") vov_eidx = store.read("/evt/energy_idx", outfile)[0].view_as("ak") vov_aoe_idx = store.read("/evt/aoe_idx", outfile)[0].view_as("ak") - ids = store.read("hardware_tcm_1/array_id", lgnd_test_data.get_path(tcm_path))[ - 0 - ].view_as("ak") + ids = store.read("hardware_tcm_1/array_id", f_tcm)[0].view_as("ak") ids = ak.unflatten(ids[ak.flatten(vov_eidx)], ak.count(vov_eidx, axis=-1)) assert ak.all(ids == vov_eid) arr_ene = store.read("/evt/energy_sum", outfile)[0].view_as("ak") - assert ak.all(arr_ene == ak.nansum(vov_ene.view_as("ak"), axis=-1)) + assert ak.all( + ak.isclose(arr_ene, ak.nansum(vov_ene.view_as("ak"), axis=-1), rtol=1e-3) + ) assert ak.all(vov_aoe.view_as("ak") == vov_aoe_idx) -def test_graceful_crashing(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) - f_tcm = lgnd_test_data.get_path(tcm_path) - f_dsp = lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")) - f_hit = lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")) - f_config = f"{config_dir}/basic-evt-config.json" - - with pytest.raises(KeyError): - build_evt(f_dsp, f_tcm, f_hit, f_config, outfile) - - with pytest.raises(KeyError): - build_evt(f_tcm, f_hit, f_dsp, f_config, outfile) - +def test_graceful_crashing(lgnd_test_data, files_config): with pytest.raises(TypeError): - build_evt(f_tcm, f_dsp, f_hit, None, outfile) + build_evt(files_config, None, wo_mode="of") conf = {"operations": {}} with pytest.raises(ValueError): - build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + build_evt(files_config, conf, wo_mode="of") conf = {"channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}} with pytest.raises(ValueError): - build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + build_evt(files_config, 
conf, wo_mode="of") conf = { "channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}, @@ -219,38 +246,25 @@ def test_graceful_crashing(lgnd_test_data, tmptestdir): }, } with pytest.raises(ValueError): - build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + build_evt( + files_config, + conf, + wo_mode="of", + ) -def test_query(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) +def test_query(lgnd_test_data, files_config): build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{config_dir}/query-test-evt-config.json", - f_evt=outfile, - wo_mode="o", - evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=f"{config_dir}/query-test-evt-config.json", + wo_mode="of", ) + outfile = files_config["evt"][0] + assert len(lh5.ls(outfile, "/evt/")) == 12 -def test_vector_sort(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) - f_tcm = lgnd_test_data.get_path(tcm_path) - f_dsp = lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")) - f_hit = lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")) - +def test_vector_sort(lgnd_test_data, files_config): conf = { "channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}, "outputs": ["acend_id", "t0_acend", "decend_id", "t0_decend"], @@ -279,7 +293,14 @@ def test_vector_sort(lgnd_test_data, tmptestdir): }, }, } - build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + + build_evt( + files_config, + conf, + wo_mode="of", + ) + + outfile = files_config["evt"][0] assert os.path.exists(outfile) assert len(lh5.ls(outfile, "/evt/")) == 4 @@ -289,27 +310,3 @@ def test_vector_sort(lgnd_test_data, tmptestdir): vov_t0, _ = store.read("/evt/t0_decend", outfile) nda_t0 = vov_t0.to_aoesa().view_as("np") assert ((np.diff(nda_t0) <= 0) | (np.isnan(np.diff(nda_t0)))).all() - - -def test_tcm_id_table_pattern(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) - f_tcm = lgnd_test_data.get_path(tcm_path) - f_dsp = lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")) - f_hit = lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")) - f_config = f"{config_dir}/basic-evt-config.json" - - with pytest.raises(ValueError): - build_evt(f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="ch{{}}") - with pytest.raises(ValueError): - build_evt(f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="ch{}{}") - with pytest.raises(NotImplementedError): - build_evt( - f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="ch{tcm_id}" - ) - with pytest.raises(ValueError): - build_evt( - f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="apple{}banana" - ) diff --git a/tests/evt/test_utils.py b/tests/evt/test_utils.py new file mode 100644 index 000000000..c3548289e --- /dev/null +++ b/tests/evt/test_utils.py @@ 
-0,0 +1,22 @@ +from pygama.evt import utils + + +def test_tier_data_tuple(): + files = utils.make_files_config( + { + "tcm": ("f1", "g1"), + "dsp": ("f2", "g2"), + "hit": ("f3", "g3"), + "evt": ("f4", "g4"), + } + ) + + assert files.raw == utils.H5DataLoc() + assert files.tcm.file == "f1" + assert files.tcm.group == "g1" + assert files.dsp.file == "f2" + assert files.dsp.group == "g2" + assert files.hit.file == "f3" + assert files.hit.group == "g3" + assert files.evt.file == "f4" + assert files.evt.group == "g4" diff --git a/tests/flow/test_filedb.py b/tests/flow/test_filedb.py index fe8fa72cb..8a57160d3 100644 --- a/tests/flow/test_filedb.py +++ b/tests/flow/test_filedb.py @@ -2,6 +2,7 @@ from pathlib import Path import pytest +from lgdo.lh5.exceptions import LH5EncodeError from pandas.testing import assert_frame_equal from pygama.flow import FileDB @@ -346,7 +347,7 @@ def test_serialization(test_filedb_full, tmptestdir): db = test_filedb_full db.to_disk(f"{tmptestdir}/filedb.lh5", wo_mode="of") - with pytest.raises(RuntimeError): + with pytest.raises(LH5EncodeError): db.to_disk(f"{tmptestdir}/filedb.lh5") db2 = FileDB(f"{tmptestdir}/filedb.lh5") diff --git a/tests/math/functions/test_polynomial.py b/tests/math/functions/test_polynomial.py index 354d037ee..f69bd1757 100644 --- a/tests/math/functions/test_polynomial.py +++ b/tests/math/functions/test_polynomial.py @@ -1,4 +1,5 @@ import numpy as np +from numpy.polynomial.polynomial import Polynomial from pygama.math.functions.polynomial import nb_poly @@ -7,7 +8,7 @@ def test_nb_poly(): x = np.arange(-10, 10) params = np.array([1, 2, 3]) y = nb_poly(x, params) - - y_numpy = np.polyval(params, x) + np_poly = Polynomial(params) + y_numpy = np_poly(x) assert np.array_equal(y, y_numpy) diff --git a/tests/pargen/test_aoecal.py b/tests/pargen/test_aoecal.py new file mode 100644 index 000000000..414670bcd --- /dev/null +++ b/tests/pargen/test_aoecal.py @@ -0,0 +1,41 @@ +import lgdo.lh5 as lh5 +import numpy as np + +import pygama.pargen.AoE_cal as Coe + + +def test_aoe_cal(lgnd_test_data): + # load test data here + data = lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_dsp.lh5" + ) + + df = lh5.read_as("ch1104000/dsp", data, "pd") + + df["AoE_Uncorr"] = df["A_max"] / df["cuspEmax"] + + df["cuspEmax_cal"] = df["cuspEmax"] * 0.155 + + cal_dict = { + "AoE_Uncorr": { + "expression": "A_max/cuspEmax", + "parameters": {}, + } + } + + aoe = Coe.CalAoE( + cal_dict, + "cuspEmax_cal", + lambda x: np.sqrt(1.5 + 0.1 * x), + selection_string="index==index", + debug_mode=True, + ) + aoe.calibrate(df, "AoE_Uncorr") + assert ( + (aoe.low_cut_val < -1.0) & (aoe.low_cut_val > -3) & (~np.isnan(aoe.low_cut_val)) + ) + assert ( + (~np.isnan(aoe.low_side_sfs.loc[2614.5]["sf"])) + & (aoe.low_side_sfs.loc[2614.5]["sf"] < 20) + & (aoe.low_side_sfs.loc[2614.5]["sf"] > 0) + ) diff --git a/tests/pargen/test_datacleaning.py b/tests/pargen/test_datacleaning.py new file mode 100644 index 000000000..11d6f6e53 --- /dev/null +++ b/tests/pargen/test_datacleaning.py @@ -0,0 +1,52 @@ +import lgdo.lh5 as lh5 +from pytest import approx + +import pygama.pargen.data_cleaning as dc + + +def test_cuts(lgnd_test_data): + # load test data here + data = lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_dsp.lh5" + ) + sto = lh5.LH5Store() + tbl = sto.read("ch1104000/dsp", data)[0] + + cut_pars = { + "bl_std_cut": {"cut_parameter": "bl_std", "cut_level": 4, 
"mode": "inclusive"}, + "dt_eff_cut": { + "expression": "(dt_eff>a)&(dt_effa) & (bl_stdb)" + ) + assert approx(cut_dict["bl_pileup_cut"]["parameters"]["a"], 0.1) == 7 + assert approx(cut_dict["bl_pileup_cut"]["parameters"]["b"], 0.1) == 24 + + # check also works for df + df = lh5.read_as("ch1104000/dsp", data, "pd") + + df["baselineEmax"].to_numpy() + + cut_dict_df = dc.generate_cuts(df, cut_pars) + + assert cut_dict_df == cut_dict + + ids = dc.get_cut_indexes(df, cut_pars) + ids_tbl = dc.get_cut_indexes(tbl, cut_pars) + + assert (ids == ids_tbl).all() diff --git a/tests/pargen/test_ecal.py b/tests/pargen/test_ecal.py index 9f9e1d7c9..26eb4d1fd 100644 --- a/tests/pargen/test_ecal.py +++ b/tests/pargen/test_ecal.py @@ -1,5 +1,8 @@ +import lgdo.lh5 as lh5 import numpy as np +from pytest import approx +from pygama.math.distributions import hpge_peak from pygama.pargen import energy_cal @@ -20,3 +23,148 @@ def test_peak_match(): peaks_adu, expected_peaks_kev, deg=0, atol=10 ) assert np.array_equal(best_ixtup, [5, 7]) + + +def test_hpge_cal(lgnd_test_data): + # test the HPGe calibration function + # the function should return a calibration polynomial + # that maps ADC channel to energy in keV + + # load test data here + data = lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_dsp.lh5" + ) + + energy = lh5.read_as("ch1104000/dsp/cuspEmax", data, "np") + + glines = [860.564, 1592.53, 1620.50, 2103.53, 2614.50] + + pk_pars = [ + (860.564, (20, 20), hpge_peak), + (1592.53, (20, 20), hpge_peak), + (1620.50, (20, 20), hpge_peak), + (2103.53, (20, 20), hpge_peak), + (2614.50, (20, 20), hpge_peak), + ] + + # test init + cal = energy_cal.HPGeCalibration( + "cuspEmax", + glines, + 2615 / np.nanpercentile(energy, 99), + deg=0, + debug_mode=True, + ) + + # test dictionary generation + out_dict = cal.gen_pars_dict() + assert out_dict == { + "expression": "a + b * cuspEmax", + "parameters": {"a": 0.0, "b": 2615 / np.nanpercentile(energy, 99)}, + } + + cal.hpge_find_energy_peaks(energy, update_cal_pars=False) + + assert (cal.peaks_kev == glines).all() + assert approx(cal.pars[1], 1) == 0.15 + assert cal.pars[0] == 0.0 + + cal.hpge_find_energy_peaks(energy) + + assert len(cal.peaks_kev) == len(glines) and (cal.peaks_kev == glines).all() + assert approx(cal.pars[1], 0.1) == 0.15 + assert cal.pars[0] == 0.0 + + cal.hpge_get_energy_peaks(energy) + + assert len(cal.peaks_kev) == len(glines) and (cal.peaks_kev == glines).all() + assert approx(cal.pars[1], 0.1) == 0.15 + assert cal.pars[0] == 0.0 + locs = cal.peak_locs.copy() + cal.hpge_cal_energy_peak_tops(energy) + + assert len(cal.peaks_kev) == len(glines) and (cal.peaks_kev == glines).all() + assert approx(cal.pars[1], 0.1) == 0.15 + assert cal.pars[0] == 0.0 + + cal.peak_locs = locs + cal.hpge_fit_energy_peaks(energy, peak_pars=pk_pars) + + assert len(cal.peaks_kev) == len(glines) and (cal.peaks_kev == glines).all() + assert approx(cal.pars[1], 0.1) == 0.15 + assert cal.pars[0] == 0.0 + + cal.get_energy_res_curve( + energy_cal.FWHMLinear, + interp_energy_kev={"Qbb": 2039.0}, + ) + + assert ( + approx( + cal.results["hpge_fit_energy_peaks"]["FWHMLinear"]["Qbb_fwhm_in_kev"], 0.1 + ) + == 2.3 + ) + + +def test_hpge_cal_full_calibration(lgnd_test_data): + data = lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_dsp.lh5" + ) + + energy = lh5.read_as("ch1104000/dsp/cuspEmax", data, "np") + + glines = [860.564, 1592.53, 1620.50, 
2103.53, 2614.50] + + pk_pars = [ + (860.564, (20, 20), hpge_peak), + (1592.53, (20, 20), hpge_peak), + (1620.50, (20, 20), hpge_peak), + (2103.53, (20, 20), hpge_peak), + (2614.50, (20, 20), hpge_peak), + ] + + cal = energy_cal.HPGeCalibration( + "cuspEmax", + glines, + 2615 / np.nanpercentile(energy, 99), + deg=0, + debug_mode=True, + ) + + cal.full_calibration(energy, peak_pars=pk_pars) + + assert len(cal.peaks_kev) == len(glines) and (cal.peaks_kev == glines).all() + assert approx(cal.pars[1], 0.1) == 0.15 + assert cal.pars[0] == 0.0 + + +def test_hpge_cal_prominent_peak(lgnd_test_data): + data = lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_dsp.lh5" + ) + + energy = lh5.read_as("ch1104000/dsp/cuspEmax", data, "np") + + glines = [860.564, 1592.53, 1620.50, 2103.53, 2614.50] + + pk_pars = [ + (860.564, (20, 20), hpge_peak), + (1592.53, (20, 20), hpge_peak), + (1620.50, (20, 20), hpge_peak), + (2103.53, (20, 20), hpge_peak), + (2614.50, (20, 20), hpge_peak), + ] + + # test in + cal = energy_cal.HPGeCalibration( + "cuspEmax", + glines, + 2615 / np.nanpercentile(energy, 99), + deg=0, + debug_mode=True, + ) + + cal.calibrate_prominent_peak(energy, 2614.5, pk_pars) + assert cal.peaks_kev[0] == 2614.5 and len(cal.peaks_kev) == 1 + assert approx(cal.pars[1], 0.1) == 0.15 diff --git a/tests/pargen/test_lqcal.py b/tests/pargen/test_lqcal.py new file mode 100644 index 000000000..fbe276edc --- /dev/null +++ b/tests/pargen/test_lqcal.py @@ -0,0 +1,44 @@ +import lgdo.lh5 as lh5 +import numpy as np + +import pygama.pargen.lq_cal as lq +from pygama.math.distributions import gaussian + + +def test_lq_cal(lgnd_test_data): + # test the HPGe calibration function + # the function should return a calibration polynomial + # that maps ADC channel to energy in keV + + # load test data here + data = lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r000/l200-p03-r000-cal-20230311T235840Z-tier_dsp.lh5" + ) + + df = lh5.read_as("ch1104000/dsp", data, "pd") + + df["cuspEmax_cal"] = df["cuspEmax"] * 0.155 + + cal_dict = { + "LQ_Ecorr": { + "expression": "lq80/cuspEmax", + "parameters": {}, + } + } + + lqcal = lq.LQCal( + cal_dict, + "cuspEmax_cal", + lambda x: np.sqrt(1.5 + 0.1 * x), + selection_string="index==index", + cdf=gaussian, + debug_mode=True, + ) + + df["LQ_Ecorr"] = np.divide(df["lq80"], df["cuspEmax"]) + + lqcal.calibrate(df, "LQ_Ecorr") + assert (lqcal.cut_val > 0) & (~np.isnan(lqcal.cut_val)) + assert (~np.isnan(lqcal.low_side_sf.loc[1592.50]["sf"])) & ( + lqcal.low_side_sf.loc[1592.50]["sf"] > 95 + ) diff --git a/tests/skm/test_build_skm.py b/tests/skm/test_build_skm.py index c60c460f0..00fda9f08 100644 --- a/tests/skm/test_build_skm.py +++ b/tests/skm/test_build_skm.py @@ -3,6 +3,7 @@ import awkward as ak import lgdo +import pytest from lgdo.lh5 import LH5Store from pygama.evt import build_evt @@ -13,33 +14,35 @@ store = LH5Store() -def test_basics(lgnd_test_data, tmptestdir): - outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" +@pytest.fixture(scope="module") +def files_config(lgnd_test_data, tmptestdir): tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" - if os.path.exists(outfile): - os.remove(outfile) + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + + return { + "tcm": (lgnd_test_data.get_path(tcm_path), "hardware_tcm_1"), + "dsp": (lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), 
"dsp", "ch{}"), + "hit": (lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), "hit", "ch{}"), + "evt": (outfile, "evt"), + } + +def test_basics(tmptestdir, files_config): build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{evt_config_dir}/vov-test-evt-config.json", - f_evt=outfile, - wo_mode="o", - evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=f"{evt_config_dir}/vov-test-evt-config.json", + wo_mode="of", ) + outfile = files_config["evt"][0] skm_conf = f"{config_dir}/basic-skm-config.json" skm_out = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_skm.lh5" result = build_skm( outfile, - lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - lgnd_test_data.get_path(tcm_path), + files_config["hit"][0], + files_config["dsp"][0], + files_config["tcm"][0], skm_conf, ) @@ -47,9 +50,9 @@ def test_basics(lgnd_test_data, tmptestdir): build_skm( outfile, - lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - lgnd_test_data.get_path(tcm_path), + files_config["hit"][0], + files_config["dsp"][0], + files_config["tcm"][0], skm_conf, skm_out, wo_mode="o", @@ -71,10 +74,6 @@ def test_basics(lgnd_test_data, tmptestdir): assert "multiplicity" in df.keys() assert "energy_sum" in df.keys() assert (df.multiplicity.to_numpy() <= 3).all() - assert ( - df.energy_0.to_numpy() + df.energy_1.to_numpy() + df.energy_2.to_numpy() - == df.energy_sum.to_numpy() - ).all() vov_eid = ak.to_numpy( ak.fill_none( @@ -90,23 +89,15 @@ def test_basics(lgnd_test_data, tmptestdir): assert (vov_eid[:, 2] == df.energy_id_2.to_numpy()).all() -def test_attribute_passing(lgnd_test_data, tmptestdir): +def test_attribute_passing(tmptestdir, files_config): outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" - tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" if os.path.exists(outfile): os.remove(outfile) build_evt( - f_tcm=lgnd_test_data.get_path(tcm_path), - f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - evt_config=f"{evt_config_dir}/vov-test-evt-config.json", - f_evt=outfile, - wo_mode="o", - evt_group="evt", - hit_group="hit", - dsp_group="dsp", - tcm_group="hardware_tcm_1", + files_config, + config=f"{evt_config_dir}/vov-test-evt-config.json", + wo_mode="of", ) skm_conf = f"{config_dir}/basic-skm-config.json" @@ -115,9 +106,9 @@ def test_attribute_passing(lgnd_test_data, tmptestdir): build_skm( outfile, - lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), - lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), - lgnd_test_data.get_path(tcm_path), + files_config["hit"][0], + files_config["dsp"][0], + files_config["tcm"][0], skm_conf, f_skm=skm_out, wo_mode="o",