diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 368d295a8..f9ecf576e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,5 +5,3 @@ updates: directory: "/" schedule: interval: "monthly" - ignore: - - dependency-name: "actions/*" diff --git a/.github/logo.png b/.github/logo.png new file mode 100644 index 000000000..36aec659a Binary files /dev/null and b/.github/logo.png differ diff --git a/.github/workflows/distribution.yml b/.github/workflows/distribution.yml index add54cba1..63e9c51e7 100644 --- a/.github/workflows/distribution.yml +++ b/.github/workflows/distribution.yml @@ -13,14 +13,14 @@ jobs: dist: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Build SDist and wheel run: pipx run build - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: path: dist/* @@ -33,7 +33,7 @@ jobs: if: github.event_name == 'release' && github.event.action == 'published' steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: artifact path: dist diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 92ae646bb..24b9bfbd8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,6 +12,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + TQDM_MININTERVAL: 100 + jobs: build-and-test: @@ -24,9 +27,9 @@ jobs: os: [ubuntu-latest, macOS-latest] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Get dependencies and install the package @@ -35,16 +38,16 @@ jobs: python -m pip install --upgrade .[test] - name: Run unit tests run: | - pytest + python -m pytest test-coverage: name: Calculate and upload test coverage runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: '3.10' @@ -54,7 +57,7 @@ jobs: python -m pip install --upgrade .[test] python -m pytest --cov=pygama --cov-report=xml - name: Upload Coverage to codecov.io - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: false @@ -63,10 +66,10 @@ jobs: name: Build documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Setup build environment diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8238d0d9..f0bc4718d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: exclude: ^(attic|tutorials|src/pygama/math|src/pygama/flow/datagroup.py) repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: "v4.4.0" + rev: "v4.5.0" hooks: - id: check-added-large-files - id: check-case-conflict @@ -26,35 +26,35 @@ repos: - id: trailing-whitespace - repo: https://github.com/asottile/setup-cfg-fmt - rev: "v2.4.0" + rev: "v2.5.0" hooks: - id: setup-cfg-fmt - repo: https://github.com/PyCQA/isort - rev: "5.12.0" + rev: "5.13.2" hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: "v3.13.0" + rev: "v3.15.0" hooks: - id: pyupgrade args: ["--py38-plus"] - repo: 
https://github.com/psf/black - rev: "23.9.1" + rev: "23.12.1" hooks: - id: black-jupyter - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.5.1" + rev: "v1.8.0" hooks: - id: mypy files: src stages: [manual] - repo: https://github.com/hadialqattan/pycln - rev: "v2.2.2" + rev: "v2.4.0" hooks: - id: pycln exclude: ^src/pygama/pargen @@ -85,7 +85,7 @@ repos: stages: [manual] - repo: https://github.com/codespell-project/codespell - rev: "v2.2.5" + rev: "v2.2.6" hooks: - id: codespell @@ -103,7 +103,7 @@ repos: - id: rst-inline-touching-normal - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.0.3" + rev: "v4.0.0-alpha.8" hooks: - id: prettier types_or: [json] diff --git a/README.md b/README.md index e182a9c06..54fecb59d 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +![pygama logo](.github/logo.png) + # pygama [![PyPI](https://img.shields.io/pypi/v/pygama?logo=pypi)](https://pypi.org/project/pygama/) @@ -25,3 +27,8 @@ - generating and selecting high-level event data for further analysis Check out the [online documentation](https://pygama.readthedocs.io). + +## Related repositories +- [legend-exp/legend-pydataobj](https://github.com/legend-exp/legend-pydataobj) → LEGEND Python Data Objects +- [legend-exp/legend-daq2lh5](https://github.com/legend-exp/legend-daq2lh5) → Convert digitizer data to LEGEND HDF5 +- [legend-exp/dspeed](https://github.com/legend-exp/dspeed) → Fast Digital Signal Processing for particle detector signals in Python diff --git a/docs/source/conf.py b/docs/source/conf.py index a02d2d512..3bcd7d1fc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -39,6 +39,7 @@ "source_directory": "docs/source", } html_title = f"{project} {version}" +html_logo = "../../.github/logo.png" # sphinx-napoleon # enforce consistent usage of NumPy-style docstrings @@ -57,9 +58,11 @@ "scipy": ("https://docs.scipy.org/doc/scipy", None), "pandas": ("https://pandas.pydata.org/docs", None), "matplotlib": ("https://matplotlib.org/stable", None), - "iminuit": ("https://iminuit.readthedocs.io/en/stable", None), "h5py": ("https://docs.h5py.org/en/stable", None), "pint": ("https://pint.readthedocs.io/en/stable", None), + "lgdo": ("https://legend-pydataobj.readthedocs.io/en/stable", None), + "dspeed": ("https://dspeed.readthedocs.io/en/stable", None), + "daq2lh5": ("https://legend-daq2lh5.readthedocs.io/en/stable", None), } suppress_warnings = [ diff --git a/docs/source/developer.rst b/docs/source/developer.rst index 5d3c85dcf..c27d05def 100644 --- a/docs/source/developer.rst +++ b/docs/source/developer.rst @@ -1,14 +1,23 @@ Developer's guide ================= +.. note:: + + The https://learn.scientific-python.org webpages are an extremely valuable + learning resource for Python software developers. The reader is referred + there for any details not covered in the following guide. + The following rules and conventions have been established for the package development and are enforced throughout the entire code base. Merge requests that do not comply with the following directives will be rejected. To start developing :mod:`pygama`, fork the remote repository to your personal -GitHub account (see `About Forks `_). +GitHub account (see `About Forks +`_). If you have not set up your ssh keys on the computer you will be working on, -please follow `GitHub's instructions `_. Once you have your own fork, you can clone it via +please follow `GitHub's instructions +`_. +Once you have your own fork, you can clone it via (replace "yourusername" with your GitHub username): ..
code-block:: console @@ -21,7 +30,20 @@ dependencies and can be installed via pip by running: .. code-block:: console $ cd pygama - $ pip install '.[all]' # single quotes are not needed on bash + $ pip install -e '.[all]' # single quotes are not needed on bash + +.. important:: + + Pip's ``--editable | -e`` flag lets you install the package in "developer + mode", meaning that any change to the source code will be directly + propagated to the installed package and importable in scripts. + +.. tip:: + + It is strongly recommended to work inside a virtual environment, which + guarantees reproducibility and isolation. For more details, see + `learn.scientific-python.org + `_. Code style ---------- @@ -29,13 +51,6 @@ Code style * All functions and methods (arguments and return types) must be `type-annotated `_. Type annotations for variables like class attributes are also highly appreciated. - Do not forget to - - .. code-block:: python - - from __future__ import annotations - - at the top of a module implementation. * Messaging to the user is managed through the :mod:`logging` module. Do not add :func:`print` statements. To make a logging object available in a module, add this: .. code-block:: python  import logging  log = logging.getLogger(__name__)  at the top. In general, try to keep the number of :func:`logging.debug` calls low and use informative messages. :func:`logging.info` calls should be reserved for messages from high-level routines (like - :func:`pygama.dsp.build_dsp`). Good code is never too verbose. + :func:`pygama.dsp.build_dsp`) and used only sparingly. Good code is never too + verbose. * If an error condition leading to undefined behavior occurs, raise an exception. Try to find the most suitable among the `built-in exceptions `_, otherwise ``raise @@ -63,18 +79,19 @@ The pre-commit tool is able to identify common style problems and automatically fix them, wherever possible. Configured hooks are listed in the ``.pre-commit-config.yaml`` file at the project root folder. They are run remotely on the GitHub repository through the `pre-commit bot -`_, but can also be run locally before submitting a -pull request (recommended): +`_, but should also be run locally before submitting a +pull request: .. code-block:: console $ cd pygama $ pip install '.[test]' $ pre-commit run --all-files # analyse the source code and fix it wherever possible - $ pre-commit install # install a Git pre-commit hook (optional but recommended) + $ pre-commit install # install a Git pre-commit hook (strongly recommended) -For a more comprehensive guide, check out the `Scikit-HEP documentation about -code style `_. +For a more comprehensive guide, check out the `learn.scientific-python.org +documentation about code style +`_. Testing ------- @@ -82,26 +99,9 @@ Testing * The :mod:`pygama` test suite is available below ``tests/``. We use `pytest `_ to run tests and analyze their output. As a starting point to learn how to write good tests, reading of `the - Scikit-HEP Intro to testing `_ is - recommended. Refer to `pytest's how-to guides - `_ for a complete - overview. -* :mod:`pygama` tests belong to three categories: - - :unit tests: Should ensure the correct behaviour of each function - independently, possibly without relying on other :mod:`pygama` methods. - The existence of these micro-tests makes it possible to promptly identify - and fix the source of a bug.
An example of this are tests for each single - DSP processor - - :integration tests: Should ensure that independent parts of the code base - work well together and are integrated in a cohesive framework. An example - of this is testing whether :func:`moduleA.process_obj` is able to - correctly handle :class:`moduleB.DataObj` - - :functional tests: High-level tests of realistic applications. An example is - testing whether the processing of a real or synthetic data sample yields - consistent output parameters + relevant learn.scientific-python.org webpage + `_ is + recommended. * Unit tests are automatically run for every push event and pull request to the remote Git repository on a remote server (currently handled by GitHub @@ -125,127 +125,6 @@ Testing $ pytest --cov=pygama -Testing Numba-Wrapped Functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When using Numba to vectorize Python functions, the Python version of the function -does not, by default, get directly tested, but the Numba version instead. In -this case, we need to unwrap the Numba function and test the pure Python version. -With various processors in :mod:`pygama.dsp.processors`, this means that testing -and triggering the code coverage requires this unwrapping. - -Within the testing suite, we use the :func:`@pytest.fixture()` -decorator to include a helper function called ``compare_numba_vs_python`` that -can be used in any test. This function runs both the Numba and pure Python versions -of a function, asserts that they are equal up to floating precision, and returns the -output value. - -As an example, we show a snippet from the test for -:func:`pygama.dsp.processors.fixed_time_pickoff`, a processor which uses the -:func:`@numba.guvectorize()` decorator. - -.. code-block:: python - - def test_fixed_time_pickoff(compare_numba_vs_python): - """Testing function for the fixed_time_pickoff processor.""" - - len_wf = 20 - - # test for nan if w_in has a nan - w_in = np.ones(len_wf) - w_in[4] = np.nan - assert np.isnan(compare_numba_vs_python(fixed_time_pickoff, w_in, 1, ord("i"))) - -In the assertion that the output is what we expect, we use -``compare_numba_vs_python(fixed_time_pickoff, w_in, 1, ord("i"))`` in place of -``fixed_time_pickoff(w_in, 1, ord("i"))``. In general, the replacement to make is -``func(*inputs)`` becomes ``compare_numba_vs_python(func, *inputs)``. - -Note, that in cases of testing for the raising of errors, it is recommended -to instead run the function twice: once with the Numba version, and once using the -:func:`inspect.unwrap` function. We again show a snippet from the test for -:func:`pygama.dsp.processors.fixed_time_pickoff` below. We include the various -required imports in the snippet for verbosity. - -.. code-block:: python - - import inspect - - import numpy as np - import pytest - - from pygama.dsp.errors import DSPFatal - from pygama.dsp.processors import fixed_time_pickoff - - def test_fixed_time_pickoff(compare_numba_vs_python): - "skipping parts of function..." - # test for DSPFatal errors being raised - # noninteger t_in with integer interpolation - with pytest.raises(DSPFatal): - w_in = np.ones(len_wf) - fixed_time_pickoff(w_in, 1.5, ord("i")) - - with pytest.raises(DSPFatal): - a_out = np.empty(len_wf) - inspect.unwrap(fixed_time_pickoff)(w_in, 1.5, ord("i"), a_out) - -In this case, the general idea is to use :func:`pytest.raises` twice, once with -``func(*inputs)``, and again with ``inspect.unwrap(func)(*inputs)``. 
- -Testing Factory Functions that Return Numba-Wrapped Functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -As in the previous section, we also have processors that are first initialized -with a factory function, which then returns a callable Numba-wrapped function. -In this case, there is a slightly different way of testing the function to ensure -full code coverage when using ``compare_numba_vs_python``, as the function -signature is generally different. - -As an example, we show a snippet from the test for -:func:`pygama.dsp.processors.dwt.discrete_wavelet_transform`, a processor which uses -a factory function to return a function wrapped by the -:func:`@numba.guvectorize()` decorator. - -.. code-block:: python - - import numpy as np - import pytest - - from pygama.dsp.errors import DSPFatal - from pygama.dsp.processors import discrete_wavelet_transform - - def test_discrete_wavelet_transform(compare_numba_vs_python): - """Testing function for the discrete_wavelet_transform processor.""" - - # set up values to use for each test case - len_wf_in = 16 - wave_type = 'haar' - level = 2 - len_wf_out = 4 - - # ensure the DSPFatal is raised for a negative level - with pytest.raises(DSPFatal): - discrete_wavelet_transform(wave_type, -1) - - # ensure that a valid input gives the expected output - w_in = np.ones(len_wf_in) - w_out = np.empty(len_wf_out) - w_out_expected = np.ones(len_wf_out) * 2**(level / 2) - - dwt_func = discrete_wavelet_transform(wave_type, level) - assert np.allclose( - compare_numba_vs_python(dwt_func, w_in, w_out), - w_out_expected, - ) - ## rest of test function is truncated in this example - -In this case, the error is raised outside of the Numba-wrapped function, and -we only need to test for the error once. For the comparison of the calculated -values to expectation, we must initialize the output array and pass it to the -list of inputs that should be used in the comparison. This is different than -the previous section, where we are instead now updating the outputted values -in place. - Documentation ------------- @@ -267,7 +146,7 @@ following: other) must be provided as separate pages in ``docs/source/`` and linked in the table of contents. * Jupyter notebooks should be added to the main Git repository below - ``tutorials/``. + ``docs/source/notebooks``. * Before submitting a pull request, contributors are required to build the documentation locally and resolve any warnings or errors.
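As a brief illustration of the code-style rules in the guide above (type annotations, a module-level logger instead of :func:`print`, raising the most suitable built-in exception), here is a minimal sketch; the function and its parameters are invented for this example and are not part of pygama:

.. code-block:: python

    import logging

    # module-level logger, as prescribed by the style guide
    log = logging.getLogger(__name__)


    def calibrate_energy(adc: float, slope: float, offset: float = 0.0) -> float:
        """Convert a raw ADC value to energy with a linear calibration."""
        if slope == 0:
            # raise the most suitable built-in exception instead of printing
            raise ValueError("calibration slope must be non-zero")

        log.debug("calibrating with slope=%g, offset=%g", slope, offset)
        return slope * adc + offset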
diff --git a/pyproject.toml b/pyproject.toml index 8f5058ee8..4d08123b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ write_to = "src/pygama/_version.py" minversion = "6.0" addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] xfail_strict = true -filterwarnings = ["error", "ignore::DeprecationWarning"] +filterwarnings = ["error"] log_cli_level = "info" testpaths = "tests" diff --git a/setup.cfg b/setup.cfg index 58bc7bbef..74c036924 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,16 +32,17 @@ project_urls = packages = find: install_requires = colorlog - dspeed>=1.1 + dspeed>=1.3.0a4 h5py>=3.2 iminuit - legend-daq2lh5>=1.0 - legend-pydataobj>=1.3 + legend-daq2lh5>=1.2.0a1 + legend-pydataobj>=1.5.0a2 matplotlib numba!=0.53.*,!=0.54.*,!=0.57 numpy>=1.21 pandas>=1.4.4 pint + pyarrow scikit-learn scipy>=1.0.1 tables diff --git a/src/pygama/cli.py b/src/pygama/cli.py index a6b59abaf..fb05ef658 100644 --- a/src/pygama/cli.py +++ b/src/pygama/cli.py @@ -80,7 +80,7 @@ def pygama_cli(): def add_lh5ls_parser(subparsers): - """Configure :func:`.lgdo.lh5_store.show` command line interface.""" + """Configure :func:`.lgdo.lh5.show` command line interface.""" parser_lh5ls = subparsers.add_parser( "lh5ls", description="""Inspect LEGEND HDF5 (LH5) file contents""" @@ -99,7 +99,7 @@ def add_lh5ls_parser(subparsers): def lh5_show_cli(args): - """Passes command line arguments to :func:`.lgdo.lh5_store.show`.""" + """Passes command line arguments to :func:`.lgdo.lh5.show`.""" show(args.lh5_file, args.lh5_group, attrs=args.attributes) diff --git a/src/pygama/evt/__init__.py b/src/pygama/evt/__init__.py index 8257a98e3..80b544455 100644 --- a/src/pygama/evt/__init__.py +++ b/src/pygama/evt/__init__.py @@ -2,7 +2,8 @@ Utilities for grouping hit data into events. """ +from .build_evt import build_evt from .build_tcm import build_tcm from .tcm import generate_tcm_cols -__all__ = ["build_tcm", "generate_tcm_cols"] +__all__ = ["build_tcm", "generate_tcm_cols", "build_evt"] diff --git a/src/pygama/evt/aggregators.py b/src/pygama/evt/aggregators.py new file mode 100644 index 000000000..dbcae2829 --- /dev/null +++ b/src/pygama/evt/aggregators.py @@ -0,0 +1,689 @@ +""" +This module provides aggregators to build the `evt` tier. +""" + +from __future__ import annotations + +import awkward as ak +import numpy as np +from lgdo import Array, ArrayOfEqualSizedArrays, VectorOfVectors, lh5 +from lgdo.lh5 import LH5Store +from numpy.typing import NDArray + +from . import utils + + +def evaluate_to_first_or_last( + cumulength: NDArray, + idx: NDArray, + ids: NDArray, + f_hit: str, + f_dsp: str, + chns: list, + chns_rm: list, + expr: str, + exprl: list, + qry: str | NDArray, + nrows: int, + sorter: tuple, + var_ph: dict = None, + defv: bool | int | float = np.nan, + is_first: bool = True, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> Array: + """Aggregates across channels by returning the expression of the channel + with the smallest (or largest) value of `sorter`, depending on `is_first`. + + Parameters + ---------- + cumulength + `tcm` cumulative length array, used to map hits to event rows. + idx + `tcm` index array. + ids + `tcm` id array. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + chns + list of channels to be aggregated. + chns_rm + list of channels to be skipped from evaluation and set to default value. + expr + expression string to be evaluated. + exprl + list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. + qry + query expression to mask aggregation. + nrows + length of output array.
+ sorter + tuple of field in `hit/dsp/evt` tier to evaluate ``(tier, field)``. + var_ph + dictionary of `evt` and additional parameters and their values. + defv + default value. + is_first + if ``True``, keep the channel with the smallest value of `sorter`; + otherwise the largest. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file. + evt_group + LH5 root group in `evt` file. + """ + + # define dimension of output array + out = np.full(nrows, defv, dtype=type(defv)) + outt = np.zeros(len(out)) + + store = LH5Store() + + for ch in chns: + # get index list for this channel to be loaded + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + evt_ids_ch = np.searchsorted( + cumulength, + np.where(ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0], + "right", + ) + + # evaluate at channel + res = utils.get_data_at_channel( + ch=ch, + ids=ids, + idx=idx, + expr=expr, + exprl=exprl, + var_ph=var_ph, + is_evaluated=ch not in chns_rm, + f_hit=f_hit, + f_dsp=f_dsp, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + # get mask from query + limarr = utils.get_mask_from_query( + qry=qry, + length=len(res), + ch=ch, + idx_ch=idx_ch, + f_hit=f_hit, + f_dsp=f_dsp, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + # find if sorter is in hit or dsp + t0 = store.read( + f"{ch}/{sorter[0]}/{sorter[1]}", + f_hit if f"{hit_group}" == sorter[0] else f_dsp, + idx=idx_ch, + )[0].view_as("np") + + if t0.ndim > 1: + raise ValueError(f"sorter '{sorter[0]}/{sorter[1]}' must be a 1D array") + + if is_first: + if ch == chns[0]: + outt[:] = np.inf + + out[evt_ids_ch] = np.where( + (t0 < outt[evt_ids_ch]) & (limarr), res, out[evt_ids_ch] + ) + outt[evt_ids_ch] = np.where( + (t0 < outt[evt_ids_ch]) & (limarr), t0, outt[evt_ids_ch] + ) + + else: + out[evt_ids_ch] = np.where( + (t0 > outt[evt_ids_ch]) & (limarr), res, out[evt_ids_ch] + ) + outt[evt_ids_ch] = np.where( + (t0 > outt[evt_ids_ch]) & (limarr), t0, outt[evt_ids_ch] + ) + + return Array(nda=out, dtype=type(defv)) + + +def evaluate_to_scalar( + mode: str, + cumulength: NDArray, + idx: NDArray, + ids: NDArray, + f_hit: str, + f_dsp: str, + chns: list, + chns_rm: list, + expr: str, + exprl: list, + qry: str | NDArray, + nrows: int, + var_ph: dict = None, + defv: bool | int | float = np.nan, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> Array: + """Aggregates across channels by summation or logical reduction, + depending on `mode` (``sum``, ``any``, ``all``). + + Parameters + ---------- + mode + aggregation mode. + idx + `tcm` index array. + ids + `tcm` id array. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + chns + list of channels to be aggregated. + chns_rm + list of channels to be skipped from evaluation and set to default value. + expr + expression string to be evaluated. + exprl + list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. + qry + query expression to mask aggregation. + nrows + length of output array. + var_ph + dictionary of `evt` and additional parameters and their values. + defv + default value. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file.
+ evt_group + LH5 root group in `evt` file. + """ + + # define dimension of output array + out = np.full(nrows, defv, dtype=type(defv)) + + for ch in chns: + # get index list for this channel to be loaded + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + evt_ids_ch = np.searchsorted( + cumulength, + np.where(ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0], + "right", + ) + + res = utils.get_data_at_channel( + ch=ch, + ids=ids, + idx=idx, + expr=expr, + exprl=exprl, + var_ph=var_ph, + is_evaluated=ch not in chns_rm, + f_hit=f_hit, + f_dsp=f_dsp, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + # get mask from query + limarr = utils.get_mask_from_query( + qry=qry, + length=len(res), + ch=ch, + idx_ch=idx_ch, + f_hit=f_hit, + f_dsp=f_dsp, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + # switch through modes + if "sum" == mode: + if res.dtype == bool: + res = res.astype(int) + out[evt_ids_ch] = np.where(limarr, res + out[evt_ids_ch], out[evt_ids_ch]) + if "any" == mode: + if res.dtype != bool: + res = res.astype(bool) + out[evt_ids_ch] = out[evt_ids_ch] | (res & limarr) + if "all" == mode: + if res.dtype != bool: + res = res.astype(bool) + out[evt_ids_ch] = out[evt_ids_ch] & res & limarr + + return Array(nda=out, dtype=type(defv)) + + +def evaluate_at_channel( + cumulength: NDArray, + idx: NDArray, + ids: NDArray, + f_hit: str, + f_dsp: str, + chns_rm: list, + expr: str, + exprl: list, + ch_comp: Array, + var_ph: dict = None, + defv: bool | int | float = np.nan, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> Array: + """Aggregates by evaluating the expression at a given channel. + + Parameters + ---------- + idx + `tcm` index array. + ids + `tcm` id array. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + chns_rm + list of channels to be skipped from evaluation and set to default value. + expr + expression string to be evaluated. + exprl + list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. + ch_comp + array of rawids at which the expression is evaluated. + var_ph + dictionary of `evt` and additional parameters and their values. + defv + default value. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file. + evt_group + LH5 root group in `evt` file. 
+ """ + + out = np.full(len(ch_comp.nda), defv, dtype=type(defv)) + + for ch in np.unique(ch_comp.nda.astype(int)): + # skip default value + if utils.get_table_name_by_pattern(tcm_id_table_pattern, ch) not in lh5.ls( + f_hit + ): + continue + idx_ch = idx[ids == ch] + evt_ids_ch = np.searchsorted(cumulength, np.where(ids == ch)[0], "right") + res = utils.get_data_at_channel( + ch=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch), + ids=ids, + idx=idx, + expr=expr, + exprl=exprl, + var_ph=var_ph, + is_evaluated=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch) + not in chns_rm, + f_hit=f_hit, + f_dsp=f_dsp, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + out[evt_ids_ch] = np.where(ch == ch_comp.nda[idx_ch], res, out[evt_ids_ch]) + + return Array(nda=out, dtype=type(defv)) + + +def evaluate_at_channel_vov( + cumulength: NDArray, + idx: NDArray, + ids: NDArray, + f_hit: str, + f_dsp: str, + expr: str, + exprl: list, + ch_comp: VectorOfVectors, + chns_rm: list, + var_ph: dict = None, + defv: bool | int | float = np.nan, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> VectorOfVectors: + """Same as :func:`evaluate_at_channel` but evaluates expression at non + flat channels :class:`.VectorOfVectors`. + + Parameters + ---------- + idx + `tcm` index array. + ids + `tcm` id array. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + expr + expression string to be evaluated. + exprl + list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. + ch_comp + array of "rawid"s at which the expression is evaluated. + chns_rm + list of channels to be skipped from evaluation and set to default value. + var_ph + dictionary of `evt` and additional parameters and their values. + defv + default value. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file. + evt_group + LH5 root group in `evt` file. 
+ """ + + # blow up vov to aoesa + out = ak.Array([[] for _ in range(len(ch_comp))]) + + chns = np.unique(ch_comp.flattened_data.nda).astype(int) + ch_comp = ch_comp.view_as("ak") + + type_name = None + for ch in chns: + evt_ids_ch = np.searchsorted(cumulength, np.where(ids == ch)[0], "right") + res = utils.get_data_at_channel( + ch=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch), + ids=ids, + idx=idx, + expr=expr, + exprl=exprl, + var_ph=var_ph, + is_evaluated=utils.get_table_name_by_pattern(tcm_id_table_pattern, ch) + not in chns_rm, + f_hit=f_hit, + f_dsp=f_dsp, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + # see in which events the current channel is present + mask = ak.to_numpy(ak.any(ch_comp == ch, axis=-1), allow_missing=False) + cv = np.full(len(ch_comp), np.nan) + cv[evt_ids_ch] = res + cv[~mask] = np.nan + cv = ak.drop_none(ak.nan_to_none(ak.Array(cv)[:, None])) + + out = ak.concatenate((out, cv), axis=-1) + + if ch == chns[0]: + type_name = res.dtype + + return VectorOfVectors(ak.values_astype(out, type_name), dtype=type_name) + + +def evaluate_to_aoesa( + cumulength: NDArray, + idx: NDArray, + ids: NDArray, + f_hit: str, + f_dsp: str, + chns: list, + chns_rm: list, + expr: str, + exprl: list, + qry: str | NDArray, + nrows: int, + var_ph: dict = None, + defv: bool | int | float = np.nan, + missv=np.nan, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> ArrayOfEqualSizedArrays: + """Aggregates by returning an :class:`.ArrayOfEqualSizedArrays` of evaluated + expressions of channels that fulfill a query expression. + + Parameters + ---------- + idx + `tcm` index array. + ids + `tcm` id array. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + chns + list of channels to be aggregated. + chns_rm + list of channels to be skipped from evaluation and set to default value. + expr + expression string to be evaluated. + exprl + list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. + qry + query expression to mask aggregation. + nrows + length of output :class:`.VectorOfVectors`. + ch_comp + array of "rawid"s at which the expression is evaluated. + var_ph + dictionary of `evt` and additional parameters and their values. + defv + default value. + missv + missing value. + sorter + sorts the entries in the vector according to sorter expression. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file. + evt_group + LH5 root group in `evt` file. 
+ """ + # define dimension of output array + out = np.full((nrows, len(chns)), missv) + + i = 0 + for ch in chns: + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + evt_ids_ch = np.searchsorted( + cumulength, + np.where(ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0], + "right", + ) + res = utils.get_data_at_channel( + ch=ch, + ids=ids, + idx=idx, + expr=expr, + exprl=exprl, + var_ph=var_ph, + is_evaluated=ch not in chns_rm, + f_hit=f_hit, + f_dsp=f_dsp, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + # get mask from query + limarr = utils.get_mask_from_query( + qry=qry, + length=len(res), + ch=ch, + idx_ch=idx_ch, + f_hit=f_hit, + f_dsp=f_dsp, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + out[evt_ids_ch, i] = np.where(limarr, res, out[evt_ids_ch, i]) + + i += 1 + + return ArrayOfEqualSizedArrays(nda=out) + + +def evaluate_to_vector( + cumulength: NDArray, + idx: NDArray, + ids: NDArray, + f_hit: str, + f_dsp: str, + chns: list, + chns_rm: list, + expr: str, + exprl: list, + qry: str | NDArray, + nrows: int, + var_ph: dict = None, + defv: bool | int | float = np.nan, + sorter: str = None, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> VectorOfVectors: + """Aggregates by returning a :class:`.VectorOfVector` of evaluated + expressions of channels that fulfill a query expression. + + Parameters + ---------- + idx + `tcm` index array. + ids + `tcm` id array. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + chns + list of channels to be aggregated. + chns_rm + list of channels to be skipped from evaluation and set to default value. + expr + expression string to be evaluated. + exprl + list of `dsp/hit/evt` parameter tuples in expression ``(tier, field)``. + qry + query expression to mask aggregation. + nrows + length of output :class:`.VectorOfVectors`. + ch_comp + array of "rawids" at which the expression is evaluated. + var_ph + dictionary of `evt` and additional parameters and their values. + defv + default value. + sorter + sorts the entries in the vector according to sorter expression. + ``ascend_by:`` results in an vector ordered ascending, + ``decend_by:`` sorts descending. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file. + evt_group + LH5 root group in `evt` file. 
+ """ + out = evaluate_to_aoesa( + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + f_dsp=f_dsp, + chns=chns, + chns_rm=chns_rm, + expr=expr, + exprl=exprl, + qry=qry, + nrows=nrows, + var_ph=var_ph, + defv=defv, + missv=np.nan, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ).view_as("np") + + # if a sorter is given sort accordingly + if sorter is not None: + md, fld = sorter.split(":") + s_val = evaluate_to_aoesa( + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + f_dsp=f_dsp, + chns=chns, + chns_rm=chns_rm, + expr=fld, + exprl=[tuple(fld.split("."))], + qry=None, + nrows=nrows, + missv=np.nan, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ).view_as("np") + if "ascend_by" == md: + out = out[np.arange(len(out))[:, None], np.argsort(s_val)] + + elif "descend_by" == md: + out = out[np.arange(len(out))[:, None], np.argsort(-s_val)] + else: + raise ValueError( + "sorter values can only have 'ascend_by' or 'descend_by' prefixes" + ) + + return VectorOfVectors( + ak.values_astype(ak.drop_none(ak.nan_to_none(ak.Array(out))), type(defv)), + dtype=type(defv), + ) diff --git a/src/pygama/evt/build_evt.py b/src/pygama/evt/build_evt.py new file mode 100644 index 000000000..5f7949bdb --- /dev/null +++ b/src/pygama/evt/build_evt.py @@ -0,0 +1,589 @@ +""" +This module implements routines to build the `evt` tier. +""" + +from __future__ import annotations + +import itertools +import json +import logging +import re +from importlib import import_module + +import awkward as ak +import numpy as np +from lgdo import Array, ArrayOfEqualSizedArrays, Table, VectorOfVectors, lh5 +from lgdo.lh5 import LH5Store + +from . import aggregators, utils + +log = logging.getLogger(__name__) + + +def build_evt( + f_tcm: str, + f_dsp: str, + f_hit: str, + evt_config: str | dict, + f_evt: str | None = None, + wo_mode: str = "write_safe", + evt_group: str = "evt", + tcm_group: str = "hardware_tcm_1", + dsp_group: str = "dsp", + hit_group: str = "hit", + tcm_id_table_pattern: str = "ch{}", +) -> None | Table: + """Transform data from the `hit` and `dsp` levels which a channel sorted to a + event sorted data format. + + Parameters + ---------- + f_tcm + input LH5 file of the `tcm` level. + f_dsp + input LH5 file of the `dsp` level. + f_hit + input LH5 file of the `hit` level. + evt_config + name of configuration file or dictionary defining event fields. Channel + lists can be defined by importing a metadata module. + + - ``operations`` defines the fields ``name=key``, where ``channels`` + specifies the channels used to for this field (either a string or a + list of strings), + - ``aggregation_mode`` defines how the channels should be combined (see + :func:`evaluate_expression`). + - ``expression`` defnies the mathematical/special function to apply + (see :func:`evaluate_expression`), + - ``query`` defines an expression to mask the aggregation. + - ``parameters`` defines any other parameter used in expression. + + For example: + + .. 
code-block:: json + + { + "channels": { + "geds_on": ["ch1084803", "ch1084804", "ch1121600"], + "spms_on": ["ch1057600", "ch1059201", "ch1062405"], + "muon": "ch1027202" + }, + "operations": { + "energy_id":{ + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal > 25", + "expression": "tcm.array_id", + "sort": "ascend_by:dsp.tp_0_est" + }, + "energy":{ + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "hit.cuspEmax_ctc_cal > 25" + }, + "is_muon_rejected":{ + "channels": "muon", + "aggregation_mode": "any", + "expression": "dsp.wf_max>a", + "parameters": {"a":15100}, + "initial": false + }, + "multiplicity":{ + "channels": ["geds_on", "geds_no_psd", "geds_ac"], + "aggregation_mode": "sum", + "expression": "hit.cuspEmax_ctc_cal > a", + "parameters": {"a":25}, + "initial": 0 + }, + "t0":{ + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "dsp.tp_0_est" + }, + "lar_energy":{ + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_energy(0.5, evt.t0, 48000, 1000, 5000)" + } + } + } + + f_evt + name of the output file. If ``None``, return the output :class:`.Table` + instead of writing to disk. + wo_mode + writing mode. + evt_group + LH5 root group name of `evt` tier. + tcm_group + LH5 root group in `tcm` file. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must + have one placeholder which is the `tcm` id. + """ + + store = LH5Store() + tbl_cfg = evt_config + if not isinstance(tbl_cfg, (str, dict)): + raise TypeError("evt_config must be a dict or a path to a JSON config file") + if isinstance(tbl_cfg, str): + with open(tbl_cfg) as f: + tbl_cfg = json.load(f) + + if "channels" not in tbl_cfg.keys(): + raise ValueError("channels field needs to be specified in the config") + if "operations" not in tbl_cfg.keys(): + raise ValueError("operations field needs to be specified in the config") + + # check tcm_id_table_pattern validity + pattern_check = re.findall(r"{([^}]*?)}", tcm_id_table_pattern) + if len(pattern_check) != 1: + raise ValueError( + f"tcm_id_table_pattern must have exactly one placeholder. {tcm_id_table_pattern} is invalid." + ) + elif "{" in pattern_check[0] or "}" in pattern_check[0]: + raise ValueError( + f"tcm_id_table_pattern {tcm_id_table_pattern} has an invalid placeholder." + ) + + if ( + utils.get_table_name_by_pattern( + tcm_id_table_pattern, + utils.get_tcm_id_by_pattern(tcm_id_table_pattern, lh5.ls(f_hit)[0]), + ) + != lh5.ls(f_hit)[0] + ): + raise ValueError( + f"tcm_id_table_pattern {tcm_id_table_pattern} does not match keys in data!" + ) + + # create channel list according to config + # This can be either read from the meta data + # or a list of channel names + log.debug("Creating channel dictionary") + + chns = {} + + for k, v in tbl_cfg["channels"].items(): + if isinstance(v, dict): + # it is a metadata module:
the "module" key must exist + if "module" not in v.keys(): + raise ValueError( + "Need 'module' key to load channel list via a metadata module" + ) + + attr = {} + # the time_key argument is set to the time key of the DSP file + # in case it is not provided by the config + if "time_key" not in v.keys(): + attr["time_key"] = re.search(r"\d{8}T\d{6}Z", f_dsp).group(0) + + # if "None" do None + elif "None" == v["time_key"]: + attr["time_key"] = None + + # load module + p, m = v["module"].rsplit(".", 1) + met = getattr(import_module(p, package=__package__), m) + chns[k] = met(v | attr) + + elif isinstance(v, str): + chns[k] = [v] + + elif isinstance(v, list): + chns[k] = list(v) + + nrows = store.read_n_rows(f"/{tcm_group}/cumulative_length", f_tcm) + + table = Table(size=nrows) + + for k, v in tbl_cfg["operations"].items(): + log.debug("Processing field " + k) + + # if mode not defined in operation, it can only be an operation on the evt level. + if "aggregation_mode" not in v.keys(): + var = {} + if "parameters" in v.keys(): + var = var | v["parameters"] + res = table.eval(v["expression"].replace(f"{evt_group}.", ""), var) + + # add attribute if present + if "lgdo_attrs" in v.keys(): + res.attrs |= v["lgdo_attrs"] + + table.add_field(k, res) + + # Else we build the event entry + else: + if "channels" not in v.keys(): + chns_e = [] + elif isinstance(v["channels"], str): + chns_e = chns[v["channels"]] + elif isinstance(v["channels"], list): + chns_e = list( + itertools.chain.from_iterable([chns[e] for e in v["channels"]]) + ) + chns_rm = [] + if "exclude_channels" in v.keys(): + if isinstance(v["exclude_channels"], str): + chns_rm = chns[v["exclude_channels"]] + elif isinstance(v["exclude_channels"], list): + chns_rm = list( + itertools.chain.from_iterable( + [chns[e] for e in v["exclude_channels"]] + ) + ) + + pars, qry, defaultv, srter = None, None, np.nan, None + if "parameters" in v.keys(): + pars = v["parameters"] + if "query" in v.keys(): + qry = v["query"] + if "initial" in v.keys(): + defaultv = v["initial"] + if isinstance(defaultv, str) and ( + defaultv in ["np.nan", "np.inf", "-np.inf"] + ): + defaultv = eval(defaultv) + if "sort" in v.keys(): + srter = v["sort"] + + obj = evaluate_expression( + f_tcm=f_tcm, + f_hit=f_hit, + f_dsp=f_dsp, + chns=chns_e, + chns_rm=chns_rm, + mode=v["aggregation_mode"], + expr=v["expression"], + nrows=nrows, + table=table, + para=pars, + qry=qry, + defv=defaultv, + sorter=srter, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + tcm_group=tcm_group, + ) + + # add attribute if present + if "lgdo_attrs" in v.keys(): + obj.attrs |= v["lgdo_attrs"] + + table.add_field(k, obj) + + # write output fields into f_evt + if "outputs" in tbl_cfg.keys(): + if len(tbl_cfg["outputs"]) < 1: + log.warning("No output fields specified, no file will be written.") + return table + else: + clms_to_remove = [e for e in table.keys() if e not in tbl_cfg["outputs"]] + for fld in clms_to_remove: + table.remove_field(fld, True) + + if f_evt: + store.write( + obj=table, name=f"/{evt_group}/", lh5_file=f_evt, wo_mode=wo_mode + ) + else: + return table + else: + log.warning("No output fields specified, no file will be written.") + + key = re.search(r"\d{8}T\d{6}Z", f_hit).group(0) + log.info( + f"Applied {len(tbl_cfg['operations'])} operations to key {key} and saved " + f"{len(tbl_cfg['outputs'])} evt fields across {len(chns)} channel groups" + ) + + +def evaluate_expression( + f_tcm: str, + f_hit: str, + f_dsp: str,
chns: list, + chns_rm: list, + mode: str, + expr: str, + nrows: int, + table: Table = None, + para: dict = None, + qry: str = None, + defv: bool | int | float = np.nan, + sorter: str = None, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", + tcm_group: str = "tcm", +) -> Array | ArrayOfEqualSizedArrays | VectorOfVectors: + """Evaluates the expression defined by the user across all channels + according to the mode. + + Parameters + ---------- + f_tcm + path to `tcm` tier file. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + chns + list of channel names across which expression gets evaluated (form: + ``ch``). + chns_rm + list of channels which get set to default value during evaluation. In + function mode they are removed entirely (form: ``ch``). + mode + The mode determines how the event entry is calculated across channels. + Options are: + + - ``first_at:sorter``: aggregates across channels by returning the + expression of the channel with smallest value of sorter. + - ``last_at:sorter``: aggregates across channels by returning the expression of + the channel with largest value of sorter. + - ``sum``: aggregates by summation. + - ``any``: aggregates by logical or. + - ``all``: aggregates by logical and. + - ``keep_at_ch:ch_field``: aggregates according to passed ch_field. + - ``keep_at_idx:tcm_idx_field``: aggregates according to passed tcm + index field. + - ``gather``: Channels are not combined, but result saved as + :class:`.VectorOfVectors`. + - ``function``: the expression is evaluated by a special processing + function (see `expr` below). + + qry + a query that can mask the aggregation. + expr + the expression. That can be any mathematical equation/comparison. If + `mode` is ``function``, the expression needs to be a special processing + function defined in modules (e.g. :func:`.modules.spm.get_energy`). In + the expression, parameters from the hit, dsp or evt tier (from + operations performed before this one; dictionary operations order + matters) or from the ``parameters`` field can be used. + nrows + number of rows to be processed. + table + table of `evt` tier data. + para + dictionary of parameters defined in the ``parameters`` field in the + configuration dictionary. + defv + default value of evaluation. + sorter + can be used to sort vector outputs according to sorter expression (see + :func:`evaluate_to_vector`). + tcm_id_table_pattern + pattern to format tcm id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. + evt_group + LH5 root group name of `evt` tier. + tcm_group + LH5 root group in `tcm` file. + dsp_group + LH5 root group in `dsp` file. + hit_group + LH5 root group in `hit` file.
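+ + As an illustration, a minimal call gathering a `hit`-tier field across + two channels could look like this (file and channel names here are + hypothetical): + + .. code-block:: python + + out = evaluate_expression( + f_tcm="run0_tcm.lh5", + f_hit="run0_hit.lh5", + f_dsp="run0_dsp.lh5", + chns=["ch1084803", "ch1084804"], + chns_rm=[], + mode="gather", + expr="hit.cuspEmax_ctc_cal", + nrows=100, + )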
+ """ + + store = LH5Store() + + # find parameters in evt file or in parameters + exprl = re.findall( + rf"({evt_group}|{hit_group}|{dsp_group}).([a-zA-Z_$][\w$]*)", expr + ) + var_ph = {} + if table: + var_ph = var_ph | { + e: table[e].view_as("ak") + for e in table.keys() + if isinstance(table[e], (Array, ArrayOfEqualSizedArrays, VectorOfVectors)) + } + if para: + var_ph = var_ph | para + + if mode == "function": + # evaluate expression + func, params = expr.split("(") + params = ( + params.replace(f"{dsp_group}.", f"{dsp_group}_") + .replace(f"{hit_group}.", f"{hit_group}_") + .replace(f"{evt_group}.", "") + ) + params = [ + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + [x for x in chns if x not in chns_rm], + ] + [utils.num_and_pars(e, var_ph) for e in params[:-1].split(",")] + + # load function dynamically + p, m = func.rsplit(".", 1) + met = getattr(import_module(p, package=__package__), m) + return met(*params) + + else: + # check if query is either on channel basis or evt basis (and not a mix) + qry_mask = qry + if qry is not None: + if f"{evt_group}." in qry and ( + f"{hit_group}." in qry or f"{dsp_group}." in qry + ): + raise ValueError( + f"Query can't be a mix of {evt_group} tier and lower tiers." + ) + + # if it is an evt query we can evaluate it directly here + if table and f"{evt_group}." in qry: + qry_mask = eval(qry.replace(f"{evt_group}.", ""), table) + + # load TCM data to define an event + ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") + idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") + cumulength = store.read(f"/{tcm_group}/cumulative_length", f_tcm)[0].view_as( + "np" + ) + + # switch through modes + if table and (("keep_at_ch:" == mode[:11]) or ("keep_at_idx:" == mode[:12])): + if "keep_at_ch:" == mode[:11]: + ch_comp = table[mode[11:].replace(f"{evt_group}.", "")] + else: + ch_comp = table[mode[12:].replace(f"{evt_group}.", "")] + if isinstance(ch_comp, Array): + ch_comp = Array(nda=ids[ch_comp.view_as("np")]) + elif isinstance(ch_comp, VectorOfVectors): + ch_comp = ch_comp.view_as("ak") + ch_comp = VectorOfVectors( + array=ak.unflatten( + ids[ak.flatten(ch_comp)], ak.count(ch_comp, axis=-1) + ) + ) + else: + raise NotImplementedError( + type(ch_comp) + + " not supported (only Array and VectorOfVectors are supported)" + ) + + if isinstance(ch_comp, Array): + return aggregators.evaluate_at_channel( + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + f_dsp=f_dsp, + chns_rm=chns_rm, + expr=expr, + exprl=exprl, + ch_comp=ch_comp, + var_ph=var_ph, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + elif isinstance(ch_comp, VectorOfVectors): + return aggregators.evaluate_at_channel_vov( + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + f_dsp=f_dsp, + expr=expr, + exprl=exprl, + ch_comp=ch_comp, + chns_rm=chns_rm, + var_ph=var_ph, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + else: + raise NotImplementedError( + type(ch_comp) + + " not supported (only Array and VectorOfVectors are supported)" + ) + elif "first_at:" in mode or "last_at:" in mode: + sorter = tuple( + re.findall( + rf"({evt_group}|{hit_group}|{dsp_group}).([a-zA-Z_$][\w$]*)", + mode.split("first_at:")[-1], + )[0] + ) + return aggregators.evaluate_to_first_or_last( + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + 
f_dsp=f_dsp, + chns=chns, + chns_rm=chns_rm, + expr=expr, + exprl=exprl, + qry=qry_mask, + nrows=nrows, + sorter=sorter, + var_ph=var_ph, + defv=defv, + is_first=True if "first_at:" in mode else False, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + elif mode in ["sum", "any", "all"]: + return aggregators.evaluate_to_scalar( + mode=mode, + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + f_dsp=f_dsp, + chns=chns, + chns_rm=chns_rm, + expr=expr, + exprl=exprl, + qry=qry_mask, + nrows=nrows, + var_ph=var_ph, + defv=defv, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + elif "gather" == mode: + return aggregators.evaluate_to_vector( + cumulength=cumulength, + idx=idx, + ids=ids, + f_hit=f_hit, + f_dsp=f_dsp, + chns=chns, + chns_rm=chns_rm, + expr=expr, + exprl=exprl, + qry=qry_mask, + nrows=nrows, + var_ph=var_ph, + defv=defv, + sorter=sorter, + tcm_id_table_pattern=tcm_id_table_pattern, + evt_group=evt_group, + hit_group=hit_group, + dsp_group=dsp_group, + ) + else: + raise ValueError(mode + " not a valid mode") diff --git a/src/pygama/evt/build_tcm.py b/src/pygama/evt/build_tcm.py index 7bb0bbef3..05c7638c4 100644 --- a/src/pygama/evt/build_tcm.py +++ b/src/pygama/evt/build_tcm.py @@ -2,7 +2,8 @@ import re -import lgdo as lgdo +import lgdo +from lgdo import lh5 from . import tcm as ptcm @@ -49,7 +50,7 @@ def build_tcm( out_name name for the TCM table in the output file. wo_mode - mode to send to :meth:`~.lgdo.lh5_store.LH5Store.write_object`. + mode to send to :meth:`~.lgdo.lh5.LH5Store.write`. See Also -------- @@ -57,7 +58,7 @@ def build_tcm( """ # hash_func: later can add list or dict or a function(str) --> int. 
- store = lgdo.LH5Store() + store = lh5.LH5Store() coin_data = [] array_ids = [] all_tables = [] @@ -65,7 +66,7 @@ if isinstance(patterns, str): patterns = [patterns] for pattern in patterns: - tables = lgdo.ls(filename, lh5_group=pattern) + tables = lh5.ls(filename, lh5_group=pattern) for table in tables: all_tables.append(table) array_id = len(array_ids) @@ -79,7 +80,7 @@ else: array_id = len(all_tables) - 1 table = table + "/" + coin_col - coin_data.append(store.read_object(table, filename)[0].nda) + coin_data.append(store.read(table, filename)[0].nda) array_ids.append(array_id) tcm_cols = ptcm.generate_tcm_cols( @@ -94,6 +95,6 @@ ) if out_file is not None: - store.write_object(tcm, out_name, out_file, wo_mode=wo_mode) + store.write(tcm, out_name, out_file, wo_mode=wo_mode) return tcm diff --git a/src/pygama/evt/modules/__init__.py b/src/pygama/evt/modules/__init__.py new file mode 100644 index 000000000..bd80462f8 --- /dev/null +++ b/src/pygama/evt/modules/__init__.py @@ -0,0 +1,21 @@ +""" +Contains submodules for evt processing +""" + +from .spm import ( + get_energy, + get_energy_dplms, + get_etc, + get_majority, + get_majority_dplms, + get_time_shift, +) + +__all__ = [ + "get_energy", + "get_majority", + "get_energy_dplms", + "get_majority_dplms", + "get_etc", + "get_time_shift", +] diff --git a/src/pygama/evt/modules/legend.py b/src/pygama/evt/modules/legend.py new file mode 100644 index 000000000..2ee2d7e8e --- /dev/null +++ b/src/pygama/evt/modules/legend.py @@ -0,0 +1,35 @@ +""" +Module providing LEGEND internal functions +""" +from importlib import import_module + +from lgdo.lh5 import utils + + +def metadata(params: dict) -> list: + # import the legendmeta package only when actually needed: LEGEND + # collaborators can use the meta keyword, while users without access + # to the LEGEND metadata can still use the rest of pygama + lm = import_module("legendmeta") + lmeta = lm.LegendMetadata(path=utils.expand_path(params["meta_path"])) + chmap = lmeta.channelmap(params["time_key"]) + + tmp = [ + f"ch{e}" + for e in chmap.map("daq.rawid") + if chmap.map("daq.rawid")[e]["system"] == params["system"] + ] + + if "selectors" in params.keys(): + for k in params["selectors"].keys(): + s = "" + for e in k.split("."): + s += f"['{e}']" + + tmp = [ + e + for e in tmp + if eval("dotter" + s, {"dotter": chmap.map("daq.rawid")[int(e[2:])]}) + == params["selectors"][k] + ] + return tmp diff --git a/src/pygama/evt/modules/spm.py b/src/pygama/evt/modules/spm.py new file mode 100644 index 000000000..2dc5a4290 --- /dev/null +++ b/src/pygama/evt/modules/spm.py @@ -0,0 +1,525 @@ +""" +Module for special event level routines for SiPMs + +functions must take as the first 8 args in order: +- path to the hit file +- path to the dsp file +- path to the tcm file +- LH5 root group in hit file +- LH5 root group in dsp file +- LH5 root group in tcm file +- pattern to format tcm id values to table name in higher tiers +- list of channels processed +""" + +import awkward as ak +import numpy as np +from lgdo import Array, VectorOfVectors +from lgdo.lh5 import LH5Store + +from .. import utils + + +def cast_trigger( + trgr, + tdefault: float, + length: int = None, +) -> ak.Array: + if isinstance(trgr, Array): + return ak.fill_none(ak.nan_to_none(trgr.view_as("ak")), tdefault) + + elif isinstance(trgr, VectorOfVectors): + return ak.fill_none( + ak.min(ak.fill_none(trgr.view_as("ak"), tdefault), axis=-1), tdefault + ) + + elif isinstance(trgr, ak.Array): + if trgr.ndim == 1: + return ak.fill_none(trgr, tdefault) + elif trgr.ndim == 2: + return ak.fill_none(ak.min(ak.fill_none(trgr, tdefault), axis=-1), tdefault) + else: + raise ValueError(f"Too many dimensions: {trgr.ndim}") + elif isinstance(trgr, (float, int)) and isinstance(length, int): + return ak.Array([trgr] * length) + else: + raise ValueError(f"Can't deal with t0 of type {type(trgr)}") + + +# get SiPM coincidence window mask +def get_spm_mask( + lim: float, trgr:
ak.Array, tmin: float, tmax: float, pe: ak.Array, times: ak.Array +) -> ak.Array: + if trgr.ndim != 1: + raise ValueError("trigger array must be 1 dimensional!") + if (len(trgr) != len(pe)) or (len(trgr) != len(times)): + raise ValueError( + f"All arrays must have the same length along the first axis: len(pe)={len(pe)}, len(times)={len(times)}, len(trgr)={len(trgr)}" + ) + + tmi = trgr - tmin + tma = trgr + tmax + + mask = ( + ((times * 16.0) < tma[:, None]) & ((times * 16.0) > tmi[:, None]) & (pe > lim) + ) + return mask + + +# get LAr indices according to mask per event over all channels +# mode 0 -> return pulse indices +# mode 1 -> return tcm indices +# mode 2 -> return rawids +# mode 3 -> return tcm_idx +def get_masked_tcm_idx( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + mode=0, +) -> VectorOfVectors: + # load TCM data to define an event + store = LH5Store() + ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") + idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") + + arr_lst = [] + + if isinstance(trgr, (float, int)): + tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) + else: + tge = cast_trigger(trgr, tdefault, length=None) + + for ch in chs: + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + + pe = store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[0].view_as( + "np" + ) + tmp = np.full((np.max(idx) + 1, len(pe[0])), np.nan) + tmp[idx_ch] = pe + pe = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) + + # times are in sample units + times = store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ + 0 + ].view_as("np") + tmp = np.full((np.max(idx) + 1, len(times[0])), np.nan) + tmp[idx_ch] = times + times = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) + + mask = get_spm_mask(lim, tge, tmin, tmax, pe, times) + + if mode == 0: + out_idx = ak.local_index(mask)[mask] + + elif mode == 1: + out_idx = np.full((np.max(idx) + 1), np.nan) + out_idx[idx_ch] = np.where( + ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch) + )[0] + out_idx = ak.drop_none(ak.nan_to_none(ak.Array(out_idx)[:, None])) + out_idx = out_idx[mask[mask] - 1] + + elif mode == 2: + out_idx = ak.Array( + [utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] * len(mask) + ) + out_idx = out_idx[:, None][mask[mask] - 1] + + elif mode == 3: + out_idx = np.full((np.max(idx) + 1), np.nan) + out_idx[idx_ch] = idx_ch + out_idx = ak.drop_none(ak.nan_to_none(ak.Array(out_idx)[:, None])) + out_idx = out_idx[mask[mask] - 1] + + else: + raise ValueError("Unknown mode") + + arr_lst.append(out_idx) + + return VectorOfVectors(array=ak.concatenate(arr_lst, axis=-1)) + + +def get_spm_ene_or_maj( + f_hit, + f_tcm, + hit_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + mode, +): + if mode not in ["energy_hc", "energy_dplms", "majority_hc", "majority_dplms"]: + raise ValueError("Unknown mode") + + # load TCM data to define an event + store = LH5Store() + ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") + idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") + out = np.zeros(np.max(idx) + 1) + + if isinstance(trgr, (float, int)): + tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) + else: + tge = cast_trigger(trgr, tdefault, length=None) + + for ch in chs: + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + + if mode in ["energy_dplms",
"majority_dplms"]: + pe = ak.drop_none( + ak.nan_to_none( + store.read( + f"{ch}/{hit_group}/energy_in_pe_dplms", f_hit, idx=idx_ch + )[0].view_as("ak") + ) + ) + + # times are in sample units + times = ak.drop_none( + ak.nan_to_none( + store.read( + f"{ch}/{hit_group}/trigger_pos_dplms", f_hit, idx=idx_ch + )[0].view_as("ak") + ) + ) + + else: + pe = ak.drop_none( + ak.nan_to_none( + store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[ + 0 + ].view_as("ak") + ) + ) + + # times are in sample units + times = ak.drop_none( + ak.nan_to_none( + store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ + 0 + ].view_as("ak") + ) + ) + + mask = get_spm_mask(lim, tge[idx_ch], tmin, tmax, pe, times) + pe = pe[mask] + + if mode in ["energy_hc", "energy_dplms"]: + out[idx_ch] = out[idx_ch] + ak.to_numpy(ak.nansum(pe, axis=-1)) + + else: + out[idx_ch] = out[idx_ch] + ak.to_numpy( + ak.where(ak.nansum(pe, axis=-1) > lim, 1, 0) + ) + + return Array(nda=out) + + +# get LAr energy per event over all channels +def get_energy( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, +) -> Array: + return get_spm_ene_or_maj( + f_hit, + f_tcm, + hit_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + "energy_hc", + ) + + +# get LAr majority per event over all channels +def get_majority( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, +) -> Array: + return get_spm_ene_or_maj( + f_hit, + f_tcm, + hit_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + "majority_hc", + ) + + +# get LAr energy per event over all channels +def get_energy_dplms( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, +) -> Array: + return get_spm_ene_or_maj( + f_hit, + f_tcm, + hit_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + "energy_dplms", + ) + + +# get LAr majority per event over all channels +def get_majority_dplms( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, +) -> Array: + return get_spm_ene_or_maj( + f_hit, + f_tcm, + hit_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + "majority_dplms", + ) + + +# Calculate the ETC in different trailing modes: +# trail = 0: Singlet window = [tge,tge+swin] +# trail = 1: Singlet window = [t_first_lar_pulse, t_first_lar_pulse+ swin] +# trail = 2: Like trail = 1, but t_first_lar_pulse <= tge is ensured +# min_first_pls_ene sets the minimum energy of the first pulse (only used in trail > 0) +# max_per_channel, maximum number of pes a channel is allowed to have, if above it gets excluded +def get_etc( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, + swin, + trail, + min_first_pls_ene, + max_per_channel, +) -> Array: + # load TCM data to define an event + store = LH5Store() + ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") + idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") + pe_lst = [] + time_lst = [] + + if isinstance(trgr, (float, int)): + tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) + else: + tge = 
cast_trigger(trgr, tdefault, length=None) + + for ch in chs: + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + + pe = store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[0].view_as( + "np" + ) + tmp = np.full((np.max(idx) + 1, len(pe[0])), np.nan) + tmp[idx_ch] = pe + pe = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) + + # times are in sample units + times = store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ + 0 + ].view_as("np") + tmp = np.full((np.max(idx) + 1, len(times[0])), np.nan) + tmp[idx_ch] = times + times = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) + + mask = get_spm_mask(lim, tge, tmin, tmax, pe, times) + + pe = pe[mask] + + # max pe mask + max_pe_mask = ak.nansum(pe, axis=-1) < max_per_channel + pe = ak.drop_none( + ak.nan_to_none(ak.where(max_pe_mask, pe, ak.Array([[np.nan]]))) + ) + pe_lst.append(pe) + + times = times[mask] * 16 + times = ak.drop_none( + ak.nan_to_none(ak.where(max_pe_mask, times, ak.Array([[np.nan]]))) + ) + time_lst.append(times) + + pe_all = ak.concatenate(pe_lst, axis=-1) + time_all = ak.concatenate(time_lst, axis=-1) + + if trail > 0: + t1d = ak.min(time_all[pe_all > min_first_pls_ene], axis=-1) + + if trail == 2: + t1d = ak.where(t1d > tge, tge, t1d) + + mask_total = time_all > t1d + mask_singlet = (time_all > t1d) & (time_all < t1d + swin) + + else: + mask_total = time_all > tge + mask_singlet = (time_all > tge) & (time_all < tge + swin) + + pe_singlet = ak.to_numpy( + ak.fill_none(ak.nansum(pe_all[mask_singlet], axis=-1), 0), allow_missing=False + ) + pe_total = ak.to_numpy( + ak.fill_none(ak.nansum(pe_all[mask_total], axis=-1), 0), allow_missing=False + ) + etc = np.divide( + pe_singlet, pe_total, out=np.full_like(pe_total, np.nan), where=pe_total != 0 + ) + + return Array(nda=etc) + + +# returns relative time shift of the first LAr pulse relative to the Ge trigger +def get_time_shift( + f_hit, + f_dsp, + f_tcm, + hit_group, + dsp_group, + tcm_group, + tcm_id_table_pattern, + chs, + lim, + trgr, + tdefault, + tmin, + tmax, +) -> Array: + store = LH5Store() + # load TCM data to define an event + ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("np") + idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("np") + time_all = ak.Array([[] for x in range(np.max(idx) + 1)]) + + if isinstance(trgr, (float, int)): + tge = cast_trigger(trgr, tdefault, length=np.max(idx) + 1) + else: + tge = cast_trigger(trgr, tdefault, length=None) + + for ch in chs: + idx_ch = idx[ids == utils.get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + + pe = store.read(f"{ch}/{hit_group}/energy_in_pe", f_hit, idx=idx_ch)[0].view_as( + "np" + ) + tmp = np.full((np.max(idx) + 1, len(pe[0])), np.nan) + tmp[idx_ch] = pe + pe = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) + + # times are in sample units + times = store.read(f"{ch}/{hit_group}/trigger_pos", f_hit, idx=idx_ch)[ + 0 + ].view_as("np") + tmp = np.full((np.max(idx) + 1, len(times[0])), np.nan) + tmp[idx_ch] = times + times = ak.drop_none(ak.nan_to_none(ak.Array(tmp))) + + mask = get_spm_mask(lim, tge, tmin, tmax, pe, times) + + # apply mask and convert sample units to ns + times = times[mask] * 16 + + time_all = ak.concatenate((time_all, times), axis=-1) + + out = ak.min(time_all, axis=-1) + + # Convert to 1D numpy array + out = ak.to_numpy(ak.fill_none(out, np.inf), allow_missing=False) + tge = ak.to_numpy(tge, allow_missing=False) + + return Array(out - tge) diff --git a/src/pygama/evt/utils.py b/src/pygama/evt/utils.py new file mode 100644 index 
000000000..175cd868a --- /dev/null +++ b/src/pygama/evt/utils.py @@ -0,0 +1,282 @@ +""" +This module provides utilities to build the `evt` tier. +""" + +from __future__ import annotations + +import re + +import awkward as ak +import numpy as np +from lgdo.lh5 import LH5Store +from numpy.typing import NDArray + + +def get_tcm_id_by_pattern(tcm_id_table_pattern: str, ch: str) -> int: + pre = tcm_id_table_pattern.split("{")[0] + post = tcm_id_table_pattern.split("}")[1] + return int(ch.strip(pre).strip(post)) + + +def get_table_name_by_pattern(tcm_id_table_pattern: str, ch_id: int) -> str: + # check tcm_id_table_pattern validity + pattern_check = re.findall(r"{([^}]*?)}", tcm_id_table_pattern)[0] + if pattern_check == "" or ":" == pattern_check[0]: + return tcm_id_table_pattern.format(ch_id) + else: + raise NotImplementedError( + "Only empty placeholders with format specifications are currently implemented" + ) + + +def num_and_pars(value: str, par_dic: dict): + # function tries to convert a string to a int, float, bool + # or returns the value if value is a key in par_dic + if value in par_dic.keys(): + return par_dic[value] + try: + value = int(value) + except ValueError: + try: + value = float(value) + except ValueError: + try: + value = bool(value) + except ValueError: + pass + return value + + +def find_parameters( + f_hit: str, + f_dsp: str, + ch: str, + idx_ch: NDArray, + exprl: list, + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> dict: + """Wraps :func:`load_vars_to_nda` to return parameters from `hit` and `dsp` + tiers. + + Parameters + ---------- + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + ch + "rawid" in the tiers. + idx_ch + index array of entries to be read from files. + exprl + list of tuples ``(tier, field)`` to be found in the `hit/dsp` tiers. + dsp_group + LH5 root group in dsp file. + hit_group + LH5 root group in hit file. + """ + + # find fields in either dsp, hit + dsp_flds = [e[1] for e in exprl if e[0] == dsp_group] + hit_flds = [e[1] for e in exprl if e[0] == hit_group] + + store = LH5Store() + hit_dict, dsp_dict = {}, {} + if len(hit_flds) > 0: + hit_ak = store.read( + f"{ch.replace('/','')}/{hit_group}/", f_hit, field_mask=hit_flds, idx=idx_ch + )[0].view_as("ak") + hit_dict = dict( + zip([f"{hit_group}_" + e for e in ak.fields(hit_ak)], ak.unzip(hit_ak)) + ) + if len(dsp_flds) > 0: + dsp_ak = store.read( + f"{ch.replace('/','')}/{dsp_group}/", f_dsp, field_mask=dsp_flds, idx=idx_ch + )[0].view_as("ak") + dsp_dict = dict( + zip([f"{dsp_group}_" + e for e in ak.fields(dsp_ak)], ak.unzip(dsp_ak)) + ) + + return hit_dict | dsp_dict + + +def get_data_at_channel( + ch: str, + ids: NDArray, + idx: NDArray, + expr: str, + exprl: list, + var_ph: dict, + is_evaluated: bool, + f_hit: str, + f_dsp: str, + defv, + tcm_id_table_pattern: str = "ch{}", + evt_group: str = "evt", + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> np.ndarray: + """Evaluates an expression and returns the result. + + Parameters + ---------- + ch + "rawid" of channel to be evaluated. + idx + `tcm` index array. + ids + `tcm` id array. + expr + expression to be evaluated. + exprl + list of parameter-tuples ``(root_group, field)`` found in the expression. + var_ph + dict of additional parameters that are not channel dependent. + is_evaluated + if false, the expression does not get evaluated but an array of default + values is returned. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + defv + default value. 
+ tcm_id_table_pattern + Pattern to format tcm id values to table name in higher tiers. Must have one + placeholder which is the tcm id. + dsp_group + LH5 root group in dsp file. + hit_group + LH5 root group in hit file. + evt_group + LH5 root group in evt file. + """ + + # get index list for this channel to be loaded + idx_ch = idx[ids == get_tcm_id_by_pattern(tcm_id_table_pattern, ch)] + outsize = len(idx_ch) + + if not is_evaluated: + res = np.full(outsize, defv, dtype=type(defv)) + elif "tcm.array_id" == expr: + res = np.full( + outsize, get_tcm_id_by_pattern(tcm_id_table_pattern, ch), dtype=int + ) + elif "tcm.index" == expr: + res = np.where(ids == get_tcm_id_by_pattern(tcm_id_table_pattern, ch))[0] + else: + var = find_parameters( + f_hit=f_hit, + f_dsp=f_dsp, + ch=ch, + idx_ch=idx_ch, + exprl=exprl, + hit_group=hit_group, + dsp_group=dsp_group, + ) + + if var_ph is not None: + var = var | var_ph + + # evaluate expression + # move tier+dots in expression to underscores (e.g. evt.foo -> evt_foo) + res = eval( + expr.replace(f"{dsp_group}.", f"{dsp_group}_") + .replace(f"{hit_group}.", f"{hit_group}_") + .replace(f"{evt_group}.", ""), + var, + ) + + # in case the expression evaluates to a single value blow it up + if (not hasattr(res, "__len__")) or (isinstance(res, str)): + return np.full(outsize, res) + + # the resulting arrays need to be 1D from the operation, + # this can only change once we support larger than two dimensional LGDOs + # ak.to_numpy() raises error if array not regular + res = ak.to_numpy(res, allow_missing=False) + + # in this method only 1D values are allowed + if res.ndim > 1: + raise ValueError( + f"expression '{expr}' must return 1D array. If you are using VectorOfVectors or ArrayOfEqualSizedArrays, use awkward reduction functions to reduce the dimension" + ) + + return res + + +def get_mask_from_query( + qry: str | NDArray, + length: int, + ch: str, + idx_ch: NDArray, + f_hit: str, + f_dsp: str, + hit_group: str = "hit", + dsp_group: str = "dsp", +) -> np.ndarray: + """Evaluates a query expression and returns a mask accordingly. + + Parameters + ---------- + qry + query expression. + length + length of the return mask. + ch + "rawid" of channel to be evaluated. + idx_ch + channel indices to be read. + f_hit + path to `hit` tier file. + f_dsp + path to `dsp` tier file. + hit_group + LH5 root group in hit file. + dsp_group + LH5 root group in dsp file. + """ + + # get sub evt based query condition if needed + if isinstance(qry, str): + qry_lst = re.findall(r"(hit|dsp).([a-zA-Z_$][\w$]*)", qry) + qry_var = find_parameters( + f_hit=f_hit, + f_dsp=f_dsp, + ch=ch, + idx_ch=idx_ch, + exprl=qry_lst, + hit_group=hit_group, + dsp_group=dsp_group, + ) + limarr = eval( + qry.replace(f"{dsp_group}.", f"{dsp_group}_").replace( + f"{hit_group}.", f"{hit_group}_" + ), + qry_var, + ) + + # in case the expression evaluates to a single value blow it up + if (not hasattr(limarr, "__len__")) or (isinstance(limarr, str)): + return np.full(len(idx_ch), limarr) + + limarr = ak.to_numpy(limarr, allow_missing=False) + if limarr.ndim > 1: + raise ValueError( + f"query '{qry}' must return 1D array. 
If you are using VectorOfVectors or ArrayOfEqualSizedArrays, use awkward reduction functions to reduce the dimension" + ) + + # or forward the array + elif isinstance(qry, np.ndarray): + limarr = qry + + # if no condition, it must be true + else: + limarr = np.ones(length).astype(bool) + + # explicit cast to bool + if limarr.dtype != bool: + limarr = limarr.astype(bool) + + return limarr diff --git a/src/pygama/flow/data_loader.py b/src/pygama/flow/data_loader.py index ea6d8ebe3..db0ac5c7e 100644 --- a/src/pygama/flow/data_loader.py +++ b/src/pygama/flow/data_loader.py @@ -14,7 +14,9 @@ import numpy as np import pandas as pd from dspeed.vis import WaveformBrowser -from lgdo import Array, LH5Iterator, LH5Store, Struct, Table, lgdo_utils +from lgdo.lh5 import LH5Iterator, LH5Store +from lgdo.lh5.utils import expand_vars +from lgdo.types import Array, Struct, Table from lgdo.types.vectorofvectors import build_cl, explode_arrays, explode_cl from tqdm.auto import tqdm @@ -193,9 +195,7 @@ def set_config(self, config: dict | str) -> None: # look for info in configuration if FileDB is not set if self.filedb is None: # expand $_ variables - value = lgdo_utils.expand_vars( - config["filedb"], substitute={"_": config_dir} - ) + value = expand_vars(config["filedb"], substitute={"_": config_dir}) self.filedb = FileDB(value) if not os.path.isdir(self.filedb.data_dir): @@ -584,17 +584,17 @@ def build_entry_list( tcm_table_name = self.filedb.get_table_name(tcm_tier, tcm_tb) try: - tcm_lgdo, _ = sto.read_object(tcm_table_name, tcm_path) + tcm_lgdo, _ = sto.read(tcm_table_name, tcm_path) except KeyError: log.warning(f"Cannot find table {tcm_table_name} in file {tcm_path}") continue - # Have to do some hacky stuff until I get a get_dataframe() method + # Have to do some hacky stuff until I get a view_as("pd") method tcm_lgdo[self.tcms[tcm_level]["tcm_cols"]["child_idx"]] = Array( nda=explode_cl(tcm_lgdo["cumulative_length"].nda) ) tcm_lgdo.pop("cumulative_length") tcm_tb = Table(col_dict=tcm_lgdo) - f_entries = tcm_tb.get_dataframe() + f_entries = tcm_tb.view_as("pd") renaming = { self.tcms[tcm_level]["tcm_cols"]["child_idx"]: f"{child}_idx", self.tcms[tcm_level]["tcm_cols"]["parent_tb"]: f"{parent}_table", @@ -649,7 +649,7 @@ def build_entry_list( if tb in col_tiers[file]["tables"][tier]: table_name = self.filedb.get_table_name(tier, tb) try: - tier_table, _ = sto.read_object( + tier_table, _ = sto.read( table_name, tier_path, field_mask=cut_cols[level], @@ -666,7 +666,7 @@ def build_entry_list( tb_table.join(tier_table) if tb_table is None: continue - tb_df = tb_table.get_dataframe() + tb_df = tb_table.view_as("pd") tb_df.query(cut, inplace=True) idx_match = f_entries.query(f"{level}_idx in {list(tb_df.index)}") if level == parent: @@ -708,11 +708,9 @@ def build_entry_list( f_dict = f_entries.to_dict("list") f_struct = Struct(f_dict) if self.merge_files: - sto.write_object(f_struct, "entries", output_file, wo_mode="a") + sto.write(f_struct, "entries", output_file, wo_mode="a") else: - sto.write_object( - f_struct, f"entries/{file}", output_file, wo_mode="a" - ) + sto.write(f_struct, f"entries/{file}", output_file, wo_mode="a") if log.getEffectiveLevel() >= logging.INFO: progress_bar.close() @@ -862,7 +860,7 @@ def build_hit_entries( # load the data from the tier file, just the columns needed for the cut table_name = self.filedb.get_table_name(tier, tb) try: - tier_tb, _ = sto.read_object( + tier_tb, _ = sto.read( table_name, tier_path, field_mask=cut_cols ) except KeyError: @@ -870,7 +868,7 @@ def 
build_hit_entries( f"Cannot find {table_name} in file {tier_path}" ) continue - # join eveything in one table + # join everything in one table if tb_table is None: tb_table = tier_tb else: @@ -880,7 +878,7 @@ def build_hit_entries( continue # convert to DataFrame and apply cuts - tb_df = tb_table.get_dataframe() + tb_df = tb_table.view_as("pd") tb_df.query(cut, inplace=True) tb_df[f"{low_level}_table"] = tb tb_df[f"{low_level}_idx"] = tb_df.index @@ -902,11 +900,9 @@ def build_hit_entries( f_dict = f_entries.to_dict("list") f_struct = Struct(f_dict) if self.merge_files: - sto.write_object(f_struct, "entries", output_file, wo_mode="a") + sto.write(f_struct, "entries", output_file, wo_mode="a") else: - sto.write_object( - f_struct, f"entries/{file}", output_file, wo_mode="a" - ) + sto.write(f_struct, f"entries/{file}", output_file, wo_mode="a") if log.getEffectiveLevel() >= logging.INFO: progress_bar.close() @@ -1120,7 +1116,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table): for file in files ] - tier_table, _ = sto.read_object( + tier_table, _ = sto.read( name=tb_name, lh5_file=tier_paths, idx=idx_mask, @@ -1146,12 +1142,12 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table): f_table = utils.dict_to_table(col_dict=col_dict, attr_dict=attr_dict) if output_file: - sto.write_object(f_table, "merged_data", output_file, wo_mode="o") + sto.write(f_table, "merged_data", output_file, wo_mode="o") if in_memory: if self.output_format == "lgdo.Table": return f_table elif self.output_format == "pd.DataFrame": - return f_table.get_dataframe() + return f_table.view_as("pd") else: raise ValueError( f"'{self.output_format}' output format not supported" @@ -1224,7 +1220,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table): raise FileNotFoundError(tier_path) table_name = self.filedb.get_table_name(tier, tb) - tier_table, _ = sto.read_object( + tier_table, _ = sto.read( table_name, tier_path, idx=idx_mask, @@ -1250,7 +1246,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table): if in_memory: load_out.add_field(name=file, obj=f_table) if output_file: - sto.write_object(f_table, f"{file}", output_file, wo_mode="o") + sto.write(f_table, f"{file}", output_file, wo_mode="o") # end file loop if log.getEffectiveLevel() >= logging.INFO: @@ -1263,7 +1259,7 @@ def explode_evt_cols(el: pd.DataFrame, tier_table: Table): return load_out elif self.output_format == "pd.DataFrame": for file in load_out.keys(): - load_out[file] = load_out[file].get_dataframe() + load_out[file] = load_out[file].view_as("pd") return load_out else: raise ValueError( @@ -1322,7 +1318,7 @@ def load_evts( ) if os.path.exists(tier_path): table_name = self.filedb.get_table_name(tier, tb) - tier_table, _ = sto.read_object( + tier_table, _ = sto.read( table_name, tier_path, idx=idx_mask, @@ -1336,7 +1332,7 @@ def load_evts( if in_memory: load_out[file] = f_table if output_file: - sto.write_object(f_table, f"file{file}", output_file, wo_mode="o") + sto.write(f_table, f"file{file}", output_file, wo_mode="o") # end file loop if in_memory: @@ -1344,7 +1340,7 @@ def load_evts( return load_out elif self.output_format == "pd.DataFrame": for file in load_out.keys(): - load_out[file] = load_out[file].get_dataframe() + load_out[file] = load_out[file].view_as("pd") return load_out else: raise ValueError( diff --git a/src/pygama/flow/file_db.py b/src/pygama/flow/file_db.py index 2639b3122..2f74ad0b3 100644 --- a/src/pygama/flow/file_db.py +++ b/src/pygama/flow/file_db.py @@ -9,11 +9,12 @@ import warnings import h5py -import 
lgdo import numpy as np import pandas as pd -from lgdo import Array, Scalar, VectorOfVectors -from lgdo import lh5_store as lh5 +from lgdo.lh5 import ls +from lgdo.lh5.store import LH5Store +from lgdo.lh5.utils import expand_path, expand_vars +from lgdo.types import Array, Scalar, VectorOfVectors from parse import parse from . import utils @@ -186,14 +187,12 @@ def set_config(self, config: dict, config_path: str = None) -> None: if config_path is not None: subst_vars["_"] = os.path.dirname(str(config_path)) - data_dir = lgdo.lgdo_utils.expand_path( - self.config["data_dir"], substitute=subst_vars - ) + data_dir = expand_path(self.config["data_dir"], substitute=subst_vars) self.data_dir = data_dir tier_dirs = self.config["tier_dirs"] for k, val in tier_dirs.items(): - tier_dirs[k] = lgdo.lgdo_utils.expand_vars(val, substitute=subst_vars) + tier_dirs[k] = expand_vars(val, substitute=subst_vars) self.tier_dirs = tier_dirs def scan_files(self, dirs: list[str] = None) -> None: @@ -277,8 +276,10 @@ def scan_files(self, dirs: list[str] = None) -> None: # convert cols to numeric dtypes where possible for col in self.df.columns: - self.df[col] = pd.to_numeric(self.df[col], errors="ignore") - + try: + self.df[col] = pd.to_numeric(self.df[col]) + except ValueError: + continue # Remove duplicates in the 'sortby' column self.df.drop_duplicates(subset=[self.sortby], inplace=True, ignore_index=True) @@ -412,7 +413,7 @@ def update_tables_cols(row, tier: str, utc_cache: dict = None) -> pd.Series: ) # TODO this call here is really expensive! - groups = lh5.ls(f, wildcard) + groups = ls(f, wildcard) if len(groups) > 0 and parse(template, groups[0]) is None: log.warning(f"groups in {fpath} don't match template") else: @@ -436,7 +437,7 @@ def update_tables_cols(row, tier: str, utc_cache: dict = None) -> pd.Series: table_name = template try: - col = lh5.ls(f[table_name]) + col = ls(f[table_name]) except KeyError: log.warning(f"cannot find '{table_name}' in {fpath}") continue @@ -485,8 +486,8 @@ def update_tables_cols(row, tier: str, utc_cache: dict = None) -> pd.Series: columns_vov = VectorOfVectors( flattened_data=flattened, cumulative_length=length ) - sto = lh5.LH5Store() - sto.write_object(columns_vov, "unique_columns", to_file) + sto = LH5Store() + sto.write(columns_vov, "unique_columns", to_file) return self.columns @@ -509,12 +510,12 @@ def from_disk(self, path: str | list[str]) -> None: # expand wildcards paths = [] for p in path: - paths += lgdo.lgdo_utils.expand_path(p, list=True) + paths += expand_path(p, list=True) if not paths: raise FileNotFoundError(path) - sto = lh5.LH5Store() + sto = LH5Store() # objects/accumulators that will be used to configure the FileDB at the end _cfg = None _df = None @@ -536,7 +537,7 @@ def _replace_idx(row, trans, tier): # loop over the files for p in paths: - cfg, _ = sto.read_object("config", p) + cfg, _ = sto.read("config", p) cfg = json.loads(cfg.value.decode()) # make sure configurations are all the same @@ -548,7 +549,7 @@ def _replace_idx(row, trans, tier): ) # read in unique columns - vov, _ = sto.read_object("columns", p) + vov, _ = sto.read("columns", p) # Convert back from VoV of UTF-8 bytestrings to a list of lists of strings columns = [[v.decode("utf-8") for v in ov] for ov in list(vov)] @@ -607,14 +608,12 @@ def to_disk(self, filename: str, wo_mode="write_safe") -> None: filename output LH5 file name. wo_mode - passed to :meth:`~.lgdo.lh5_store.write_object`. + passed to :meth:`~.lgdo.lh5.write`. 
""" log.debug(f"writing database to {filename}") - sto = lh5.LH5Store() - sto.write_object( - Scalar(json.dumps(self.config)), "config", filename, wo_mode=wo_mode - ) + sto = LH5Store() + sto.write(Scalar(json.dumps(self.config)), "config", filename, wo_mode=wo_mode) if wo_mode in ["write_safe", "w", "overwrite_file", "of"]: wo_mode = "a" @@ -631,7 +630,7 @@ def to_disk(self, filename: str, wo_mode="write_safe") -> None: flattened_data=Array(nda=np.array(flat).astype("S")), cumulative_length=Array(nda=np.array(cum_l)), ) - sto.write_object(col_vov, "columns", filename, wo_mode=wo_mode) + sto.write(col_vov, "columns", filename, wo_mode=wo_mode) # FIXME: to_hdf() throws this: # @@ -681,7 +680,10 @@ def scan_daq_files(self, daq_dir: str, daq_template: str) -> None: # convert cols to numeric dtypes where possible for col in self.df.columns: - self.df[col] = pd.to_numeric(self.df[col], errors="ignore") + try: + self.df[col] = pd.to_numeric(self.df[col]) + except ValueError: + continue def get_table_name(self, tier: str, tb: str) -> str: """Get the table name for a tier given its table identifier. diff --git a/src/pygama/hit/build_hit.py b/src/pygama/hit/build_hit.py index 5cc34202f..2a6d6a066 100644 --- a/src/pygama/hit/build_hit.py +++ b/src/pygama/hit/build_hit.py @@ -7,9 +7,11 @@ import logging import os from collections import OrderedDict +from typing import Iterable, Mapping +import lgdo import numpy as np -from lgdo import Array, LH5Iterator, LH5Store, ls +from lgdo.lh5 import LH5Iterator, LH5Store, ls log = logging.getLogger(__name__) @@ -17,18 +19,20 @@ def build_hit( infile: str, outfile: str = None, - hit_config: str | dict = None, - lh5_tables: list[str] = None, - lh5_tables_config: str | dict[str] = None, + hit_config: str | Mapping = None, + lh5_tables: Iterable[str] = None, + lh5_tables_config: str | Mapping[str, Mapping] = None, n_max: int = np.inf, wo_mode: str = "write_safe", buffer_len: int = 3200, ) -> None: """ - Transform a :class:`~.lgdo.Table` into a new :class:`~.lgdo.Table` by - evaluating strings describing column operations. + Transform a :class:`~lgdo.types.table.Table` into a new + :class:`~lgdo.types.table.Table` by evaluating strings describing column + operations. - Operates on columns only, not specific rows or elements. + Operates on columns only, not specific rows or elements. Relies on + :meth:`~lgdo.types.table.Table.eval`. Parameters ---------- @@ -44,14 +48,14 @@ def build_hit( .. code-block:: json { - "outputs": ["calE", "AoE"], - "operations": { - "calE": { - "expression": "sqrt(@a + @b * trapEmax**2)", - "parameters": {"a": "1.23", "b": "42.69"}, - }, - "AoE": {"expression": "A_max/calE"}, - } + "outputs": ["calE", "AoE"], + "operations": { + "calE": { + "expression": "sqrt(a + b * trapEmax**2)", + "parameters": {"a": "1.23", "b": "42.69"}, + }, + "AoE": {"expression": "A_max/calE"}, + } } The ``outputs`` array lists columns that will be effectively written in @@ -69,7 +73,11 @@ def build_hit( n_max maximum number of rows to process wo_mode - forwarded to :meth:`~.lgdo.lh5_store.write_object`. + forwarded to :meth:`lgdo.lh5.store.LH5Store.write`. 
+ + See Also + -------- + lgdo.types.table.Table.eval """ store = LH5Store() @@ -93,16 +101,14 @@ def build_hit( for k, v in tbl_cfg.items(): if isinstance(v, str): with open(v) as f: - # order in hit configs is important (dependencies) - tbl_cfg[k] = json.load(f, object_pairs_hook=OrderedDict) + tbl_cfg[k] = json.load(f) lh5_tables_config = tbl_cfg else: if isinstance(hit_config, str): # sanitize config with open(hit_config) as f: - # order in hit configs is important (dependencies) - hit_config = json.load(f, object_pairs_hook=OrderedDict) + hit_config = json.load(f) if lh5_tables is None: lh5_tables_config = {} @@ -113,11 +119,19 @@ def build_hit( if f"{el}/dsp" in ls(infile, f"{el}/"): log.debug(f"found candidate table /{el}/dsp") lh5_tables_config[f"{el}/dsp"] = hit_config + else: + for tbl in lh5_tables: + lh5_tables_config[tbl] = hit_config if outfile is None: outfile = os.path.splitext(os.path.basename(infile))[0] outfile = outfile.removesuffix("_dsp") + "_hit.lh5" + # reorder blocks in "operations" based on dependency + log.debug("reordering operations based on mutual dependency") + for cfg in lh5_tables_config.values(): + cfg["operations"] = _reorder_table_operations(cfg["operations"]) + first_done = False for tbl, cfg in lh5_tables_config.items(): lh5_it = LH5Iterator(infile, tbl, buffer_len=buffer_len) @@ -129,7 +143,18 @@ def build_hit( for tbl_obj, start_row, n_rows in lh5_it: n_rows = min(tot_n_rows - start_row, n_rows) - outtbl_obj = tbl_obj.eval(cfg["operations"]) + # create a new table object that links all the columns in the + # current table (i.e. no copy) + outtbl_obj = lgdo.Table(col_dict=tbl_obj) + + for outname, info in cfg["operations"].items(): + outcol = outtbl_obj.eval( + info["expression"], info.get("parameters", None) + ) + if "lgdo_attrs" in info: + outcol.attrs |= info["lgdo_attrs"] + + outtbl_obj.add_column(outname, outcol) # make high level flags if "aggregations" in cfg: @@ -145,13 +170,13 @@ def build_hit( else: flag_dtype = np.uint64 - df_flags = outtbl_obj.get_dataframe(flags_list) + df_flags = outtbl_obj.view_as("pd", cols=flags_list) flag_values = df_flags.values.astype(flag_dtype) multiplier = 2 ** np.arange(n_flags, dtype=flag_values.dtype) flag_out = np.dot(flag_values, multiplier) - outtbl_obj.add_field(high_lvl_flag, Array(flag_out)) + outtbl_obj.add_field(high_lvl_flag, lgdo.Array(flag_out)) # remove or add columns according to "outputs" in the configuration # dictionary @@ -159,7 +184,7 @@ def build_hit( if isinstance(cfg["outputs"], list): # add missing columns (forwarding) for out in cfg["outputs"]: - if out not in outtbl_obj.keys(): + if out not in outtbl_obj: outtbl_obj.add_column(out, tbl_obj[out]) # remove non-required columns @@ -168,7 +193,7 @@ def build_hit( if col not in cfg["outputs"]: outtbl_obj.remove_column(col, delete=True) - store.write_object( + store.write( obj=outtbl_obj, name=tbl.replace("/dsp", "/hit"), lh5_file=outfile, @@ -178,3 +203,59 @@ def build_hit( ) first_done = True + + +def _reorder_table_operations( + config: Mapping[str, Mapping] +) -> OrderedDict[str, Mapping]: + """Reorder operations in `config` according to mutual dependency.""" + + def _one_pass(config): + """Loop once over `config` and do a first round of reordering""" + # list to hold reordered config keys + ordered_keys = [] + + # start looping over config + for outname in config: + # initialization + if not ordered_keys: + ordered_keys.append(outname) + continue + + if outname in ordered_keys: + raise RuntimeError(f"duplicated operation '{outname}' 
detected") + + # loop over existing reordered keys and figure out where to place + # the new key + idx = 0 + for k in ordered_keys: + # get valid names in the expression + c = compile( + config[k]["expression"], "gcc -O3 -ffast-math build_hit.py", "eval" + ) + + # if we need "outname" for this expression, insert it before! + if outname in c.co_names: + break + else: + idx += 1 + + ordered_keys.insert(idx, outname) + + # now replay the config dictionary based on sorted keys + opdict = OrderedDict() + for k in ordered_keys: + opdict[k] = config[k] + + return opdict + + # okay, now we need to repeat this until we've sorted everything + current = OrderedDict(config) + + while True: + new = _one_pass(current) + + if new == current: + return new + else: + current = new diff --git a/src/pygama/math/histogram.py b/src/pygama/math/histogram.py index 5391c9b23..32774ac0d 100644 --- a/src/pygama/math/histogram.py +++ b/src/pygama/math/histogram.py @@ -17,9 +17,12 @@ import matplotlib.pyplot as plt import numpy as np from matplotlib import rcParams +import logging import pygama.math.utils as pgu +log = logging.getLogger(__name__) + def get_hist(data, bins=None, range=None, dx=None, wts=None): """return hist, bins, var after binning data @@ -318,7 +321,7 @@ def get_fwfm(fraction, hist, bins, var=None, mx=None, dmx=0, bl=0, dbl=0, method # interpolate between the two bins that cross the [fraction] line # works well for high stats if bin_lo < 1 or bin_hi >= len(hist)-1: - print(f"get_fwhm: can't interpolate ({bin_lo}, {bin_hi})") + log.debug(f"get_fwhm: can't interpolate ({bin_lo}, {bin_hi})") return 0, 0 val_f = bl + fraction*(mx-bl) @@ -361,7 +364,7 @@ def get_fwfm(fraction, hist, bins, var=None, mx=None, dmx=0, bl=0, dbl=0, method # x_lo i_0 = bin_lo - int(np.floor(n_slope/2)) if i_0 < 0: - print(f"get_fwfm: fit slopes failed") + log.debug(f"get_fwfm: fit slopes failed") return 0, 0 i_n = i_0 + n_slope wts = None if var is None else 1/np.sqrt(var[i_0:i_n]) #fails for any var = 0 @@ -370,7 +373,7 @@ def get_fwfm(fraction, hist, bins, var=None, mx=None, dmx=0, bl=0, dbl=0, method try: (m, b), cov = np.polyfit(bin_centers[i_0:i_n], hist[i_0:i_n], 1, w=wts, cov='unscaled') except np.linalg.LinAlgError: - print(f"get_fwfm: LinAlgError") + log.debug(f"get_fwfm: LinAlgError") return 0, 0 x_lo = (val_f-b)/m #uncertainty @@ -380,7 +383,7 @@ def get_fwfm(fraction, hist, bins, var=None, mx=None, dmx=0, bl=0, dbl=0, method # x_hi i_0 = bin_hi - int(np.floor(n_slope/2)) + 1 if i_0 == len(hist): - print(f"get_fwfm: fit slopes failed") + log.debug(f"get_fwfm: fit slopes failed") return 0, 0 i_n = i_0 + n_slope @@ -389,11 +392,11 @@ def get_fwfm(fraction, hist, bins, var=None, mx=None, dmx=0, bl=0, dbl=0, method try: (m, b), cov = np.polyfit(bin_centers[i_0:i_n], hist[i_0:i_n], 1, w=wts, cov='unscaled') except np.linalg.LinAlgError: - print(f"get_fwfm: LinAlgError") + log.debug(f"get_fwfm: LinAlgError") return 0, 0 x_hi = (val_f-b)/m if x_hi < x_lo: - print(f"get_fwfm: fit slopes produced negative fwfm") + log.debug(f"get_fwfm: fit slopes produced negative fwfm") return 0, 0 #uncertainty @@ -403,7 +406,7 @@ def get_fwfm(fraction, hist, bins, var=None, mx=None, dmx=0, bl=0, dbl=0, method return x_hi - x_lo, np.sqrt(dxl2 + dxh2) else: - print(f"get_fwhm: unrecognized method {method}") + log.debug(f"get_fwhm: unrecognized method {method}") return 0, 0 diff --git a/src/pygama/math/peak_fitting.py b/src/pygama/math/peak_fitting.py index 0a2ad98c4..958a12925 100644 --- a/src/pygama/math/peak_fitting.py +++ 
b/src/pygama/math/peak_fitting.py @@ -6,9 +6,12 @@ from iminuit import Minuit, cost from scipy.optimize import brentq, minimize_scalar from scipy.stats import crystalball +import logging import pygama.math.histogram as pgh +log = logging.getLogger(__name__) + limit = np.log(sys.float_info.max)/10 kwd = {"parallel": False, "fastmath": True} diff --git a/src/pygama/pargen/AoE_cal.py b/src/pygama/pargen/AoE_cal.py index bbe0ee171..227aec4e2 100644 --- a/src/pygama/pargen/AoE_cal.py +++ b/src/pygama/pargen/AoE_cal.py @@ -15,7 +15,7 @@ import matplotlib as mpl mpl.use("agg") -import lgdo.lh5_store as lh5 +import lgdo.lh5 as lh5 import matplotlib.cm as cmx import matplotlib.colors as mcolors import matplotlib.dates as mdates diff --git a/src/pygama/pargen/cuts.py b/src/pygama/pargen/cuts.py index e6c6b571e..638199f64 100644 --- a/src/pygama/pargen/cuts.py +++ b/src/pygama/pargen/cuts.py @@ -9,9 +9,10 @@ import logging import os -import lgdo.lh5_store as lh5 +import lgdo.lh5 as lh5 import numpy as np import pandas as pd +from lgdo.types import Table from scipy import stats import pygama.math.histogram as pgh @@ -51,7 +52,7 @@ def generate_cuts( output_dict = {} if isinstance(data, pd.DataFrame): pass - elif isinstance(data, lh5.Table): + elif isinstance(data, Table): data = {entry: data[entry].nda for entry in get_keys(data, parameters)} data = pd.DataFrame.from_dict(data) elif isinstance(data, dict): @@ -204,7 +205,7 @@ def get_cut_indexes( keys = cut_dict.keys() if isinstance(all_data, pd.DataFrame): pass - elif isinstance(all_data, lh5.Table): + elif isinstance(all_data, Table): cut_keys = list(cut_dict) cut_keys.append(energy_param) all_data = { diff --git a/src/pygama/pargen/dplms_ge_dict.py b/src/pygama/pargen/dplms_ge_dict.py new file mode 100644 index 000000000..6a155d239 --- /dev/null +++ b/src/pygama/pargen/dplms_ge_dict.py @@ -0,0 +1,564 @@ +""" +This module is for creating dplms dictionary for ge processing +""" + +from __future__ import annotations + +import itertools +import json +import logging +import os +import time + +import matplotlib.pyplot as plt +import numpy as np +from lgdo import Array, Table, lh5 +from scipy.signal import convolve, convolve2d + +from pygama.math.histogram import get_hist +from pygama.math.peak_fitting import ( + extended_gauss_step_pdf, + extended_radford_pdf, + gauss_step_pdf, + radford_pdf, +) +from pygama.pargen.cuts import generate_cuts, get_cut_indexes +from pygama.pargen.dsp_optimize import run_one_dsp +from pygama.pargen.energy_optimisation import fom_FWHM_with_dt_corr_fit + +log = logging.getLogger(__name__) +sto = lh5.LH5Store() + + +def dplms_ge_dict( + lh5_path: str, + raw_fft: Table, + raw_cal: Table, + dsp_config: dict, + par_dsp: dict, + par_dsp_lh5: str, + dplms_dict: dict, + decay_const: float = 0, + ene_par: str = "dplmsEmax", + display: int = 0, +) -> dict: + """ + This function calculates the dplms dictionary for HPGe detectors. 
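+
+    In short (a summary of the implementation below): baseline waveforms from
+    `raw_fft` are selected with quality cuts and used to estimate the noise
+    matrix; calibration signals from `raw_cal` provide the reference signal
+    and the pile-up and flat-top penalty matrices; filter coefficients are
+    then synthesized for each point of a grid of penalty coefficients, and
+    the combination minimizing the fitted FWHM is kept.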
+
+    Parameters
+    ----------
+    lh5_path
+        name of the channel to process; should be the name of the LH5 group
+        in the raw files
+    raw_fft
+        table with FFT (baseline) data
+    raw_cal
+        table with calibration data
+    dsp_config
+        DSP configuration file
+    par_dsp
+        dictionary with database parameters for DSP processing
+    par_dsp_lh5
+        path for saving the DPLMS coefficients
+    dplms_dict
+        dictionary with the various DPLMS parameters
+    decay_const
+        decay constant of the exponential signal tail, used to build the
+        pile-up matrix
+    ene_par
+        name of the output DPLMS energy parameter
+    display
+        if greater than zero, a dictionary of diagnostic plots is also
+        returned
+
+    Returns
+    -------
+    out_dict
+        dictionary with the DPLMS filter coefficients and the charge trapping
+        correction parameters
+    """
+
+    t0 = time.time()
+    log.info("\nSelecting baselines")
+
+    dsp_fft = run_one_dsp(raw_fft, dsp_config, db_dict=par_dsp[lh5_path])
+    cut_dict = generate_cuts(dsp_fft, parameters=dplms_dict["bls_cut_pars"])
+    idxs = get_cut_indexes(dsp_fft, cut_dict)
+    bl_field = dplms_dict["bl_field"]
+    log.info(f"... {len(dsp_fft[bl_field].values.nda[idxs,:])} baselines after cuts")
+
+    bls = dsp_fft[bl_field].values.nda[idxs, : dplms_dict["bsize"]]
+    bls_par = {}
+    bls_cut_pars = [par for par in dplms_dict["bls_cut_pars"].keys()]
+    for par in bls_cut_pars:
+        bls_par[par] = dsp_fft[par].nda
+    t1 = time.time()
+    log.info(
+        f"total events {len(raw_fft)}, {len(bls)} baselines selected in {(t1-t0):.2f} s"
+    )
+
+    log.info(
+        f"\nCalculating noise matrix of length {dplms_dict['length']}, "
+        f"n. events {bls.shape[0]}, size {bls.shape[1]}"
+    )
+    nmat = noise_matrix(bls, dplms_dict["length"])
+    t2 = time.time()
+    log.info(f"Time to calculate noise matrix {(t2-t1):.2f} s")
+
+    log.info("\nSelecting signals")
+    wsize = dplms_dict["wsize"]
+    wf_field = dplms_dict["wf_field"]
+    peaks_keV = np.array(dplms_dict["peaks_keV"])
+    kev_widths = [tuple(kev_width) for kev_width in dplms_dict["kev_widths"]]

+    log.info(f"Producing DSP data for {len(raw_cal)} events")
+    dsp_cal = run_one_dsp(raw_cal, dsp_config, db_dict=par_dsp[lh5_path])
+    t3 = time.time()
+    log.info(f"Time to run dsp production {(t3-t2):.2f} s")
+
+    dsp_config["outputs"] = [ene_par, "dt_eff"]
+
+    # dictionary for peak fitting
+    peak_dict = {
+        "peak": peaks_keV[-1],
+        "kev_width": kev_widths[-1],
+        "parameter": ene_par,
+        "func": extended_gauss_step_pdf,
+        "gof_func": gauss_step_pdf,
+    }
+
+    if display > 0:
+        plot_dict = {}
+        plot_dict["dplms"] = {}
+
+    # grid of penalty coefficients
+    dp_coeffs = dplms_dict["dp_coeffs"]
+    za_coeff = dplms_dict["dp_def"]["za"]
+    dp_coeffs.pop("za")
+    coeff_keys = [key for key in dp_coeffs.keys()]
+    lists = [dp_coeffs[key] for key in dp_coeffs.keys()]
+
+    prod = list(itertools.product(*lists))
+    grid_dict = {}
+    min_fom = float("inf")
+    min_idx = None
+
+    for i, values in enumerate(prod):
+        coeff_values = dict(zip(coeff_keys, values))
+
+        log.info(
+            f"\nCase {i} -> "
+            + ", ".join(f"{key} = {value}" for key, value in coeff_values.items())
+        )
+        grid_dict[i] = coeff_values
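+
+        # for each grid point: a DPLMS filter is synthesized from the noise
+        # and signal matrices, the DSP is re-run with it and the result is
+        # scored by the fitted FWHM; the best-scoring coefficients are kept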
+        sel_dict = signal_selection(dsp_cal, dplms_dict, coeff_values)
+        wfs = dsp_cal[wf_field].nda[sel_dict["idxs"], :]
+        log.info(f"... {len(wfs)} signals after signal selection")
+
+        ref, rmat, pmat, fmat = signal_matrices(wfs, dplms_dict["length"], decay_const)
+
+        t_tmp = time.time()
+        nm_coeff = coeff_values["nm"]
+        ft_coeff = coeff_values["ft"]
+        x, y, refy = filter_synthesis(
+            ref,
+            nm_coeff * nmat,
+            rmat,
+            za_coeff,
+            pmat,
+            ft_coeff * fmat,
+            dplms_dict["length"],
+            wsize,
+        )
+        par_dsp[lh5_path]["dplms"] = {"length": dplms_dict["length"], "coefficients": x}
+        log.info(
+            f"Filter synthesis in {time.time()-t_tmp:.1f} s, filter area {np.sum(x):.3f}"
+        )
+
+        t_tmp = time.time()
+        dsp_opt = run_one_dsp(raw_cal, dsp_config, db_dict=par_dsp[lh5_path])
+
+        try:
+            res = fom_FWHM_with_dt_corr_fit(
+                dsp_opt,
+                peak_dict,
+                "QDrift",
+                idxs=np.where(~np.isnan(dsp_opt["dt_eff"].nda))[0],
+            )
+        except Exception:
+            log.debug("FWHM not calculated")
+            continue
+
+        fwhm, fwhm_err, alpha, chisquare = (
+            res["fwhm"],
+            res["fwhm_err"],
+            res["alpha"],
+            res["chisquare"],
+        )
+        log.info(
+            f"FWHM = {fwhm:.2f} ± {fwhm_err:.2f} keV, evaluated in {time.time()-t_tmp:.1f} s"
+        )
+
+        grid_dict[i]["fwhm"] = fwhm
+        grid_dict[i]["fwhm_err"] = fwhm_err
+        grid_dict[i]["alpha"] = alpha
+
+        if (
+            fwhm < dplms_dict["fwhm_limit"]
+            and fwhm_err < dplms_dict["err_limit"]
+            and chisquare < dplms_dict["chi_limit"]
+        ):
+            if fwhm < min_fom:
+                min_idx, min_fom = i, fwhm
+
+    if min_idx is not None:
+        min_result = grid_dict[min_idx]
+        best_case_values = {key: min_result[key] for key in min_result.keys()}
+
+        fwhm = best_case_values.get("fwhm", None)
+        fwhm_err = best_case_values.get("fwhm_err", 0)
+        alpha = best_case_values.get("alpha", 0)
+        nm_coeff = best_case_values.get("nm", dplms_dict["dp_def"]["nm"])
+        ft_coeff = best_case_values.get("ft", dplms_dict["dp_def"]["ft"])
+        rt_coeff = best_case_values.get("rt", dplms_dict["dp_def"]["rt"])
+        pt_coeff = best_case_values.get("pt", dplms_dict["dp_def"]["pt"])
+
+        if all(
+            v is not None
+            for v in [
+                fwhm,
+                fwhm_err,
+                alpha,
+                nm_coeff,
+                ft_coeff,
+                rt_coeff,
+                pt_coeff,
+            ]
+        ):
+            log.info(
+                f"\nBest case: FWHM = {fwhm:.2f} ± {fwhm_err:.2f} keV, ctc {alpha}"
+            )
+        else:
+            log.error("Some values are missing in the best case results")
+    else:
+        log.error("Filter synthesis failed")
+        # fall back to the default penalty coefficients so that the final
+        # synthesis below can still run
+        best_case_values = {}
+        alpha = 0
+        nm_coeff = dplms_dict["dp_def"]["nm"]
+        ft_coeff = dplms_dict["dp_def"]["ft"]
+        rt_coeff = dplms_dict["dp_def"]["rt"]
+        pt_coeff = dplms_dict["dp_def"]["pt"]
+
+    # filter synthesis
+    sel_dict = signal_selection(dsp_cal, dplms_dict, best_case_values)
+    idxs = sel_dict["idxs"]
+    wfs = dsp_cal[wf_field].nda[idxs, :]
+    ref, rmat, pmat, fmat = signal_matrices(wfs, dplms_dict["length"], decay_const)
+
+    x, y, refy = filter_synthesis(
+        ref,
+        nm_coeff * nmat,
+        rmat,
+        za_coeff,
+        pmat,
+        ft_coeff * fmat,
+        dplms_dict["length"],
+        wsize,
+    )
+
+    sto.write(
+        Array(x),
+        name="dplms",
+        lh5_file=par_dsp_lh5,
+        wo_mode="overwrite",
+        group=lh5_path,
+    )
+
+    out_dict = {
+        "dplms": {
+            "length": dplms_dict["length"],
+            "coefficients": f"loadlh5('{par_dsp_lh5}', '{lh5_path}/dplms')",
+            "dp_coeffs": {
+                "nm": nm_coeff,
+                "za": za_coeff,
+                "ft": ft_coeff,
+                "rt": rt_coeff,
+                "pt": pt_coeff,
+            },
+        }
+    }
+    out_alpha_dict = {
+        f"{ene_par}_ctc": {
+            "expression": f"{ene_par}*(1+dt_eff*a)",
+            "parameters": {"a": round(alpha, 9)},
+        }
+    }
+    out_dict.update({"ctc_params": out_alpha_dict})
+
+    log.info(f"Time to complete DPLMS filter synthesis {time.time()-t0:.1f} s")
+
+    if display > 0:
+        plot_dict["dplms"]["ref"] = ref
+        plot_dict["dplms"]["coefficients"] = x
+
+        bl_idxs = np.random.choice(len(bls), dplms_dict["n_plot"])
+        bls = bls[bl_idxs]
+        fig, 
ax = plt.subplots(figsize=(12, 6.75), facecolor="white") + for ii, wf in enumerate(bls): + if ii < 10: + ax.plot(wf, label=f"mean = {wf.mean():.1f}") + else: + ax.plot(wf) + ax.legend(title=f"{lh5_path}", loc="upper right") + plot_dict["dplms"]["bls"] = fig + fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(16, 9), facecolor="white") + for ii, par in enumerate(bls_cut_pars): + mean = cut_dict[par]["Mean Value"] + llo, lup = cut_dict[par]["Lower Boundary"], cut_dict[par]["Upper Boundary"] + plo, pup = mean - 2 * (mean - llo), mean + 2 * (lup - mean) + hh, bb = np.histogram(bls_par[par], bins=np.linspace(plo, pup, 200)) + ax.flat[ii].plot(bb[1:], hh, ds="steps", label=f"cut on {par}") + ax.flat[ii].axvline(lup, color="k", linestyle=":", label="selection") + ax.flat[ii].axvline(llo, color="k", linestyle=":") + ax.flat[ii].set_xlabel(par) + ax.flat[ii].set_yscale("log") + ax.flat[ii].legend(title=f"{lh5_path}", loc="upper right") + plot_dict["dplms"]["bl_sel"] = fig + + wf_idxs = np.random.choice(len(wfs), dplms_dict["n_plot"]) + wfs = wfs[wf_idxs] + peak_pos = dsp_cal["peak_pos"].nda + peak_pos_neg = dsp_cal["peak_pos_neg"].nda + centroid = dsp_cal["centroid"].nda + risetime = dsp_cal["tp_90"].nda - dsp_cal["tp_10"].nda + rt_low = dplms_dict["rt_low"] + rt_high = dplms_dict["rt_high"] + peak_lim = dplms_dict["peak_lim"] + cal_par = {} + wfs_cut_pars = [par for par in dplms_dict["wfs_cut_pars"].keys()] + for par in wfs_cut_pars: + cal_par[par] = dsp_cal[par].nda + fig, ax = plt.subplots(figsize=(12, 6.75), facecolor="white") + for ii, wf in enumerate(wfs): + if ii < 10: + ax.plot(wf, label=f"centr = {centroid[ii]}") + else: + ax.plot(wf) + ax.legend(title=f"{lh5_path}", loc="upper right") + axin = ax.inset_axes([0.1, 0.15, 0.35, 0.5]) + for wf in wfs: + axin.plot(wf) + axin.set_xlim(wsize / 2 - dplms_dict["zoom"], wsize / 2 + dplms_dict["zoom"]) + axin.set_yticklabels("") + plot_dict["dplms"]["wfs"] = fig + fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(16, 9), facecolor="white") + wfs_cut_pars.append("centroid") + wfs_cut_pars.append("peak_pos") + wfs_cut_pars.append("risetime") + for ii, par in enumerate(wfs_cut_pars): + pspace = np.linspace( + wsize / 2 - peak_lim, wsize / 2 + peak_lim, 2 * peak_lim + ) + if par == "centroid": + llo, lup = sel_dict["ct_ll"], sel_dict["ct_hh"] + hh, bb = np.histogram(centroid, bins=pspace) + elif par == "peak_pos": + llo, lup = sel_dict["pp_ll"], sel_dict["pp_hh"] + hh, bb = np.histogram(peak_pos, bins=pspace) + elif par == "risetime": + llo, lup = sel_dict["rt_ll"], sel_dict["rt_hh"] + rt_bins = int((rt_high - rt_low) / dplms_dict["period"]) + rt_space = np.linspace(rt_low, rt_high, rt_bins) + hh, bb = np.histogram(risetime, bins=rt_space) + else: + llo, lup = np.min(cal_par[par]), np.max(cal_par[par]) + hh, bb = np.histogram(cal_par[par], bins=np.linspace(llo, lup, 200)) + ax.flat[ii + 1].plot(bb[1:], hh, ds="steps", label=f"cut on {par}") + ax.flat[ii + 1].axvline( + llo, color="k", linestyle=":", label=f"sel. 
{llo:.1f} {lup:.1f}" + ) + if par != "centroid": + ax.flat[ii + 1].axvline(lup, color="k", linestyle=":") + ax.flat[ii + 1].set_xlabel(par) + ax.flat[ii + 1].set_yscale("log") + ax.flat[ii + 1].legend(title=f"{lh5_path}", loc="upper right") + roughenergy = dsp_cal["trapTmax"].nda + roughenergy_sel = roughenergy[idxs] + ell, ehh = roughenergy.min(), roughenergy.max() + he, be = np.histogram(roughenergy, bins=np.linspace(ell, ehh, 1000)) + hs, be = np.histogram(roughenergy_sel, bins=np.linspace(ell, ehh, 1000)) + ax.flat[0].plot(be[1:], he, c="b", ds="steps", label="initial") + ax.flat[0].plot(be[1:], hs, c="r", ds="steps", label="selected") + ax.flat[0].set_xlabel("rough energy (ADC)") + ax.flat[0].set_yscale("log") + ax.flat[0].legend(loc="upper right", title=f"{lh5_path}") + plot_dict["dplms"]["wf_sel"] = fig + + fig, ax = plt.subplots(figsize=(12, 6.75), facecolor="white") + ax.plot(x, "r-", label=f"filter") + ax.axhline(0, color="black", linestyle=":") + ax.legend(loc="upper right", title=f"{lh5_path}") + axin = ax.inset_axes([0.6, 0.1, 0.35, 0.33]) + axin.plot(x, "r-") + axin.set_xlim( + dplms_dict["length"] / 2 - dplms_dict["zoom"], + dplms_dict["length"] / 2 + dplms_dict["zoom"], + ) + axin.set_yticklabels("") + ax.indicate_inset_zoom(axin) + + return out_dict, plot_dict + else: + return out_dict + + +def is_valid_centroid( + centroid: np.array, lim: int, size: int, full_size: int +) -> list[bool]: + llim = size / 2 - lim + hlim = full_size - size / 2 + idxs = (centroid > llim) & (centroid < hlim) + return idxs, llim, hlim + + +def is_not_pile_up( + peak_pos: np.array, peak_pos_neg: np.array, thr: int, lim: int, size: int +) -> list[bool]: + bin_edges = np.linspace(size / 2 - lim, size / 2 + lim, 2 * lim) + hist, bin_edges = np.histogram(peak_pos, bins=bin_edges) + + thr = thr * hist.max() / 100 + low_thr_idxs = np.where(hist[: hist.argmax()] < thr)[0] + upp_thr_idxs = np.where(hist[hist.argmax() :] < thr)[0] + + idx_low = low_thr_idxs[-1] if low_thr_idxs.size > 0 else 0 + idx_upp = ( + upp_thr_idxs[0] + hist.argmax() if upp_thr_idxs.size > 0 else len(hist) - 1 + ) + + llow, lupp = bin_edges[idx_low], bin_edges[idx_upp] + + idxs = [] + for n, nn in zip(peak_pos, peak_pos_neg): + condition1 = np.count_nonzero(n > 0) == 1 + condition2 = ( + np.count_nonzero((n > 0) & ((n < llow) | (n > lupp) & (n < size))) == 0 + ) + condition3 = np.count_nonzero(nn > 0) == 0 + idxs.append(condition1 and condition2 and condition3) + return idxs, llow, lupp + + +def is_valid_risetime(risetime: np.array, llim: int, perc: float): + hlim = np.percentile(risetime[~np.isnan(risetime)], perc) + idxs = (risetime >= llim) & (risetime <= hlim) + return idxs, llim, hlim + + +def signal_selection(dsp_cal, dplms_dict, coeff_values): + peak_pos = dsp_cal["peak_pos"].nda + peak_pos_neg = dsp_cal["peak_pos_neg"].nda + centroid = dsp_cal["centroid"].nda + risetime = dsp_cal["tp_90"].nda - dsp_cal["tp_10"].nda + + rt_low = dplms_dict["rt_low"] + rt_high = dplms_dict["rt_high"] + peak_lim = dplms_dict["peak_lim"] + wsize = dplms_dict["wsize"] + bsize = dplms_dict["bsize"] + + centroid_lim = dplms_dict["centroid_lim"] + if "rt" in coeff_values: + perc = coeff_values["rt"] + else: + perc = dplms_dict["dp_def"]["rt"] + if "pt" in coeff_values: + thr = coeff_values["pt"] + else: + thr = dplms_dict["dp_def"]["rt"] + + idxs_ct, ct_ll, ct_hh = is_valid_centroid(centroid, centroid_lim, wsize, bsize) + log.info(f"... 
{len(peak_pos[idxs_ct,:])} signals after alignment") + + idxs_pp, pp_ll, pp_hh = is_not_pile_up(peak_pos, peak_pos_neg, thr, peak_lim, wsize) + log.info(f"... {len(peak_pos[idxs_pp,:])} signals after pile-up cut") + + idxs_rt, rt_ll, rt_hh = is_valid_risetime(risetime, rt_low, perc) + log.info(f"... {len(peak_pos[idxs_rt,:])} signals after risetime cut") + + idxs = idxs_ct & idxs_pp & idxs_rt + sel_dict = { + "idxs": idxs, + "ct_ll": ct_ll, + "ct_hh": ct_hh, + "pp_ll": pp_ll, + "pp_hh": pp_hh, + "rt_ll": rt_ll, + "rt_hh": rt_hh, + } + return sel_dict + + +def noise_matrix(bls: np.array, length: int) -> np.array: + nev, size = bls.shape + ref = np.mean(bls, axis=0) + offset = np.mean(ref) + bls = bls - offset + nmat = np.matmul(bls.T, bls, dtype=float) / nev + kernel = np.identity(size - length + 1) + nmat = convolve2d(nmat, kernel, boundary="symm", mode="valid") / (size - length + 1) + return nmat + + +def signal_matrices( + wfs: np.array, length: int, decay_const: float, ff: int = 2 +) -> np.array: + nev, size = wfs.shape + lo = size // 2 - 100 + flo = size // 2 - length // 2 + fhi = size // 2 + length // 2 + offsets = np.mean(wfs[:, :lo], axis=1) + wfs = wfs - offsets[:, np.newaxis] + + # Reference signal + ref = np.sum(wfs, axis=0) + ref /= np.max(ref) + rmat = np.outer(ref[flo:fhi], ref[flo:fhi]) + + # Pile-up matrix + if decay_const > 0: + decay = np.exp(-np.arange(length) / decay_const) + else: + decay = np.zeros(length) + pmat = np.outer(decay, decay) + + # Flat top matrix + flo -= ff // 2 + fhi += ff // 2 + wfs = wfs[:, flo:fhi] + fmat = np.matmul(wfs.T, wfs, dtype=float) / nev + m1 = ((1, -1), (-1, 1)) + fmat = convolve2d(fmat, m1, boundary="symm", mode="valid") + if ff > 0: + fmat = convolve2d(fmat, np.identity(ff), boundary="symm", mode="valid") / ff + return ref, rmat, pmat, fmat + + +def filter_synthesis( + ref: np.array, + nmat: np.array, + rmat: np.array, + za: int, + pmat: np.array, + fmat: np.array, + length: int, + size: int, + flip: bool = True, +) -> np.array: + mat = nmat + rmat + za * np.ones([length, length]) + pmat + fmat + flo = (size // 2) - (length // 2) + fhi = (size // 2) + (length // 2) + x = np.linalg.solve(mat, ref[flo:fhi]).astype(np.float32) + y = convolve(ref, np.flip(x), mode="valid") + maxy = np.max(y) + x /= maxy + y /= maxy + refy = ref[(size // 2) - (len(y) // 2) : (size // 2) + (len(y) // 2)] + if flip: + return np.flip(x), y, refy + else: + return x, y, refy diff --git a/src/pygama/pargen/ecal_th.py b/src/pygama/pargen/ecal_th.py index d37b74f77..e3526c63a 100644 --- a/src/pygama/pargen/ecal_th.py +++ b/src/pygama/pargen/ecal_th.py @@ -15,7 +15,7 @@ from scipy.stats import binned_statistic mpl.use("agg") -import lgdo.lh5_store as lh5 +import lgdo.lh5 as lh5 import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -212,7 +212,7 @@ def fit_energy_res(self): indexes.append(i) continue elif peak == 511.0: - log.info(f"e annhilation found at index {i}") + log.info(f"e annihilation found at index {i}") indexes.append(i) continue elif np.isnan(dfwhms[i]): @@ -1326,7 +1326,7 @@ def plot_eres_fit(ecal_class, data, erange=[200, 2700], figsize=[12, 8], fontsiz indexes.append(i) continue elif peak == 511.0: - log.info(f"e annhilation found at index {i}") + log.info(f"e annihilation found at index {i}") indexes.append(i) continue else: diff --git a/src/pygama/pargen/energy_optimisation.py b/src/pygama/pargen/energy_optimisation.py index 24703fda2..905d126f0 100644 --- a/src/pygama/pargen/energy_optimisation.py +++ 
b/src/pygama/pargen/energy_optimisation.py @@ -13,7 +13,7 @@ import sys from collections import namedtuple -import lgdo.lh5_store as lh5 +import lgdo.lh5 as lh5 import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np @@ -23,8 +23,10 @@ from matplotlib.colors import LogNorm from scipy.optimize import curve_fit, minimize from scipy.stats import chisquare, norm +from sklearn.exceptions import ConvergenceWarning from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel +from sklearn.utils._testing import ignore_warnings import pygama.math.histogram as pgh import pygama.math.peak_fitting as pgf @@ -68,8 +70,8 @@ def run_optimisation( Number of events to run over """ grid = set_par_space(opt_config) - waveforms = sto.read_object(f"/raw/{wf_field}", file, idx=cuts, n_rows=n_events)[0] - baseline = sto.read_object("/raw/baseline", file, idx=cuts, n_rows=n_events)[0] + waveforms = sto.read(f"/raw/{wf_field}", file, idx=cuts, n_rows=n_events)[0] + baseline = sto.read("/raw/baseline", file, idx=cuts, n_rows=n_events)[0] tb_data = lh5.Table(col_dict={f"{wf_field}": waveforms, "baseline": baseline}) return opt.run_grid(tb_data, dsp_config, grid, fom, db_dict, **fom_kwargs) @@ -138,12 +140,8 @@ def form_dict(in_dict, length): fom_kwargs = fom_kwargs["fom_kwargs"] fom_kwargs = form_dict(fom_kwargs, len(grid)) sto = lh5.LH5Store() - waveforms = sto.read_object( - f"{lh5_path}/{wf_field}", file, idx=cuts, n_rows=n_events - )[0] - baseline = sto.read_object(f"{lh5_path}/baseline", file, idx=cuts, n_rows=n_events)[ - 0 - ] + waveforms = sto.read(f"{lh5_path}/{wf_field}", file, idx=cuts, n_rows=n_events)[0] + baseline = sto.read(f"{lh5_path}/baseline", file, idx=cuts, n_rows=n_events)[0] tb_data = lh5.Table(col_dict={f"{wf_field}": waveforms, "baseline": baseline}) return opt.run_grid_multiprocess_parallel( tb_data, @@ -926,8 +924,9 @@ def event_selection( if not isinstance(kev_widths, list): kev_widths = [kev_widths] - sto = lh5.LH5Store() - df = lh5.load_dfs(raw_files, ["daqenergy", "timestamp"], lh5_path) + df = sto.read(lh5_path, raw_files, field_mask=["daqenergy", "timestamp"])[ + 0 + ].view_as("pd") if pulser_mask is None: pulser_props = cts.find_pulser_properties(df, energy="daqenergy") @@ -954,17 +953,15 @@ def event_selection( initial_idxs = np.where(initial_mask)[0] guess_keV = 2620 / np.nanpercentile(rough_energy, 99) - Euc_min = threshold / guess_keV + Euc_min = threshold / guess_keV * 0.6 Euc_max = 2620 / guess_keV * 1.1 - dEuc = 5 / guess_keV + dEuc = 1 # / guess_keV hist, bins, var = pgh.get_hist(rough_energy, range=(Euc_min, Euc_max), dx=dEuc) detected_peaks_locs, detected_peaks_keV, roughpars = pgc.hpge_find_E_peaks( hist, bins, var, - np.array( - [238.632, 583.191, 727.330, 860.564, 1592.5, 1620.5, 2103.53, 2614.553] - ), + np.array([238.632, 583.191, 727.330, 860.564, 1620.5, 2103.53, 2614.553]), ) log.debug(f"detected {detected_peaks_keV} keV peaks at {detected_peaks_locs}") @@ -999,9 +996,7 @@ def event_selection( idx_list = get_wf_indexes(sort_index, idx_list_lens) idxs = np.array(sorted(np.concatenate(masks))) - input_data = sto.read_object(f"{lh5_path}", raw_files, idx=idxs, n_rows=len(idxs))[ - 0 - ] + input_data = sto.read(f"{lh5_path}", raw_files, idx=idxs, n_rows=len(idxs))[0] if isinstance(dsp_config, str): with open(dsp_config) as r: @@ -1075,7 +1070,7 @@ def event_selection( log.warning("Less than half number of specified events found") elif len(peak_ids[final_mask]) < 0.1 * n_events: 
log.error("Less than 10% number of specified events found") - out_events = np.unique(np.array(out_events).flatten()) + out_events = np.unique(np.concatenate(out_events)) sort_index = np.argsort(np.concatenate(final_events)) idx_list = get_wf_indexes(sort_index, [len(mask) for mask in final_events]) return out_events, idx_list @@ -1389,6 +1384,7 @@ def get_first_point(self): self.optimal_ei = None return self.optimal_x, self.optimal_ei + @ignore_warnings(category=ConvergenceWarning) def iterate_values(self): nan_idxs = np.isnan(self.y_init) self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) @@ -1459,6 +1455,7 @@ def get_best_vals(self): out_dict[name][parameter] = value_str return out_dict + @ignore_warnings(category=ConvergenceWarning) def plot(self, init_samples=None): nan_idxs = np.isnan(self.y_init) fail_idxs = np.isnan(self.yerr_init) @@ -1565,6 +1562,7 @@ def plot(self, init_samples=None): plt.close() return fig + @ignore_warnings(category=ConvergenceWarning) def plot_acq(self, init_samples=None): nan_idxs = np.isnan(self.y_init) self.gauss_pr.fit(self.x_init[~nan_idxs], np.array(self.y_init)[~nan_idxs]) diff --git a/src/pygama/pargen/extract_tau.py b/src/pygama/pargen/extract_tau.py index 72e357fd7..61e833994 100644 --- a/src/pygama/pargen/extract_tau.py +++ b/src/pygama/pargen/extract_tau.py @@ -15,7 +15,7 @@ mpl.use("agg") import lgdo -import lgdo.lh5_store as lh5 +import lgdo.lh5 as lh5 import matplotlib.pyplot as plt import numpy as np @@ -26,6 +26,7 @@ import pygama.pargen.energy_optimisation as om log = logging.getLogger(__name__) +sto = lh5.LH5Store() def load_data( @@ -36,8 +37,9 @@ def load_data( threshold: int = 5000, wf_field: str = "waveform", ) -> lgdo.Table: - sto = lh5.LH5Store() - df = lh5.load_dfs(raw_file, ["daqenergy", "timestamp"], lh5_path) + df = sto.read(lh5_path, raw_file, field_mask=["daqenergy", "timestamp"])[0].view_as( + "pd" + ) if pulser_mask is None: pulser_props = cts.find_pulser_properties(df, energy="daqenergy") @@ -61,12 +63,10 @@ def load_data( cuts = np.where((df.daqenergy.values > threshold) & (~ids))[0] - waveforms = sto.read_object( - f"{lh5_path}/{wf_field}", raw_file, idx=cuts, n_rows=n_events - )[0] - baseline = sto.read_object( - f"{lh5_path}/baseline", raw_file, idx=cuts, n_rows=n_events - )[0] + waveforms = sto.read(f"{lh5_path}/{wf_field}", raw_file, idx=cuts, n_rows=n_events)[ + 0 + ] + baseline = sto.read(f"{lh5_path}/baseline", raw_file, idx=cuts, n_rows=n_events)[0] tb_data = lh5.Table(col_dict={f"{wf_field}": waveforms, "baseline": baseline}) return tb_data @@ -144,8 +144,8 @@ def get_decay_constant( ) axins.axvline(high_bin, color="red") axins.set_xlim(bins[in_min], bins[in_max]) - labels = ax.get_xticklabels() - ax.set_xticklabels(labels=labels, rotation=45) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45) out_plot_dict["slope"] = fig if display > 1: plt.show() diff --git a/src/pygama/pargen/utils.py b/src/pygama/pargen/utils.py index 27f4af9ae..a1ec229ab 100644 --- a/src/pygama/pargen/utils.py +++ b/src/pygama/pargen/utils.py @@ -3,12 +3,13 @@ import logging from types import FunctionType -import lgdo.lh5_store as lh5 import numpy as np import pandas as pd from iminuit import Minuit, cost, util +from lgdo import Table, lh5 log = logging.getLogger(__name__) +sto = lh5.LH5Store() def return_nans(input): @@ -50,8 +51,6 @@ def load_data( Loads in the A/E parameters needed and applies calibration constants to energy """ - sto = lh5.LH5Store() - out_df = 
pd.DataFrame(columns=params) if isinstance(files, dict): @@ -69,16 +68,23 @@ def load_data( all_files = [] masks = np.array([], dtype=bool) for tstamp, tfiles in files.items(): - table = sto.read_object(lh5_path, tfiles)[0] + table = sto.read(lh5_path, tfiles)[0] + + file_df = pd.DataFrame(columns=params) if tstamp in cal_dict: - file_df = table.eval(cal_dict[tstamp]).get_dataframe() + cal_dict_ts = cal_dict[tstamp] else: - file_df = table.eval(cal_dict).get_dataframe() - file_df["run_timestamp"] = np.full(len(file_df), tstamp, dtype=object) - params.append("run_timestamp") + cal_dict_ts = cal_dict + + for outname, info in cal_dict_ts.items(): + outcol = table.eval(info["expression"], info.get("parameters", None)) + table.add_column(outname, outcol) + for param in params: - if param not in file_df: - file_df[param] = lh5.load_nda(tfiles, [param], lh5_path)[param] + file_df[param] = table[param] + + file_df["run_timestamp"] = np.full(len(file_df), tstamp, dtype=object) + if threshold is not None: mask = file_df[cal_energy_param] > threshold file_df.drop(np.where(~mask)[0], inplace=True) @@ -88,6 +94,7 @@ df.append(file_df) all_files += tfiles + params.append("run_timestamp") df = pd.concat(df) elif isinstance(files, list): @@ -95,11 +102,13 @@ keys = [key.split("/")[-1] for key in keys] params = get_params(keys + list(cal_dict.keys()), params) - table = sto.read_object(lh5_path, files)[0] - df = table.eval(cal_dict).get_dataframe() + table = sto.read(lh5_path, files)[0] + df = pd.DataFrame(columns=params) + for outname, info in cal_dict.items(): + outcol = table.eval(info["expression"], info.get("parameters", None)) + table.add_column(outname, outcol) for param in params: - if param not in df: - df[param] = lh5.load_nda(files, [param], lh5_path)[param] + df[param] = table[param] if threshold is not None: masks = df[cal_energy_param] > threshold df.drop(np.where(~masks)[0], inplace=True) @@ -133,14 +142,23 @@ def get_tcm_pulser_ids(tcm_file, channel, multiplicity_threshold): mask = np.append(mask, file_mask) ids = np.where(mask)[0] else: - data = lh5.load_dfs(tcm_file, ["array_id", "array_idx"], "hardware_tcm_1") - cum_length = lh5.load_nda(tcm_file, ["cumulative_length"], "hardware_tcm_1")[ - "cumulative_length" - ] - cum_length = np.append(np.array([0]), cum_length) - n_channels = np.diff(cum_length) - evt_numbers = np.repeat(np.arange(0, len(cum_length) - 1), np.diff(cum_length)) - evt_mult = np.repeat(np.diff(cum_length), np.diff(cum_length)) + data = pd.DataFrame( + { + "array_id": sto.read("hardware_tcm_1/array_id", tcm_file)[0].view_as( + "np" + ), + "array_idx": sto.read("hardware_tcm_1/array_idx", tcm_file)[0].view_as( + "np" + ), + } + ) + cumulength = sto.read("hardware_tcm_1/cumulative_length", tcm_file)[0].view_as( + "np" + ) + cumulength = np.append(np.array([0]), cumulength) + n_channels = np.diff(cumulength) + evt_numbers = np.repeat(np.arange(0, len(cumulength) - 1), np.diff(cumulength)) + evt_mult = np.repeat(np.diff(cumulength), np.diff(cumulength)) data["evt_number"] = evt_numbers data["evt_mult"] = evt_mult high_mult_events = np.where(n_channels > multiplicity_threshold)[0] diff --git a/src/pygama/skm/__init__.py b/src/pygama/skm/__init__.py new file mode 100644 index 000000000..7b9ae88d2 --- /dev/null +++ b/src/pygama/skm/__init__.py @@ -0,0 +1,7 @@ +""" +Utilities for building the `skm` tier: skimmed data from lower tiers.
+""" + +from .build_skm import build_skm + +__all__ = ["build_skm"] diff --git a/src/pygama/skm/build_skm.py b/src/pygama/skm/build_skm.py new file mode 100644 index 000000000..83c601c3a --- /dev/null +++ b/src/pygama/skm/build_skm.py @@ -0,0 +1,245 @@ +""" +This module implements routines to build the `skm` tier, consisting of skimmed +data from lower tiers. +""" + +from __future__ import annotations + +import json +import logging +import os + +import awkward as ak +import numpy as np +from lgdo import Array, Table, lh5 +from lgdo.lh5 import LH5Store + +from pygama.evt import utils + +log = logging.getLogger(__name__) + + +def build_skm( + f_evt: str, + f_hit: str, + f_dsp: str, + f_tcm: str, + skm_conf: dict | str, + f_skm: str | None = None, + wo_mode: str = "w", + skm_group: str = "skm", + evt_group: str = "evt", + tcm_group: str = "hardware_tcm_1", + dsp_group: str = "dsp", + hit_group: str = "hit", + tcm_id_table_pattern: str = "ch{}", +) -> None | Table: + """Builds a skimmed file from a (set) of `evt/hit/dsp` tier file(s). + + Parameters + ---------- + f_evt + path of `evt` file. + f_hit + path of `hit` file. + f_dsp + path of `dsp` file. + f_tcm + path of `tcm` file. + skm_conf + name of configuration file or dictionary defining `skm` fields. + + - ``multiplicity`` defines up to which row length + :class:`.VectorOfVector` fields should be kept. + - ``postfixes`` list of postfixes must be list of + ``len(multiplicity)``. If not given, numbers from 0 to + ``multiplicity -1`` are used + - ``operations`` are forwarded from lower tiers and clipped/padded + according to ``missing_value`` if needed. If the forwarded field + is not an evt tier, ``tcm_idx`` must be passed that specifies the + value to pick across channels. + + For example: + + .. code-block:: json + + { + "multiplicity": 2, + "postfixes":["", "aux"], + "operations": { + "timestamp":{ + "forward_field": "evt.timestamp" + }, + "multiplicity":{ + "forward_field": "evt.multiplicity" + }, + "energy":{ + "forward_field": "hit.cuspEmax_ctc_cal", + "missing_value": "np.nan", + "tcm_idx": "evt.energy_idx" + }, + "energy_id":{ + "forward_field": "tcm.array_id", + "missing_value": 0, + "tcm_idx": "evt.energy_idx" + } + } + } + f_skm + name of the `skm` output file. If ``None``, return the output + class:`.Table` instead of writing to disk. + + wo_mode + writing mode. + + - ``write_safe`` or ``w``: only proceed with writing if the file does + not already exists. + - ``append`` or ``a``: append to file. + - ``overwrite`` or ``o``: replaces existing file. + + skm_group + `skm` LH5 root group name. + evt_group + `evt` LH5 root group name. + hit_group + `hit` LH5 root group name. + dsp_group + `dsp` LH5 root group name. + tcm_group + `tcm` LH5 root group name. + tcm_id_table_pattern + pattern to format `tcm` id values to table name in higher tiers. Must have one + placeholder which is the `tcm` id. 
+ """ + f_dict = {evt_group: f_evt, hit_group: f_hit, dsp_group: f_dsp, tcm_group: f_tcm} + log = logging.getLogger(__name__) + log.debug(f"I am skimming {len(f_evt) if isinstance(f_evt,list) else 1} files") + + tbl_cfg = skm_conf + if not isinstance(tbl_cfg, (str, dict)): + raise TypeError() + if isinstance(tbl_cfg, str): + with open(tbl_cfg) as f: + tbl_cfg = json.load(f) + + # Check if multiplicity is given + if "multiplicity" not in tbl_cfg.keys(): + raise ValueError("multiplicity field missing") + + multi = int(tbl_cfg["multiplicity"]) + store = LH5Store() + # df = pd.DataFrame() + table = Table() + if "operations" in tbl_cfg.keys(): + for op in tbl_cfg["operations"].keys(): + miss_val = np.nan + if "missing_value" in tbl_cfg["operations"][op].keys(): + miss_val = tbl_cfg["operations"][op]["missing_value"] + if isinstance(miss_val, str) and ( + miss_val in ["np.nan", "np.inf", "-np.inf"] + ): + miss_val = eval(miss_val) + + fw_fld = tbl_cfg["operations"][op]["forward_field"] + + # load object if from evt tier + if evt_group in fw_fld.replace(".", "/"): + obj = store.read( + f"/{fw_fld.replace('.','/')}", f_dict[fw_fld.split(".", 1)[0]] + )[0].view_as("ak") + + # else collect data from lower tier via tcm_idx + else: + if "tcm_idx" not in tbl_cfg["operations"][op].keys(): + raise ValueError( + f"{op} is an sub evt level operation. tcm_idx field must be specified" + ) + tcm_idx_fld = tbl_cfg["operations"][op]["tcm_idx"] + tcm_idx = store.read( + f"/{tcm_idx_fld.replace('.','/')}", + f_dict[tcm_idx_fld.split(".")[0]], + )[0].view_as("ak")[:, :multi] + + obj = ak.Array([[] for x in range(len(tcm_idx))]) + + # load TCM data to define an event + ids = store.read(f"/{tcm_group}/array_id", f_tcm)[0].view_as("ak") + ids = ak.unflatten(ids[ak.flatten(tcm_idx)], ak.count(tcm_idx, axis=-1)) + + idx = store.read(f"/{tcm_group}/array_idx", f_tcm)[0].view_as("ak") + idx = ak.unflatten(idx[ak.flatten(tcm_idx)], ak.count(tcm_idx, axis=-1)) + + if "tcm.array_id" == tbl_cfg["operations"][op]["forward_field"]: + obj = ids + elif "tcm.array_idx" == tbl_cfg["operations"][op]["forward_field"]: + obj = idx + + else: + chns = np.unique( + ak.to_numpy(ak.flatten(ids), allow_missing=False) + ).astype(int) + + # Get the data + for ch in chns: + ch_idx = idx[ids == ch] + ct_idx = ak.count(ch_idx, axis=-1) + fl_idx = ak.to_numpy(ak.flatten(ch_idx), allow_missing=False) + + if ( + f"{utils.get_table_name_by_pattern(tcm_id_table_pattern,ch)}/{fw_fld.replace('.','/')}" + not in lh5.ls( + f_dict[[key for key in f_dict if key in fw_fld][0]], + f"ch{ch}/{fw_fld.rsplit('.',1)[0]}/", + ) + ): + och = Array(nda=np.full(len(fl_idx), miss_val)) + else: + och, _ = store.read( + f"{utils.get_table_name_by_pattern(tcm_id_table_pattern,ch)}/{fw_fld.replace('.','/')}", + f_dict[[key for key in f_dict if key in fw_fld][0]], + idx=fl_idx, + ) + if not isinstance(och, Array): + raise ValueError( + f"{type(och)} not supported. 
Forward only Array fields" + ) + och = och.view_as("ak") + och = ak.unflatten(och, ct_idx) + obj = ak.concatenate((obj, och), axis=-1) + + # Pad, clip and numpyfy + if obj.ndim > 1: + obj = ak.pad_none(obj, multi, clip=True) + obj = ak.to_numpy(ak.fill_none(obj, miss_val)) + + if obj.ndim > 1: + if "postfixes" in tbl_cfg.keys(): + nms = [f"{op}{x}" for x in tbl_cfg["postfixes"]] + else: + nms = [f"{op}_{x}" for x in range(multi)] + + for i in range(len(nms)): + # add attribute if present + ob = Array(nda=obj[:, i]) + if "lgdo_attrs" in tbl_cfg["operations"][op].keys(): + ob.attrs |= tbl_cfg["operations"][op]["lgdo_attrs"] + table.add_field(nms[i], ob, True) + else: + obj = Array(nda=obj) + if "lgdo_attrs" in tbl_cfg["operations"][op].keys(): + obj.attrs |= tbl_cfg["operations"][op]["lgdo_attrs"] + table.add_field(op, obj, True) + + if not f_skm: + return table + + # last thing missing is writing it out + if wo_mode not in ["w", "write_safe", "o", "overwrite", "a", "append"]: + raise ValueError(f"wo_mode {wo_mode} not valid.") + + log.debug("saving skm file") + if (wo_mode in ["w", "write_safe"]) and os.path.exists(f_skm): + raise FileExistsError(f"Write_safe mode: {f_skm} exists.") + + wo = wo_mode if wo_mode not in ["o", "overwrite"] else "of" + store.write(obj=table, name=f"/{skm_group}/", lh5_file=f_skm, wo_mode=wo) diff --git a/tests/configs/icpc-dsp-config.json b/tests/configs/icpc-dsp-config.json index 8536a31ec..28af29239 100644 --- a/tests/configs/icpc-dsp-config.json +++ b/tests/configs/icpc-dsp-config.json @@ -38,19 +38,19 @@ "processors": { "tp_min, tp_max, wf_min, wf_max": { "function": "min_max", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["waveform", "tp_min", "tp_max", "wf_min", "wf_max"], "unit": ["ns", "ns", "ADC", "ADC"] }, "wf_blsub": { "function": "bl_subtract", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["waveform", "baseline", "wf_blsub"], "unit": "ADC" }, "bl_mean , bl_std, bl_slope, bl_intercept": { "function": "linear_slope_fit", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": [ "wf_blsub[0:750]", "bl_mean", @@ -62,51 +62,65 @@ }, "wf_pz": { "function": "pole_zero", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_blsub", "db.pz.tau", "wf_pz"], "unit": "ADC", "defaults": { "db.pz.tau": "27460.5" } }, "pz_mean , pz_std, pz_slope, pz_intercept": { "function": "linear_slope_fit", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz[1500:]", "pz_mean", "pz_std", "pz_slope", "pz_intercept"], "unit": ["ADC", "ADC", "ADC", "ADC"] }, - "wf_t0_filter": { + "t0_kernel": { "function": "t0_filter", - "module": "pygama.dsp.processors", - "args": ["wf_pz", "wf_t0_filter(len(wf_pz), 'f', grid=wf_pz.grid)"], - "init_args": ["128*ns/wf_pz.period", "2*us/wf_pz.period"], + "module": "dspeed.processors", + "args": [ + "128*ns/wf_pz.period", + "2*us/wf_pz.period", + "t0_kernel(round((128*ns+2*us)/wf_pz.period), 'f')" + ], + "unit": "ADC" + }, + "wf_t0_filter": { + "function": "convolve_wf", + "module": "dspeed.processors", + "args": [ + "wf_pz", + "t0_kernel", + "'s'", + "wf_t0_filter(len(wf_pz), 'f', grid=wf_pz.grid)" + ], "unit": "ADC" }, "wf_atrap": { "function": "asym_trap_filter", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "128*ns", "4", "2*us", "wf_atrap"], "unit": "ADC" }, "conv_tmin ,tp_start, conv_min, conv_max": { "function": "min_max", - "module": 
"pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_t0_filter", "conv_tmin", "tp_start", "conv_min", "conv_max"], "unit": ["ns", "ns", "ADC", "ADC"] }, "tp_0_atrap": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_atrap", "bl_std", "tp_start", 0, "tp_0_atrap"], "unit": "ns" }, "tp_0_est": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_t0_filter", "bl_std", "tp_start", 0, "tp_0_est(unit=ns)"], "unit": "ns" }, "wf_trap": { "function": "trap_norm", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "db.ttrap.rise", "db.ttrap.flat", "wf_trap"], "unit": "ADC", "defaults": { "db.ttrap.rise": "10*us", "db.ttrap.flat": "3.008*us" } @@ -120,7 +134,7 @@ }, "wf_etrap": { "function": "trap_norm", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "db.etrap.rise", "db.etrap.flat", "wf_etrap"], "unit": "ADC", "defaults": { "db.etrap.rise": "10*us", "db.etrap.flat": "3.008*us" } @@ -134,10 +148,10 @@ }, "trapEftp": { "function": "fixed_time_pickoff", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": [ "wf_etrap", - "tp_0_est+db.etrap.rise+db.etrap.flat*db.etrap.sample", + "round(tp_0_est+db.etrap.rise+db.etrap.flat*db.etrap.sample, wf_etrap.grid)", "'l'", "trapEftp" ], @@ -148,23 +162,33 @@ "db.etrap.sample": "0.8" } }, - "wf_cusp": { + "cusp_kernel": { "function": "cusp_filter", - "module": "pygama.dsp.processors", - "args": ["wf_blsub", "wf_cusp(101, 'f')"], - "init_args": [ - "len(wf_blsub)-100", + "module": "dspeed.processors", + "args": [ "db.cusp.sigma/wf_blsub.period", "round(db.cusp.flat/wf_blsub.period)", - "db.pz.tau" + "db.pz.tau/wf_blsub.period", + "cusp_kernel(round(len(wf_blsub)-(33.6*us/wf_blsub.period)-(4.8*us/wf_blsub.period)), 'f')" ], "defaults": { "db.cusp.sigma": "20*us", "db.cusp.flat": "3*us", - "db.pz.tau": "27460.5" + "db.pz.tau": "450*us" }, "unit": "ADC" }, + "wf_cusp": { + "function": "fft_convolve_wf", + "module": "dspeed.processors", + "args": [ + "wf_blsub[:round(len(wf_blsub)-(33.6*us/wf_blsub.period))]", + "cusp_kernel", + "'v'", + "wf_cusp(round((4.8*us/wf_blsub.period)+1), 'f')" + ], + "unit": "ADC" + }, "cuspEmax": { "function": "amax", "module": "numpy", @@ -174,28 +198,38 @@ }, "cuspEftp": { "function": "fixed_time_pickoff", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_cusp", "db.cusp.sample", "'i'", "cuspEftp"], "unit": "ADC", "defaults": { "db.cusp.sample": "50" } }, - "wf_zac": { + "zac_kernel": { "function": "zac_filter", - "module": "pygama.dsp.processors", - "args": ["wf_blsub", "wf_zac(101, 'f')"], - "init_args": [ - "len(wf_blsub)-100", + "module": "dspeed.processors", + "args": [ "db.zac.sigma/wf_blsub.period", "round(db.zac.flat/wf_blsub.period)", - "db.pz.tau" + "db.pz.tau/wf_blsub.period", + "zac_kernel(round(len(wf_blsub)-(33.6*us/wf_blsub.period)-(4.8*us/wf_blsub.period)), 'f')" ], "defaults": { "db.zac.sigma": "20*us", "db.zac.flat": "3*us", - "db.pz.tau": "27460.5" + "db.pz.tau": "450*us" }, "unit": "ADC" }, + "wf_zac": { + "function": "fft_convolve_wf", + "module": "dspeed.processors", + "args": [ + "wf_blsub[:round(len(wf_blsub)-(33.6*us/wf_blsub.period))]", + "zac_kernel", + "'v'", + "wf_zac(round((4.8*us/wf_blsub.period)+1), 'f')" + ], + "unit": "ADC" + }, "zacEmax": { "function": "amax", "module": "numpy", @@ -205,74 +239,74 @@ }, "zacEftp": { 
"function": "fixed_time_pickoff", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_zac", "db.zac.sample", "'i'", "zacEftp"], "defaults": { "db.zac.sample": "50" }, "unit": "ADC" }, "tp_100": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax", "tp_0_est", 1, "tp_100"], "unit": "ns" }, "tp_99": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "0.99*trapTmax", "tp_0_est", 1, "tp_99"], "unit": "ns" }, "tp_95": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.95", "tp_99", 0, "tp_95"], "unit": "ns" }, "tp_90": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.9", "tp_95", 0, "tp_90"], "unit": "ns" }, "tp_80": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.8", "tp_90", 0, "tp_80"], "unit": "ns" }, "tp_50": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.5", "tp_80", 0, "tp_50"], "unit": "ns" }, "tp_20": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.2", "tp_50", 0, "tp_20"], "unit": "ns" }, "tp_10": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.1", "tp_20", 0, "tp_10"], "unit": "ns" }, "tp_01": { "function": "time_point_thresh", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "trapTmax*0.01", "tp_10", 0, "tp_01"], "unit": "ns" }, "wf_trap2": { "function": "trap_norm", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "4*us", "96*ns", "wf_trap2"], "unit": "ADC" }, "trapQftp": { "function": "fixed_time_pickoff", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_trap2", "tp_0_est + 8.096*us", "'l'", "trapQftp"], "unit": "ADC" }, @@ -290,31 +324,31 @@ }, "wf_le": { "function": "windower", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_pz", "tp_0_est", "wf_le(301, 'f')"], "unit": "ADC" }, "curr": { "function": "avg_current", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["wf_le", 1, "curr(len(wf_le)-1, 'f')"], "unit": "ADC/sample" }, "curr_up": { "function": "upsampler", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["curr", "16", "curr_up(4784, 'f')"], "unit": "ADC/sample" }, "curr_av": { "function": "moving_window_multi", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["curr_up", "48", 3, 0, "curr_av"], "unit": "ADC/sample" }, "aoe_t_min, tp_aoe_max, A_min, A_max": { "function": "min_max", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["curr_av", "aoe_t_min", "tp_aoe_max", "A_min", "A_max"], "unit": ["ns", "ns", "ADC/sample", "ADC/sample"] }, diff --git a/tests/configs/sipm-dplms-config.json b/tests/configs/sipm-dplms-config.json index cc7919e33..dd69bac0f 100644 --- a/tests/configs/sipm-dplms-config.json +++ b/tests/configs/sipm-dplms-config.json @@ -10,26 +10,26 @@ "processors": { "wf_gaus": { "function": "gaussian_filter1d", - 
"module": "pygama.dsp.processors", + "module": "dspeed.processors.gaussian_filter1d", "args": ["waveform", "wf_gaus(len(waveform))"], "init_args": ["1", "4.0"], "unit": "ADC" }, "curr": { "function": "avg_current", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.moving_windows", "args": ["wf_gaus", 5, "curr(len(wf_gaus)-5)"], "unit": "ADC" }, "hist_weights , hist_borders": { "function": "histogram", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.histogram", "args": ["curr", "hist_weights(100)", "hist_borders(101)"], "unit": ["none", "ADC"] }, "fwhm, idx_out_c, max_out": { "function": "histogram_stats", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.histogram", "args": [ "hist_weights", "hist_borders", @@ -42,7 +42,7 @@ }, "vt_max_candidate_out, vt_min_out, n_max_out, n_min_out": { "function": "get_multi_local_extrema", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.get_multi_local_extrema", "args": [ "curr", 5, @@ -59,7 +59,7 @@ }, "trigger_pos, no_out": { "function": "peak_snr_threshold", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.peak_snr_threshold", "args": [ "curr", "vt_max_candidate_out", @@ -72,13 +72,13 @@ }, "energies": { "function": "multi_a_filter", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.multi_a_filter", "args": ["curr", "trigger_pos", "energies"], "unit": ["ADC"] }, "bl_mean , bl_std, bl_slope, bl_intercept": { "function": "linear_slope_fit", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": [ "waveform[:50]", "bl_mean", @@ -90,34 +90,45 @@ }, "wf_diff": { "function": "avg_current", - "module": "pygama.dsp.processors", + "module": "dspeed.processors", "args": ["waveform", 1, "wf_diff(len(waveform)-1)"], "unit": "ADC" }, - "wf_dplms": { + "dplms_kernel": { "function": "dplms_filter", - "module": "pygama.dsp.processors", - "args": ["wf_diff", "wf_dplms(len(wf_diff)-49, 'f')"], - "unit": "ADC", - "init_args": [ + "module": "dspeed.processors", + "args": [ "db.dplms.noise_matrix", "db.dplms.reference", - "50", "0.01", "1", "0", - "0" - ] + "0", + "dplms_kernel(50, 'f')" + ], + "unit": "ADC" + }, + "wf_dplms": { + "description": "convolve optimised cusp filter", + "function": "convolve_wf", + "module": "dspeed.processors", + "args": [ + "wf_diff", + "dplms_kernel", + "'s'", + "wf_dplms(len(wf_diff)-49, 'f')" + ], + "unit": "ADC" }, "h_weights , h_borders": { "function": "histogram", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.histogram", "args": ["wf_dplms", "h_weights(100)", "h_borders(101)"], "unit": ["none", "ADC"] }, "fwhm_d, idx_out_d, max_out_d": { "function": "histogram_stats", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.histogram", "args": [ "h_weights", "h_borders", @@ -130,7 +141,7 @@ }, "vt_max_candidate_out_d, vt_min_out_d, n_max_out_d, n_min_out_d": { "function": "get_multi_local_extrema", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.get_multi_local_extrema", "args": [ "wf_dplms", 10, @@ -145,7 +156,7 @@ }, "trigger_pos_dplms, no_out_d": { "function": "peak_snr_threshold", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.peak_snr_threshold", "args": [ "wf_dplms", "vt_max_candidate_out_d", @@ -158,7 +169,7 @@ }, "energies_dplms": { "function": "multi_a_filter", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.multi_a_filter", "args": ["wf_dplms", "trigger_pos_dplms", "energies_dplms"], "unit": ["ADC"] } 
diff --git a/tests/configs/sipm-dsp-config.json b/tests/configs/sipm-dsp-config.json index 8de5bd2e6..bb7878a5d 100644 --- a/tests/configs/sipm-dsp-config.json +++ b/tests/configs/sipm-dsp-config.json @@ -3,26 +3,26 @@ "processors": { "wf_gaus": { "function": "gaussian_filter1d", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.gaussian_filter1d", "args": ["waveform", "wf_gaus(len(waveform))"], "init_args": ["1", "4.0"], "unit": "ADC" }, "curr": { "function": "avg_current", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.moving_windows", "args": ["wf_gaus", 5, "curr(len(wf_gaus)-5)"], "unit": "ADC" }, "hist_weights , hist_borders": { "function": "histogram", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.histogram", "args": ["curr", "hist_weights(100)", "hist_borders(101)"], "unit": ["none", "ADC"] }, "fwhm, idx_out_c, max_out": { "function": "histogram_stats", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.histogram", "args": [ "hist_weights", "hist_borders", @@ -35,7 +35,7 @@ }, "vt_max_candidate_out, vt_min_out, n_max_out, n_min_out": { "function": "get_multi_local_extrema", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.get_multi_local_extrema", "args": [ "curr", 5, @@ -52,7 +52,7 @@ }, "trigger_pos, no_out": { "function": "peak_snr_threshold", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.peak_snr_threshold", "args": [ "curr", "vt_max_candidate_out", @@ -65,7 +65,7 @@ }, "energies": { "function": "multi_a_filter", - "module": "pygama.dsp.processors", + "module": "dspeed.processors.multi_a_filter", "args": ["curr", "trigger_pos", "energies"], "unit": ["ADC"] } diff --git a/tests/evt/configs/basic-evt-config.json b/tests/evt/configs/basic-evt-config.json new file mode 100644 index 000000000..3a8c62753 --- /dev/null +++ b/tests/evt/configs/basic-evt-config.json @@ -0,0 +1,90 @@ +{ + "channels": { + "geds_on": ["ch1084803", "ch1084804", "ch1121600"] + }, + "outputs": [ + "multiplicity", + "energy", + "energy_id", + "energy_idx", + "energy_any_above1MeV", + "energy_all_above1MeV", + "energy_aux", + "energy_sum", + "is_usable_aoe", + "aoe", + "is_aoe_rejected" + ], + "operations": { + "multiplicity": { + "channels": "geds_on", + "aggregation_mode": "sum", + "expression": "hit.cuspEmax_ctc_cal > a", + "parameters": { "a": 25 }, + "initial": 0, + "lgdo_attrs": { "statement": "0bb decay is real" } + }, + "energy": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal", + "initial": "np.nan" + }, + "energy_id": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.array_id", + "initial": 0 + }, + "energy_idx": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.index", + "initial": 999999999999 + }, + "energy_any_above1MeV": { + "channels": "geds_on", + "aggregation_mode": "any", + "expression": "hit.cuspEmax_ctc_cal>1000", + "initial": false + }, + "energy_all_above1MeV": { + "channels": "geds_on", + "aggregation_mode": "all", + "expression": "hit.cuspEmax_ctc_cal>1000", + "initial": false + }, + "energy_aux": { + "channels": "geds_on", + "aggregation_mode": "last_at:dsp.tp_0_est", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal", + "initial": "np.nan" + }, + "energy_sum": { + "channels": 
"geds_on", + "aggregation_mode": "sum", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal", + "initial": 0.0 + }, + "is_usable_aoe": { + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "True", + "initial": false + }, + "aoe": { + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "hit.AoE_Classifier", + "initial": "np.nan" + }, + "is_aoe_rejected": { + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "~(hit.AoE_Double_Sided_Cut)", + "initial": false + } + } +} diff --git a/tests/evt/configs/module-test-evt-config.json b/tests/evt/configs/module-test-evt-config.json new file mode 100644 index 000000000..0daa94658 --- /dev/null +++ b/tests/evt/configs/module-test-evt-config.json @@ -0,0 +1,72 @@ +{ + "channels": { + "spms_on": ["ch1057600", "ch1059201", "ch1062405"], + "geds_on": ["ch1084803", "ch1084804", "ch1121600"] + }, + "outputs": [ + "energy_first", + "energy_first_id", + "t0", + "lar_energy", + "lar_multiplicity", + "is_lar_rejected", + "lar_classifier", + "lar_energy_dplms", + "lar_multiplicity_dplms", + "lar_time_shift" + ], + "operations": { + "energy_first": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal", + "initial": "np.nan" + }, + "energy_first_id": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.array_id", + "initial": 0 + }, + "t0": { + "aggregation_mode": "keep_at_ch:evt.energy_first_id", + "expression": "dsp.tp_0_est", + "initial": 0.0 + }, + "lar_energy": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": "pygama.evt.modules.spm.get_energy(0.5,evt.t0,48000,1000,5000)" + }, + "lar_multiplicity": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_majority(0.5,evt.t0,48000,1000,5000)" + }, + "is_lar_rejected": { + "expression": "(evt.lar_energy >4) | (evt.lar_multiplicity > 4) " + }, + "lar_classifier": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_etc(0.5,evt.t0,48000,100,6000,80,1,0,50)" + }, + "lar_energy_dplms": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_energy_dplms(0.5,evt.t0,48000,1000,5000)" + }, + "lar_multiplicity_dplms": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_majority_dplms(0.5,evt.t0,48000,1000,5000)" + }, + "lar_time_shift": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_time_shift(0.5,evt.t0,48000,1000,5000)" + } + } +} diff --git a/tests/evt/configs/module-test-t0-vov-evt-config.json b/tests/evt/configs/module-test-t0-vov-evt-config.json new file mode 100644 index 000000000..cda042337 --- /dev/null +++ b/tests/evt/configs/module-test-t0-vov-evt-config.json @@ -0,0 +1,82 @@ +{ + "channels": { + "spms_on": ["ch1057600", "ch1059201", "ch1062405"], + "geds_on": ["ch1084803", "ch1084804", "ch1121600"] + }, + "outputs": [ + "energy", + "energy_id", + "t0", + "lar_energy", + "lar_multiplicity", + "is_lar_rejected", + "lar_classifier", + "lar_energy_dplms", + "lar_multiplicity_dplms", + "lar_time_shift", + "lar_tcm_index", + "lar_pulse_index" + ], + "operations": { + "energy": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal" + }, + "energy_id": { + 
"channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.array_id" + }, + "t0": { + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "dsp.tp_0_est", + "initial": 0.0 + }, + "lar_energy": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_energy(0.5,evt.t0,48000,1000,5000)" + }, + "lar_multiplicity": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_majority(0.5,evt.t0,48000,1000,5000)" + }, + "is_lar_rejected": { + "expression": "(evt.lar_energy >4) | (evt.lar_multiplicity > 4) " + }, + "lar_classifier": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_etc(0.5,evt.t0,48000,100,6000,80,1,0,50)" + }, + "lar_energy_dplms": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_energy_dplms(0.5,evt.t0,48000,1000,5000)" + }, + "lar_multiplicity_dplms": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_majority_dplms(0.5,evt.t0,48000,1000,5000)" + }, + "lar_time_shift": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_time_shift(0.5,evt.t0,48000,1000,5000)" + }, + "lar_tcm_index": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_masked_tcm_idx(0.5,evt.t0,48000,1000,5000,1)" + }, + "lar_pulse_index": { + "channels": "spms_on", + "aggregation_mode": "function", + "expression": ".modules.spm.get_masked_tcm_idx(0.5,evt.t0,48000,1000,5000,0)" + } + } +} diff --git a/tests/evt/configs/query-test-evt-config.json b/tests/evt/configs/query-test-evt-config.json new file mode 100644 index 000000000..901d2d6c1 --- /dev/null +++ b/tests/evt/configs/query-test-evt-config.json @@ -0,0 +1,102 @@ +{ + "channels": { + "geds_on": ["ch1084803", "ch1084804", "ch1121600"] + }, + "outputs": [ + "multiplicity", + "test_sum", + "test_first", + "test_first2", + "test_last", + "test_last2", + "test_any", + "test_any2", + "test_all", + "test_all2", + "test_vov", + "test_vov2" + ], + "operations": { + "multiplicity": { + "channels": "geds_on", + "aggregation_mode": "sum", + "expression": "hit.cuspEmax_ctc_cal > a", + "parameters": { "a": 25 }, + "initial": 0 + }, + "test_sum": { + "channels": "geds_on", + "aggregation_mode": "sum", + "query": "evt.multiplicity == 1", + "expression": "True", + "initial": false + }, + "test_first": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "query": "evt.multiplicity == 1", + "expression": "True", + "initial": false + }, + "test_first2": { + "channels": "geds_on", + "aggregation_mode": "first_at:dsp.tp_0_est", + "expression": "True", + "initial": false + }, + "test_last": { + "channels": "geds_on", + "aggregation_mode": "last_at:dsp.tp_0_est", + "query": "evt.multiplicity == 1", + "expression": "True", + "initial": false + }, + "test_last2": { + "channels": "geds_on", + "aggregation_mode": "last_at:dsp.tp_0_est", + "expression": "True", + "initial": false + }, + "test_any": { + "channels": "geds_on", + "aggregation_mode": "any", + "query": "evt.multiplicity == 1", + "expression": "True", + "initial": false + }, + "test_any2": { + "channels": "geds_on", + "aggregation_mode": "any", + "query": "hit.cuspEmax_ctc_cal >25", + "expression": "True", + "initial": false + }, + "test_all": { + "channels": "geds_on", + "aggregation_mode": "all", + "query": "evt.multiplicity == 1", + 
"expression": "True", + "initial": false + }, + "test_all2": { + "channels": "geds_on", + "aggregation_mode": "all", + "query": "hit.cuspEmax_ctc_cal >25", + "expression": "True", + "initial": false + }, + "test_vov": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "evt.multiplicity == 1", + "expression": "True", + "initial": false + }, + "test_vov2": { + "channels": "geds_on", + "aggregation_mode": "gather", + "expression": "True", + "initial": false + } + } +} diff --git a/tests/evt/configs/vov-test-evt-config.json b/tests/evt/configs/vov-test-evt-config.json new file mode 100644 index 000000000..31334101e --- /dev/null +++ b/tests/evt/configs/vov-test-evt-config.json @@ -0,0 +1,85 @@ +{ + "channels": { + "geds_on": ["ch1084803", "ch1084804", "ch1121600"], + "ts_master": "ch1084803" + }, + "outputs": [ + "timestamp", + "energy", + "energy_sum", + "energy_id", + "energy_idx", + "aoe", + "aoe_idx", + "multiplicity", + "is_saturated", + "energy_times_aoe", + "energy_times_multiplicity", + "multiplicity_squared" + ], + "operations": { + "timestamp": { + "channels": "ts_master", + "aggregation_mode": "sum", + "expression": "dsp.timestamp", + "initial": 0.0 + }, + "energy": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal" + }, + "energy_sum": { + "channels": "geds_on", + "aggregation_mode": "sum", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "hit.cuspEmax_ctc_cal", + "initial": 0.0 + }, + "energy_idx": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.index", + "sort": "ascend_by:dsp.tp_0_est", + "initial": 0 + }, + "energy_id": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.array_id", + "sort": "ascend_by:dsp.tp_0_est", + "initial": 0 + }, + "aoe": { + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "hit.AoE_Classifier" + }, + "aoe_idx": { + "aggregation_mode": "keep_at_idx:evt.energy_idx", + "expression": "hit.AoE_Classifier" + }, + "multiplicity": { + "channels": "geds_on", + "aggregation_mode": "sum", + "expression": "hit.cuspEmax_ctc_cal > a", + "parameters": { "a": 25 }, + "initial": 0 + }, + "is_saturated": { + "aggregation_mode": "keep_at_ch:evt.energy_id", + "expression": "hit.is_saturated" + }, + "energy_times_aoe": { + "expression": "evt.energy*evt.aoe" + }, + "energy_times_multiplicity": { + "expression": "evt.energy*evt.multiplicity" + }, + "multiplicity_squared": { + "expression": "evt.multiplicity*evt.multiplicity" + } + } +} diff --git a/tests/evt/test_build_evt.py b/tests/evt/test_build_evt.py new file mode 100644 index 000000000..80a40d9a8 --- /dev/null +++ b/tests/evt/test_build_evt.py @@ -0,0 +1,315 @@ +import os +from pathlib import Path + +import awkward as ak +import numpy as np +import pytest +from lgdo import Array, VectorOfVectors, lh5 +from lgdo.lh5 import LH5Store + +from pygama.evt import build_evt + +config_dir = Path(__file__).parent / "configs" +store = LH5Store() + + +def test_basics(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + + build_evt( + f_tcm=lgnd_test_data.get_path(tcm_path), + f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), + 
f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), + evt_config=f"{config_dir}/basic-evt-config.json", + f_evt=outfile, + wo_mode="o", + evt_group="evt", + hit_group="hit", + dsp_group="dsp", + tcm_group="hardware_tcm_1", + ) + + assert "statement" in store.read("/evt/multiplicity", outfile)[0].getattrs().keys() + assert ( + store.read("/evt/multiplicity", outfile)[0].getattrs()["statement"] + == "0bb decay is real" + ) + assert os.path.exists(outfile) + assert len(lh5.ls(outfile, "/evt/")) == 11 + nda = { + e: store.read(f"/evt/{e}", outfile)[0].view_as("np") + for e in ["energy", "energy_aux", "energy_sum", "multiplicity"] + } + assert ( + nda["energy"][nda["multiplicity"] == 1] + == nda["energy_aux"][nda["multiplicity"] == 1] + ).all() + assert ( + nda["energy"][nda["multiplicity"] == 1] + == nda["energy_sum"][nda["multiplicity"] == 1] + ).all() + assert ( + nda["energy_aux"][nda["multiplicity"] == 1] + == nda["energy_sum"][nda["multiplicity"] == 1] + ).all() + + eid = store.read("/evt/energy_id", outfile)[0].view_as("np") + eidx = store.read("/evt/energy_idx", outfile)[0].view_as("np") + eidx = eidx[eidx != 999999999999] + + ids = store.read("hardware_tcm_1/array_id", lgnd_test_data.get_path(tcm_path))[ + 0 + ].view_as("np") + ids = ids[eidx] + assert ak.all(ids == eid[eid != 0]) + + +def test_lar_module(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + build_evt( + f_tcm=lgnd_test_data.get_path(tcm_path), + f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), + f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), + evt_config=f"{config_dir}/module-test-evt-config.json", + f_evt=outfile, + wo_mode="o", + evt_group="evt", + hit_group="hit", + dsp_group="dsp", + tcm_group="hardware_tcm_1", + ) + + assert os.path.exists(outfile) + assert len(lh5.ls(outfile, "/evt/")) == 10 + nda = { + e: store.read(f"/evt/{e}", outfile)[0].view_as("np") + for e in ["lar_multiplicity", "lar_multiplicity_dplms", "t0", "lar_time_shift"] + } + assert np.max(nda["lar_multiplicity"]) <= 3 + assert np.max(nda["lar_multiplicity_dplms"]) <= 3 + assert ((nda["lar_time_shift"] + nda["t0"]) >= 0).all() + + +def test_lar_t0_vov_module(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + build_evt( + f_tcm=lgnd_test_data.get_path(tcm_path), + f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), + f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), + evt_config=f"{config_dir}/module-test-t0-vov-evt-config.json", + f_evt=outfile, + wo_mode="o", + evt_group="evt", + hit_group="hit", + dsp_group="dsp", + tcm_group="hardware_tcm_1", + ) + + assert os.path.exists(outfile) + assert len(lh5.ls(outfile, "/evt/")) == 12 + nda = { + e: store.read(f"/evt/{e}", outfile)[0].view_as("np") + for e in ["lar_multiplicity", "lar_multiplicity_dplms", "lar_time_shift"] + } + assert np.max(nda["lar_multiplicity"]) <= 3 + assert np.max(nda["lar_multiplicity_dplms"]) <= 3 + + ch_idx = store.read("/evt/lar_tcm_index", outfile)[0].view_as("ak") + pls_idx = store.read("/evt/lar_pulse_index", outfile)[0].view_as("ak") + assert ak.count(ch_idx) == ak.count(pls_idx) 
+ assert ak.all(ak.count(ch_idx, axis=-1) == ak.count(pls_idx, axis=-1)) + + +def test_vov(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + build_evt( + f_tcm=lgnd_test_data.get_path(tcm_path), + f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), + f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), + evt_config=f"{config_dir}/vov-test-evt-config.json", + f_evt=outfile, + wo_mode="o", + evt_group="evt", + hit_group="hit", + dsp_group="dsp", + tcm_group="hardware_tcm_1", + ) + + assert os.path.exists(outfile) + assert len(lh5.ls(outfile, "/evt/")) == 12 + vov_ene, _ = store.read("/evt/energy", outfile) + vov_aoe, _ = store.read("/evt/aoe", outfile) + arr_ac, _ = store.read("/evt/multiplicity", outfile) + vov_aoeene, _ = store.read("/evt/energy_times_aoe", outfile) + vov_eneac, _ = store.read("/evt/energy_times_multiplicity", outfile) + arr_ac2, _ = store.read("/evt/multiplicity_squared", outfile) + assert isinstance(vov_ene, VectorOfVectors) + assert isinstance(vov_aoe, VectorOfVectors) + assert isinstance(arr_ac, Array) + assert isinstance(vov_aoeene, VectorOfVectors) + assert isinstance(vov_eneac, VectorOfVectors) + assert isinstance(arr_ac2, Array) + assert (np.diff(vov_ene.cumulative_length.nda, prepend=[0]) == arr_ac.nda).all() + + vov_eid = store.read("/evt/energy_id", outfile)[0].view_as("ak") + vov_eidx = store.read("/evt/energy_idx", outfile)[0].view_as("ak") + vov_aoe_idx = store.read("/evt/aoe_idx", outfile)[0].view_as("ak") + + ids = store.read("hardware_tcm_1/array_id", lgnd_test_data.get_path(tcm_path))[ + 0 + ].view_as("ak") + ids = ak.unflatten(ids[ak.flatten(vov_eidx)], ak.count(vov_eidx, axis=-1)) + assert ak.all(ids == vov_eid) + + arr_ene = store.read("/evt/energy_sum", outfile)[0].view_as("ak") + assert ak.all(arr_ene == ak.nansum(vov_ene.view_as("ak"), axis=-1)) + assert ak.all(vov_aoe.view_as("ak") == vov_aoe_idx) + + +def test_graceful_crashing(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + f_tcm = lgnd_test_data.get_path(tcm_path) + f_dsp = lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")) + f_hit = lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")) + f_config = f"{config_dir}/basic-evt-config.json" + + with pytest.raises(KeyError): + build_evt(f_dsp, f_tcm, f_hit, f_config, outfile) + + with pytest.raises(KeyError): + build_evt(f_tcm, f_hit, f_dsp, f_config, outfile) + + with pytest.raises(TypeError): + build_evt(f_tcm, f_dsp, f_hit, None, outfile) + + conf = {"operations": {}} + with pytest.raises(ValueError): + build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + + conf = {"channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}} + with pytest.raises(ValueError): + build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + + conf = { + "channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}, + "outputs": ["foo"], + "operations": { + "foo": { + "channels": "geds_on", + "aggregation_mode": "banana", + "expression": "hit.cuspEmax_ctc_cal > a", + "parameters": {"a": 25}, + "initial": 0, + } + }, + } + with pytest.raises(ValueError): + build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + + +def 
test_query(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + build_evt( + f_tcm=lgnd_test_data.get_path(tcm_path), + f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")), + f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")), + evt_config=f"{config_dir}/query-test-evt-config.json", + f_evt=outfile, + wo_mode="o", + evt_group="evt", + hit_group="hit", + dsp_group="dsp", + tcm_group="hardware_tcm_1", + ) + assert len(lh5.ls(outfile, "/evt/")) == 12 + + +def test_vector_sort(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + f_tcm = lgnd_test_data.get_path(tcm_path) + f_dsp = lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")) + f_hit = lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")) + + conf = { + "channels": {"geds_on": ["ch1084803", "ch1084804", "ch1121600"]}, + "outputs": ["acend_id", "t0_acend", "decend_id", "t0_decend"], + "operations": { + "acend_id": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.array_id", + "sort": "ascend_by:dsp.tp_0_est", + }, + "t0_acend": { + "aggregation_mode": "keep_at_ch:evt.acend_id", + "expression": "dsp.tp_0_est", + }, + "decend_id": { + "channels": "geds_on", + "aggregation_mode": "gather", + "query": "hit.cuspEmax_ctc_cal>25", + "expression": "tcm.array_id", + "sort": "descend_by:dsp.tp_0_est", + }, + "t0_decend": { + "aggregation_mode": "keep_at_ch:evt.decend_id", + "expression": "dsp.tp_0_est", + }, + }, + } + build_evt(f_tcm, f_dsp, f_hit, conf, outfile) + + assert os.path.exists(outfile) + assert len(lh5.ls(outfile, "/evt/")) == 4 + vov_t0, _ = store.read("/evt/t0_acend", outfile) + nda_t0 = vov_t0.to_aoesa().view_as("np") + assert ((np.diff(nda_t0) >= 0) | (np.isnan(np.diff(nda_t0)))).all() + vov_t0, _ = store.read("/evt/t0_decend", outfile) + nda_t0 = vov_t0.to_aoesa().view_as("np") + assert ((np.diff(nda_t0) <= 0) | (np.isnan(np.diff(nda_t0)))).all() + + +def test_tcm_id_table_pattern(lgnd_test_data, tmptestdir): + outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5" + tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5" + if os.path.exists(outfile): + os.remove(outfile) + f_tcm = lgnd_test_data.get_path(tcm_path) + f_dsp = lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")) + f_hit = lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")) + f_config = f"{config_dir}/basic-evt-config.json" + + with pytest.raises(ValueError): + build_evt(f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="ch{{}}") + with pytest.raises(ValueError): + build_evt(f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="ch{}{}") + with pytest.raises(NotImplementedError): + build_evt( + f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="ch{tcm_id}" + ) + with pytest.raises(ValueError): + build_evt( + f_tcm, f_dsp, f_hit, f_config, outfile, tcm_id_table_pattern="apple{}banana" + ) diff --git a/tests/evt/test_build_tcm.py b/tests/evt/test_build_tcm.py index 505196825..c0ba352e0 100644 --- a/tests/evt/test_build_tcm.py +++ 
b/tests/evt/test_build_tcm.py @@ -2,7 +2,7 @@ import lgdo import numpy as np -from lgdo import LH5Store +from lgdo import lh5 from pygama import evt @@ -11,11 +11,11 @@ def test_generate_tcm_cols(lgnd_test_data): f_raw = lgnd_test_data.get_path( "lh5/prod-ref-l200/generated/tier/raw/cal/p03/r001/l200-p03-r001-cal-20230318T012144Z-tier_raw.lh5" ) - tables = lgdo.ls(f_raw) - store = LH5Store() + tables = lh5.ls(f_raw) + store = lh5.LH5Store() coin_data = [] for tbl in tables: - ts, _ = store.read_object(f"{tbl}/raw/timestamp", f_raw) + ts, _ = store.read(f"{tbl}/raw/timestamp", f_raw) coin_data.append(ts) tcm_cols = evt.generate_tcm_cols( @@ -66,7 +66,7 @@ def test_build_tcm(lgnd_test_data, tmptestdir): wo_mode="of", ) assert os.path.exists(out_file) - store = LH5Store() - obj, n_rows = store.read_object("hardware_tcm", out_file) + store = lh5.LH5Store() + obj, n_rows = store.read("hardware_tcm", out_file) assert isinstance(obj, lgdo.Struct) assert list(obj.keys()) == ["cumulative_length", "array_id", "array_idx"] diff --git a/tests/flow/test_data_loader.py b/tests/flow/test_data_loader.py index 7236fa73a..21a0fc795 100644 --- a/tests/flow/test_data_loader.py +++ b/tests/flow/test_data_loader.py @@ -165,13 +165,13 @@ def test_setter_overwrite(test_dl): test_dl.set_cuts({"hit": "trapEmax > 5000"}) test_dl.set_output(columns=["trapEmax"]) - data = test_dl.load().get_dataframe() + data = test_dl.load().view_as("pd") test_dl.set_files("timestamp == '20230318T012144Z'") test_dl.set_datastreams([1084803, 1121600], "ch") test_dl.set_cuts({"hit": "trapEmax > 0"}) - data2 = test_dl.load().get_dataframe() + data2 = test_dl.load().view_as("pd") assert 1084804 not in data2["hit_table"] assert len(pd.unique(data2["file"])) == 1 diff --git a/tests/hit/configs/basic-hit-config.json b/tests/hit/configs/basic-hit-config.json index 0cf98137e..1946f162c 100644 --- a/tests/hit/configs/basic-hit-config.json +++ b/tests/hit/configs/basic-hit-config.json @@ -1,18 +1,25 @@ { "outputs": ["calE", "AoE", "A_max"], "operations": { - "twice_trap_e_max": { - "expression": "2 * trapEmax" + "AoE": { + "expression": "A_max/calE" }, "calE": { "expression": "sqrt(a + b * twice_trap_e_max**2)", "parameters": { "a": 1.23, "b": 42.69 + }, + "lgdo_attrs": { + "units": "keV", + "hdf5_settings": { + "compression": "gzip", + "shuffle": true + } } }, - "AoE": { - "expression": "A_max/calE" + "twice_trap_e_max": { + "expression": "2 * trapEmax" } } } diff --git a/tests/hit/test_build_hit.py b/tests/hit/test_build_hit.py index 924928da6..a0d8542c3 100644 --- a/tests/hit/test_build_hit.py +++ b/tests/hit/test_build_hit.py @@ -1,16 +1,33 @@ import os from pathlib import Path -import lgdo.lh5_store as store import numpy as np import pytest -from lgdo import LH5Store, ls +from lgdo import lh5 from pygama.hit import build_hit +from pygama.hit.build_hit import _reorder_table_operations config_dir = Path(__file__).parent / "configs" +def test_ops_reorder(): + assert list(_reorder_table_operations({}).keys()) == [] + + ops = { + "out1": {"expression": "out2 + out3 * outy"}, + "out2": {"expression": "log(out4)"}, + "out3": {"expression": "outx + 2"}, + "out4": {"expression": "outz + out3"}, + } + assert list(_reorder_table_operations(ops).keys()) == [ + "out3", + "out4", + "out2", + "out1", + ] + + def test_basics(dsp_test_file, tmptestdir): outfile = f"{tmptestdir}/LDQTA_r117_20200110T105115Z_cal_geds_hit.lh5" @@ -22,7 +39,11 @@ def test_basics(dsp_test_file, tmptestdir): ) assert os.path.exists(outfile) - assert ls(outfile, "/geds/") == 
["geds/hit"] + assert lh5.ls(outfile, "/geds/") == ["geds/hit"] + + store = lh5.LH5Store() + tbl, _ = store.read("geds/hit", outfile) + assert tbl.calE.attrs == {"datatype": "array<1>{real}", "units": "keV"} def test_illegal_arguments(dsp_test_file): @@ -57,7 +78,7 @@ def test_lh5_table_configs(dsp_test_file, tmptestdir): ) assert os.path.exists(outfile) - assert ls(outfile, "/geds/") == ["geds/hit"] + assert lh5.ls(outfile, "/geds/") == ["geds/hit"] lh5_tables_config = { "/geds/dsp": { @@ -80,7 +101,7 @@ def test_lh5_table_configs(dsp_test_file, tmptestdir): ) assert os.path.exists(outfile) - assert ls(outfile, "/geds/") == ["geds/hit"] + assert lh5.ls(outfile, "/geds/") == ["geds/hit"] def test_outputs_specification(dsp_test_file, tmptestdir): @@ -93,9 +114,9 @@ def test_outputs_specification(dsp_test_file, tmptestdir): wo_mode="overwrite", ) - store = LH5Store() - obj, _ = store.read_object("/geds/hit", outfile) - assert list(obj.keys()) == ["calE", "AoE", "A_max"] + store = lh5.LH5Store() + obj, _ = store.read("/geds/hit", outfile) + assert sorted(obj.keys()) == ["A_max", "AoE", "calE"] def test_aggregation_outputs(dsp_test_file, tmptestdir): @@ -108,8 +129,8 @@ def test_aggregation_outputs(dsp_test_file, tmptestdir): wo_mode="overwrite", ) - sto = LH5Store() - obj, _ = sto.read_object("/geds/hit", outfile) + sto = lh5.LH5Store() + obj, _ = sto.read("/geds/hit", outfile) assert list(obj.keys()) == [ "is_valid_rt", "is_valid_t0", @@ -118,11 +139,7 @@ def test_aggregation_outputs(dsp_test_file, tmptestdir): "aggr2", ] - df = store.load_dfs( - outfile, - ["is_valid_rt", "is_valid_t0", "is_valid_tmax", "aggr1", "aggr2"], - "geds/hit/", - ) + df = sto.read("geds/hit", outfile)[0].view_as("pd") # aggr1 consists of 3 bits --> max number can be 7, aggr2 consists of 2 bits so max number can be 3 assert not (df["aggr1"] > 7).any() @@ -153,9 +170,9 @@ def test_build_hit_spms_basic(dsp_test_file_spm, tmptestdir): hit_config=f"{config_dir}/spms-hit-config.json", wo_mode="overwrite_file", ) - assert ls(out_file) == ["ch0", "ch1", "ch2"] - assert ls(out_file, "ch0/") == ["ch0/hit"] - assert ls(out_file, "ch0/hit/") == [ + assert lh5.ls(out_file) == ["ch0", "ch1", "ch2"] + assert lh5.ls(out_file, "ch0/") == ["ch0/hit"] + assert lh5.ls(out_file, "ch0/hit/") == [ "ch0/hit/energy_in_pe", "ch0/hit/quality_cut", "ch0/hit/trigger_pos", @@ -171,9 +188,9 @@ def test_build_hit_spms_multiconfig(dsp_test_file_spm, tmptestdir): lh5_tables_config=f"{config_dir}/spms-hit-multi-config.json", wo_mode="overwrite", ) - assert ls(out_file) == ["ch0", "ch1", "ch2"] - assert ls(out_file, "ch0/") == ["ch0/hit"] - assert ls(out_file, "ch0/hit/") == [ + assert lh5.ls(out_file) == ["ch0", "ch1", "ch2"] + assert lh5.ls(out_file, "ch0/") == ["ch0/hit"] + assert lh5.ls(out_file, "ch0/hit/") == [ "ch0/hit/energy_in_pe", "ch0/hit/quality_cut", "ch0/hit/trigger_pos", @@ -189,22 +206,23 @@ def test_build_hit_spms_calc(dsp_test_file_spm, tmptestdir): wo_mode="overwrite_file", lh5_tables_config=f"{config_dir}/spms-hit-a-config.json", ) - assert ls(out_file) == ["ch0", "ch1", "ch2"] - assert ls(out_file, "ch0/") == ["ch0/hit"] - assert ls(out_file, "ch0/hit/") == ["ch0/hit/energy_in_pe"] - - df0 = store.load_nda(out_file, ["energy_in_pe"], "ch0/hit/") - df1 = store.load_nda(out_file, ["energy_in_pe"], "ch1/hit/") - df2 = store.load_nda(out_file, ["energy_in_pe"], "ch2/hit/") - - assert len(df0["energy_in_pe"]) == 5 - assert len(df1["energy_in_pe"]) == 5 - assert len(df2["energy_in_pe"]) == 5 - - assert len(df0["energy_in_pe"][0]) == 20 
-    assert len(df1["energy_in_pe"][0]) == 20
-    assert len(df2["energy_in_pe"][0]) == 20
-
-    assert np.nanmean(df0["energy_in_pe"]) == 0
-    assert np.nanmean(df1["energy_in_pe"]) == 1
-    assert np.nanmean(df2["energy_in_pe"]) == 2
+    assert lh5.ls(out_file) == ["ch0", "ch1", "ch2"]
+    assert lh5.ls(out_file, "ch0/") == ["ch0/hit"]
+    assert lh5.ls(out_file, "ch0/hit/") == ["ch0/hit/energy_in_pe"]
+
+    store = lh5.LH5Store()
+    df0 = store.read("ch0/hit/energy_in_pe", out_file)[0].view_as("np")
+    df1 = store.read("ch1/hit/energy_in_pe", out_file)[0].view_as("np")
+    df2 = store.read("ch2/hit/energy_in_pe", out_file)[0].view_as("np")
+
+    assert len(df0) == 5
+    assert len(df1) == 5
+    assert len(df2) == 5
+
+    assert len(df0[0]) == 20
+    assert len(df1[0]) == 20
+    assert len(df2[0]) == 20
+
+    assert np.nanmean(df0) == 0
+    assert np.nanmean(df1) == 1
+    assert np.nanmean(df2) == 2
diff --git a/tests/skm/configs/basic-skm-config.json b/tests/skm/configs/basic-skm-config.json
new file mode 100644
index 000000000..e1ffda941
--- /dev/null
+++ b/tests/skm/configs/basic-skm-config.json
@@ -0,0 +1,25 @@
+{
+  "multiplicity": 3,
+  "operations": {
+    "timestamp": {
+      "forward_field": "evt.timestamp",
+      "lgdo_attrs": { "info": "pk was here" }
+    },
+    "energy_sum": {
+      "forward_field": "evt.energy_sum"
+    },
+    "multiplicity": {
+      "forward_field": "evt.multiplicity"
+    },
+    "energy": {
+      "forward_field": "hit.cuspEmax_ctc_cal",
+      "missing_value": 0.0,
+      "tcm_idx": "evt.energy_idx"
+    },
+    "energy_id": {
+      "forward_field": "tcm.array_id",
+      "missing_value": 0,
+      "tcm_idx": "evt.energy_idx"
+    }
+  }
+}
diff --git a/tests/skm/test_build_skm.py b/tests/skm/test_build_skm.py
new file mode 100644
index 000000000..c60c460f0
--- /dev/null
+++ b/tests/skm/test_build_skm.py
@@ -0,0 +1,128 @@
+import os
+from pathlib import Path
+
+import awkward as ak
+import lgdo
+from lgdo.lh5 import LH5Store
+
+from pygama.evt import build_evt
+from pygama.skm import build_skm
+
+config_dir = Path(__file__).parent / "configs"
+evt_config_dir = Path(__file__).parent.parent / "evt" / "configs"
+store = LH5Store()
+
+
+def test_basics(lgnd_test_data, tmptestdir):
+    outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5"
+    tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5"
+    if os.path.exists(outfile):
+        os.remove(outfile)
+
+    build_evt(
+        f_tcm=lgnd_test_data.get_path(tcm_path),
+        f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")),
+        f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")),
+        evt_config=f"{evt_config_dir}/vov-test-evt-config.json",
+        f_evt=outfile,
+        wo_mode="o",
+        evt_group="evt",
+        hit_group="hit",
+        dsp_group="dsp",
+        tcm_group="hardware_tcm_1",
+    )
+
+    skm_conf = f"{config_dir}/basic-skm-config.json"
+    skm_out = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_skm.lh5"
+
+    result = build_skm(
+        outfile,
+        lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")),
+        lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")),
+        lgnd_test_data.get_path(tcm_path),
+        skm_conf,
+    )
+
+    assert isinstance(result, lgdo.Table)
+
+    build_skm(
+        outfile,
+        lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")),
+        lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")),
+        lgnd_test_data.get_path(tcm_path),
+        skm_conf,
+        skm_out,
+        wo_mode="o",
+    )
+
+    assert os.path.exists(skm_out)
+    obj, _ = store.read("/skm/", skm_out)
+
+    assert obj == result
+
+    df = obj.view_as("pd")
+    assert "timestamp" in df.keys()
+    assert "energy_0" in df.keys()
+    assert "energy_1" in df.keys()
+    assert "energy_2" in df.keys()
+    assert "energy_id_0" in df.keys()
+    assert "energy_id_1" in df.keys()
+    assert "energy_id_2" in df.keys()
+    assert "multiplicity" in df.keys()
+    assert "energy_sum" in df.keys()
+    assert (df.multiplicity.to_numpy() <= 3).all()
+    assert (
+        df.energy_0.to_numpy() + df.energy_1.to_numpy() + df.energy_2.to_numpy()
+        == df.energy_sum.to_numpy()
+    ).all()
+
+    vov_eid = ak.to_numpy(
+        ak.fill_none(
+            ak.pad_none(
+                store.read("/evt/energy_id", outfile)[0].view_as("ak"), 3, clip=True
+            ),
+            0,
+        ),
+        allow_missing=False,
+    )
+    assert (vov_eid[:, 0] == df.energy_id_0.to_numpy()).all()
+    assert (vov_eid[:, 1] == df.energy_id_1.to_numpy()).all()
+    assert (vov_eid[:, 2] == df.energy_id_2.to_numpy()).all()
+
+
+def test_attribute_passing(lgnd_test_data, tmptestdir):
+    outfile = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_evt.lh5"
+    tcm_path = "lh5/prod-ref-l200/generated/tier/tcm/phy/p03/r001/l200-p03-r001-phy-20230322T160139Z-tier_tcm.lh5"
+    if os.path.exists(outfile):
+        os.remove(outfile)
+
+    build_evt(
+        f_tcm=lgnd_test_data.get_path(tcm_path),
+        f_dsp=lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")),
+        f_hit=lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")),
+        evt_config=f"{evt_config_dir}/vov-test-evt-config.json",
+        f_evt=outfile,
+        wo_mode="o",
+        evt_group="evt",
+        hit_group="hit",
+        dsp_group="dsp",
+        tcm_group="hardware_tcm_1",
+    )
+
+    skm_conf = f"{config_dir}/basic-skm-config.json"
+
+    skm_out = f"{tmptestdir}/l200-p03-r001-phy-20230322T160139Z-tier_skm.lh5"
+
+    build_skm(
+        outfile,
+        lgnd_test_data.get_path(tcm_path.replace("tcm", "hit")),
+        lgnd_test_data.get_path(tcm_path.replace("tcm", "dsp")),
+        lgnd_test_data.get_path(tcm_path),
+        skm_conf,
+        f_skm=skm_out,
+        wo_mode="o",
+    )
+
+    assert os.path.exists(skm_out)
+    assert "info" in store.read("/skm/timestamp", skm_out)[0].getattrs().keys()
+    assert store.read("/skm/timestamp", skm_out)[0].getattrs()["info"] == "pk was here"