From e2cec5cc6ec3e196c721f3e517cd2fbb5d3d0682 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:02:45 +0200
Subject: [PATCH 1/5] Add option to only produce bins for observed values

---
 fast_carpenter/summary/binned_dataframe.py | 10 ++++---
 tests/summary/test_binned_dataframe.py     | 34 +++++++++++++---------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/fast_carpenter/summary/binned_dataframe.py b/fast_carpenter/summary/binned_dataframe.py
index b7f3666..cd1839f 100644
--- a/fast_carpenter/summary/binned_dataframe.py
+++ b/fast_carpenter/summary/binned_dataframe.py
@@ -161,7 +161,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
 
     """
 
-    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None):
+    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None, observed=False):
         self.name = name
         self.out_dir = out_dir
         ins, outs, binnings = cfg.create_binning_list(self.name, binning)
@@ -173,6 +173,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_m
         self._weights = cfg.create_weights(self.name, weights)
         self._pad_missing = pad_missing
         self._file_format = cfg.create_file_format(self.name, file_format)
+        self._observed = observed
         self.contents = None
 
     def collector(self):
@@ -202,7 +203,8 @@ def event(self, chunk):
                                     binnings=self._binnings,
                                     weights=weights,
                                     out_weights=self._weights.keys(),
-                                    out_dimensions=self._out_bin_dims)
+                                    out_dimensions=self._out_bin_dims,
+                                    observed=self._observed)
         if self.contents is None:
             self.contents = binned_values
         else:
@@ -228,7 +230,7 @@ def _make_column_labels(weights):
     return [count_label] + labels
 
 
-def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None):
+def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None, observed=True):
     if not out_dimensions:
         out_dimensions = dimensions
     if not out_weights:
@@ -247,7 +249,7 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
         weight_sq_dims = [w + "_squared" for w in weights]
         data[weight_sq_dims] = data[weights] ** 2
 
-    bins = data.groupby(final_bin_dims)
+    bins = data.groupby(final_bin_dims, observed=observed)
     counts = bins[data.columns[0]].count()
 
     if weights:
diff --git a/tests/summary/test_binned_dataframe.py b/tests/summary/test_binned_dataframe.py
index 5fa0bae..87e5df9 100644
--- a/tests/summary/test_binned_dataframe.py
+++ b/tests/summary/test_binned_dataframe.py
@@ -86,6 +86,14 @@ def test_BinnedDataframe_run_data(binned_df_2, tmpdir, infile):
     chunk = FakeBEEvent(infile, "data")
     binned_df_2.event(chunk)
 
+    collector = binned_df_2.collector()
+    dataset_readers_list = (("test_dataset", (binned_df_2,)),)
+    results = collector._prepare_output(dataset_readers_list)
+
+    totals = results.sum()
+    # Based on: events->Draw("Jet_Py", "", "goff")
+    assert totals["n"] == 4616
+
 
 def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
     chunk = FakeBEEvent(infile, "mc")
@@ -107,10 +115,11 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
     assert totals["EventWeight:sumw"] == pytest.approx(231.91339 * 2)
 
 
-@pytest.fixture
-def run_twice_data_mc(config_1, infile):
+@pytest.fixture #(scope="function")
+def run_twice_data_mc(config_1, infile, observed):
     chunk_mc = FakeBEEvent(infile, "mc")
     chunk_data = FakeBEEvent(infile, "data")
+    config_1["observed"] = observed
 
     binned_dfs = [make_binned_df_1(config_1) for _ in range(4)]
     binned_dfs[0].event(chunk_mc)
@@ -122,9 +131,11 @@ def run_twice_data_mc(config_1, infile):
             ("test_data", (binned_dfs[2], binned_dfs[3])))
 
 
+@pytest.mark.skipif(int(pd.__version__.split(".")[0]) < 1, reason="requires Pandas 1.0 or higher")
 @pytest.mark.parametrize("dataset_col", [True, False])
 @pytest.mark.parametrize("pad_missing", [True, False])
-def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing):
+@pytest.mark.parametrize("observed", [True, False])
+def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing, observed):
     binned_df_1, dataset_readers_list = run_twice_data_mc
     binned_df_1._pad_missing = pad_missing
     binned_df_1._dataset_col = dataset_col
@@ -132,16 +143,13 @@ def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_m
     results = collector._prepare_output(dataset_readers_list)
 
     assert results.index.nlevels == 2 + int(dataset_col)
-    if tuple(map(int, pd.__version__.split("."))) >= (1, 0, 0):
-        length = (4 * 31) * (1 + int(dataset_col))
-    else:
-        # Pre Pandas 1.0.0 the following lengths were needed.
-        if pad_missing or not dataset_col:
-            length = (4 * 31) * (1 + int(dataset_col))
-        else:
-            length = None
-    if length:
-        assert len(results) == length
+    if pad_missing or not observed:
+        length = (4 * 31)
+    elif observed:
+        length = 111
+
+    length *= 1 + int(dataset_col)
+    assert len(results) == length
 
     totals = results.sum()
     # Based on: events->Draw("Jet_Py", "", "goff")

From b8c2da6a5ebee0827c141d75c1e08fad8edecf81 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:04:38 +0200
Subject: [PATCH 2/5] Update CHANGELOG

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b79f4d..f14d471 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.17.5] - 2020-04-03
+### Changed
+- Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)
+
 ## [0.17.4] - 2020-03-12
 ### Changed
 - `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)

From 1df11c5a6935612e868c5560df5dd86415a63046 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:04:47 +0200
Subject: [PATCH 3/5] =?UTF-8?q?Bump=20version:=200.17.4=20=E2=86=92=200.17?=
 =?UTF-8?q?.5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fast_carpenter/version.py | 2 +-
 setup.cfg                 | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fast_carpenter/version.py b/fast_carpenter/version.py
index 67ce98a..593d24c 100644
--- a/fast_carpenter/version.py
+++ b/fast_carpenter/version.py
@@ -12,5 +12,5 @@ def split_version(version):
     return tuple(result)
 
 
-__version__ = '0.17.4'
+__version__ = '0.17.5'
 version_info = split_version(__version__)  # noqa
diff --git a/setup.cfg b/setup.cfg
index 4dd7d39..46f1718 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.17.4
+current_version = 0.17.5
 commit = True
 tag = False
 
@@ -18,3 +18,4 @@ test = pytest
 
 [tool:pytest]
 collect_ignore = ['setup.py']
+

From c7d9d10ae7bd627844adddac32860daf01635c15 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:08:45 +0200
Subject: [PATCH 4/5] Fix up the docs

---
 fast_carpenter/summary/binned_dataframe.py | 8 +++++++-
 tests/summary/test_binned_dataframe.py     | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fast_carpenter/summary/binned_dataframe.py b/fast_carpenter/summary/binned_dataframe.py
index cd1839f..a3df678 100644
--- a/fast_carpenter/summary/binned_dataframe.py
+++ b/fast_carpenter/summary/binned_dataframe.py
@@ -150,6 +150,11 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
        excluded from the stored dataframe.  Leaving this ``False`` can save
        some disk-space and improve processing time, particularly if the bins
        are only very sparsely filled.
+      observed (bool): If ``True``, bins in the dataframe will only be filled
+        if there are datapoints contained within them.  Otherwise, depending on
+        the binning specification for each dimension, all bins for that
+        dimension will be present.  Use `pad_missing: true` to force all bins
+        to be present.
 
     Other Parameters:
       name (str): The name of this stage (handled automatically by fast-flow)
@@ -161,7 +166,8 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
 
     """
 
-    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None, observed=False):
+    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True,
+                 pad_missing=False, file_format=None, observed=False):
         self.name = name
         self.out_dir = out_dir
         ins, outs, binnings = cfg.create_binning_list(self.name, binning)
diff --git a/tests/summary/test_binned_dataframe.py b/tests/summary/test_binned_dataframe.py
index 87e5df9..b49a821 100644
--- a/tests/summary/test_binned_dataframe.py
+++ b/tests/summary/test_binned_dataframe.py
@@ -115,7 +115,7 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
     assert totals["EventWeight:sumw"] == pytest.approx(231.91339 * 2)
 
 
-@pytest.fixture #(scope="function")
+@pytest.fixture
 def run_twice_data_mc(config_1, infile, observed):
     chunk_mc = FakeBEEvent(infile, "mc")
     chunk_data = FakeBEEvent(infile, "data")

From 1afbc865f2ebdb98c6e89666f20e9a044668aac4 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:35:36 +0200
Subject: [PATCH 5/5] Pin mantichora as a dependency of alphatwirl

---
 CHANGELOG.md | 5 ++++-
 setup.py     | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f14d471..cabf4e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ## [0.17.5] - 2020-04-03
-### Changed
+### Added
 - Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)
 
+### Changed
+- Pin the version of the Mantichora package that AlphaTwirl depends on
+
 ## [0.17.4] - 2020-03-12
 ### Changed
 - `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)
diff --git a/setup.py b/setup.py
index a9c3a89..1595b6d 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,8 @@ def get_version():
     return _globals["__version__"]
 
 
-requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'fast-flow', 'fast-curator', 'awkward',
+requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'mantichora==0.9.7',
+                'fast-flow', 'fast-curator', 'awkward',
                 'pandas', 'numpy', 'numba', 'numexpr', 'uproot>=3']
 
 repositories = []
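
A note on the pandas behaviour these patches expose: the speed boost comes from the
`observed` keyword of `DataFrame.groupby`, which, for categorical groupers such as
the binned dimensions built by this stage, skips empty category combinations instead
of materialising every bin. The standalone sketch below illustrates the effect; it is
not part of the patch series, and the column names and binning are invented purely
for illustration.

    import numpy as np
    import pandas as pd

    # Toy data clustered near zero, binned over a much wider range so that
    # most of the 100 bins are empty.
    rng = np.random.default_rng(seed=1)
    data = pd.DataFrame({"x": rng.normal(0, 1, size=1000),
                         "weight": rng.uniform(size=1000)})
    data["x_bin"] = pd.cut(data["x"], bins=np.linspace(-50, 50, 101))

    # observed=False materialises every categorical bin, populated or not.
    dense = data.groupby("x_bin", observed=False)["weight"].sum()

    # observed=True keeps only bins that contain at least one row, which is
    # what BinnedDataframe now forwards to groupby when `observed` is enabled.
    sparse = data.groupby("x_bin", observed=True)["weight"].sum()

    print(len(dense), len(sparse))  # 100 bins vs. only the handful that are populated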