Skip to content

Commit

Permalink
Merge pull request #118 from FAST-HEP/BK_add_observed_option
Browse files Browse the repository at this point in the history
Add `observed` option for speed with many bins
  • Loading branch information
benkrikler authored Apr 3, 2020
2 parents e889c12 + 1afbc86 commit 8fa33c0
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 19 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.17.5] - 2020-04-03
### Added
- Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)

### Changed
- Pin the version for the Mantichora package that AlphaTwirl depends on

## [0.17.4] - 2020-03-12
### Changed
- `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)
Expand Down
16 changes: 12 additions & 4 deletions fast_carpenter/summary/binned_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
excluded from the stored dataframe. Leaving this ``False`` can save
some disk-space and improve processing time, particularly if the bins are
only very sparsely filled.
observed (bool): If ``True`` bins in the dataframe will only be filled
if there are datapoints contained within them. Otherwise, depending on
the binning specification for each dimension, all bins for that
dimension will be present. Use `pad_missing: true` to force all bins
to be present.
Other Parameters:
name (str): The name of this stage (handled automatically by fast-flow)
Expand All @@ -161,7 +166,8 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
"""

def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None):
def __init__(self, name, out_dir, binning, weights=None, dataset_col=True,
pad_missing=False, file_format=None, observed=False):
self.name = name
self.out_dir = out_dir
ins, outs, binnings = cfg.create_binning_list(self.name, binning)
Expand All @@ -173,6 +179,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_m
self._weights = cfg.create_weights(self.name, weights)
self._pad_missing = pad_missing
self._file_format = cfg.create_file_format(self.name, file_format)
self._observed = observed
self.contents = None

def collector(self):
Expand Down Expand Up @@ -202,7 +209,8 @@ def event(self, chunk):
binnings=self._binnings,
weights=weights,
out_weights=self._weights.keys(),
out_dimensions=self._out_bin_dims)
out_dimensions=self._out_bin_dims,
observed=self._observed)
if self.contents is None:
self.contents = binned_values
else:
Expand All @@ -228,7 +236,7 @@ def _make_column_labels(weights):
return [count_label] + labels


def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None):
def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None, observed=True):
if not out_dimensions:
out_dimensions = dimensions
if not out_weights:
Expand All @@ -247,7 +255,7 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
weight_sq_dims = [w + "_squared" for w in weights]
data[weight_sq_dims] = data[weights] ** 2

bins = data.groupby(final_bin_dims)
bins = data.groupby(final_bin_dims, observed=observed)
counts = bins[data.columns[0]].count()

if weights:
Expand Down
2 changes: 1 addition & 1 deletion fast_carpenter/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ def split_version(version):
return tuple(result)


__version__ = '0.17.4'
__version__ = '0.17.5'
version_info = split_version(__version__) # noqa
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.17.4
current_version = 0.17.5
commit = True
tag = False

Expand All @@ -18,3 +18,4 @@ test = pytest

[tool:pytest]
collect_ignore = ['setup.py']

3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def get_version():
return _globals["__version__"]


requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'fast-flow', 'fast-curator', 'awkward',
requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'mantichora==0.9.7',
'fast-flow', 'fast-curator', 'awkward',
'pandas', 'numpy', 'numba', 'numexpr', 'uproot>=3']
repositories = []

Expand Down
32 changes: 20 additions & 12 deletions tests/summary/test_binned_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ def test_BinnedDataframe_run_data(binned_df_2, tmpdir, infile):
chunk = FakeBEEvent(infile, "data")
binned_df_2.event(chunk)

collector = binned_df_2.collector()
dataset_readers_list = (("test_dataset", (binned_df_2,)),)
results = collector._prepare_output(dataset_readers_list)

totals = results.sum()
# Based on: events->Draw("Jet_Py", "", "goff")
assert totals["n"] == 4616


def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
chunk = FakeBEEvent(infile, "mc")
Expand All @@ -108,9 +116,10 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):


@pytest.fixture
def run_twice_data_mc(config_1, infile):
def run_twice_data_mc(config_1, infile, observed):
chunk_mc = FakeBEEvent(infile, "mc")
chunk_data = FakeBEEvent(infile, "data")
config_1["observed"] = observed

binned_dfs = [make_binned_df_1(config_1) for _ in range(4)]
binned_dfs[0].event(chunk_mc)
Expand All @@ -122,26 +131,25 @@ def run_twice_data_mc(config_1, infile):
("test_data", (binned_dfs[2], binned_dfs[3])))


@pytest.mark.skipif(int(pd.__version__.split(".")[0]) < 1, reason="requires Pandas 1.0 or higher")
@pytest.mark.parametrize("dataset_col", [True, False])
@pytest.mark.parametrize("pad_missing", [True, False])
def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing):
@pytest.mark.parametrize("observed", [True, False])
def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing, observed):
binned_df_1, dataset_readers_list = run_twice_data_mc
binned_df_1._pad_missing = pad_missing
binned_df_1._dataset_col = dataset_col
collector = binned_df_1.collector()
results = collector._prepare_output(dataset_readers_list)

assert results.index.nlevels == 2 + int(dataset_col)
if tuple(map(int, pd.__version__.split("."))) >= (1, 0, 0):
length = (4 * 31) * (1 + int(dataset_col))
else:
# Pre Pandas 1.0.0 the following lengths were needed.
if pad_missing or not dataset_col:
length = (4 * 31) * (1 + int(dataset_col))
else:
length = None
if length:
assert len(results) == length
if pad_missing or not observed:
length = (4 * 31)
elif observed:
length = 111

length *= 1 + int(dataset_col)
assert len(results) == length

totals = results.sum()
# Based on: events->Draw("Jet_Py", "", "goff")
Expand Down

0 comments on commit 8fa33c0

Please sign in to comment.