From e2cec5cc6ec3e196c721f3e517cd2fbb5d3d0682 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:02:45 +0200
Subject: [PATCH 1/5] Add option to only produce bins for observed values

---
 fast_carpenter/summary/binned_dataframe.py | 10 ++++---
 tests/summary/test_binned_dataframe.py     | 34 +++++++++++++---------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/fast_carpenter/summary/binned_dataframe.py b/fast_carpenter/summary/binned_dataframe.py
index b7f3666..cd1839f 100644
--- a/fast_carpenter/summary/binned_dataframe.py
+++ b/fast_carpenter/summary/binned_dataframe.py
@@ -161,7 +161,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
 
     """
 
-    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None):
+    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None, observed=False):
         self.name = name
         self.out_dir = out_dir
         ins, outs, binnings = cfg.create_binning_list(self.name, binning)
@@ -173,6 +173,7 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_m
         self._weights = cfg.create_weights(self.name, weights)
         self._pad_missing = pad_missing
         self._file_format = cfg.create_file_format(self.name, file_format)
+        self._observed = observed
         self.contents = None
 
     def collector(self):
@@ -202,7 +203,8 @@ def event(self, chunk):
                                     binnings=self._binnings,
                                     weights=weights,
                                     out_weights=self._weights.keys(),
-                                    out_dimensions=self._out_bin_dims)
+                                    out_dimensions=self._out_bin_dims,
+                                    observed=self._observed)
         if self.contents is None:
             self.contents = binned_values
         else:
@@ -228,7 +230,7 @@ def _make_column_labels(weights):
     return [count_label] + labels
 
 
-def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None):
+def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_weights=None, observed=True):
     if not out_dimensions:
         out_dimensions = dimensions
     if not out_weights:
@@ -247,7 +249,7 @@ def _bin_values(data, dimensions, binnings, weights, out_dimensions=None, out_we
         weight_sq_dims = [w + "_squared" for w in weights]
         data[weight_sq_dims] = data[weights] ** 2
 
-    bins = data.groupby(final_bin_dims)
+    bins = data.groupby(final_bin_dims, observed=observed)
     counts = bins[data.columns[0]].count()
 
     if weights:
diff --git a/tests/summary/test_binned_dataframe.py b/tests/summary/test_binned_dataframe.py
index 5fa0bae..87e5df9 100644
--- a/tests/summary/test_binned_dataframe.py
+++ b/tests/summary/test_binned_dataframe.py
@@ -86,6 +86,14 @@ def test_BinnedDataframe_run_data(binned_df_2, tmpdir, infile):
     chunk = FakeBEEvent(infile, "data")
     binned_df_2.event(chunk)
 
+    collector = binned_df_2.collector()
+    dataset_readers_list = (("test_dataset", (binned_df_2,)),)
+    results = collector._prepare_output(dataset_readers_list)
+
+    totals = results.sum()
+    # Based on: events->Draw("Jet_Py", "", "goff")
+    assert totals["n"] == 4616
+
 
 def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
     chunk = FakeBEEvent(infile, "mc")
@@ -107,10 +115,11 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
     assert totals["EventWeight:sumw"] == pytest.approx(231.91339 * 2)
 
 
-@pytest.fixture
-def run_twice_data_mc(config_1, infile):
+@pytest.fixture #(scope="function")
+def run_twice_data_mc(config_1, infile, observed):
     chunk_mc = FakeBEEvent(infile, "mc")
     chunk_data = FakeBEEvent(infile, "data")
+    config_1["observed"] = observed
 
     binned_dfs = [make_binned_df_1(config_1) for _ in range(4)]
     binned_dfs[0].event(chunk_mc)
@@ -122,9 +131,11 @@ def run_twice_data_mc(config_1, infile):
             ("test_data", (binned_dfs[2], binned_dfs[3])))
 
 
+@pytest.mark.skipif(int(pd.__version__.split(".")[0]) < 1, reason="requires Pandas 1.0 or higher")
 @pytest.mark.parametrize("dataset_col", [True, False])
 @pytest.mark.parametrize("pad_missing", [True, False])
-def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing):
+@pytest.mark.parametrize("observed", [True, False])
+def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_missing, observed):
     binned_df_1, dataset_readers_list = run_twice_data_mc
     binned_df_1._pad_missing = pad_missing
     binned_df_1._dataset_col = dataset_col
@@ -132,16 +143,13 @@ def test_binneddataframe_run_twice_data_mc(run_twice_data_mc, dataset_col, pad_m
     results = collector._prepare_output(dataset_readers_list)
 
     assert results.index.nlevels == 2 + int(dataset_col)
-    if tuple(map(int, pd.__version__.split("."))) >= (1, 0, 0):
-        length = (4 * 31) * (1 + int(dataset_col))
-    else:
-        # Pre Pandas 1.0.0 the following lengths were needed.
-        if pad_missing or not dataset_col:
-            length = (4 * 31) * (1 + int(dataset_col))
-        else:
-            length = None
-    if length:
-        assert len(results) == length
+    if pad_missing or not observed:
+        length = (4 * 31)
+    elif observed:
+        length = 111
+
+    length *= 1 + int(dataset_col)
+    assert len(results) == length
 
     totals = results.sum()
     # Based on: events->Draw("Jet_Py", "", "goff")

From b8c2da6a5ebee0827c141d75c1e08fad8edecf81 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:04:38 +0200
Subject: [PATCH 2/5] Update CHANGELOG

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b79f4d..f14d471 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.17.5] - 2020-04-03
+### Changed
+- Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)
+
 ## [0.17.4] - 2020-03-12
 ### Changed
 - `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)

From 1df11c5a6935612e868c5560df5dd86415a63046 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:04:47 +0200
Subject: [PATCH 3/5] =?UTF-8?q?Bump=20version:=200.17.4=20=E2=86=92=200.17?=
 =?UTF-8?q?.5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fast_carpenter/version.py | 2 +-
 setup.cfg                 | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fast_carpenter/version.py b/fast_carpenter/version.py
index 67ce98a..593d24c 100644
--- a/fast_carpenter/version.py
+++ b/fast_carpenter/version.py
@@ -12,5 +12,5 @@ def split_version(version):
     return tuple(result)
 
 
-__version__ = '0.17.4'
+__version__ = '0.17.5'
 version_info = split_version(__version__)  # noqa
diff --git a/setup.cfg b/setup.cfg
index 4dd7d39..46f1718 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.17.4
+current_version = 0.17.5
 commit = True
 tag = False
 
@@ -18,3 +18,4 @@ test = pytest
 
 [tool:pytest]
 collect_ignore = ['setup.py']
+

From c7d9d10ae7bd627844adddac32860daf01635c15 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:08:45 +0200
Subject: [PATCH 4/5] Fix up the docs

---
 fast_carpenter/summary/binned_dataframe.py | 8 +++++++-
 tests/summary/test_binned_dataframe.py     | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fast_carpenter/summary/binned_dataframe.py b/fast_carpenter/summary/binned_dataframe.py
index cd1839f..a3df678 100644
--- a/fast_carpenter/summary/binned_dataframe.py
+++ b/fast_carpenter/summary/binned_dataframe.py
@@ -150,6 +150,11 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
        excluded from the stored dataframe.  Leaving this ``False`` can save
        some disk-space and improve processing time, particularly if the bins
        are only very sparsely filled.
+      observed (bool): If ``True``, bins in the dataframe will only be filled
+        if there are datapoints contained within them.  Otherwise, depending on
+        the binning specification for each dimension, all bins for that
+        dimension will be present.  Use `pad_missing: true` to force all bins
+        to be present.
 
     Other Parameters:
       name (str): The name of this stage (handled automatically by fast-flow)
@@ -161,7 +166,8 @@ def __init__(self, name, out_dir, binning, weights=None, dataset_col=False):
 
     """
 
-    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True, pad_missing=False, file_format=None, observed=False):
+    def __init__(self, name, out_dir, binning, weights=None, dataset_col=True,
+                 pad_missing=False, file_format=None, observed=False):
         self.name = name
         self.out_dir = out_dir
         ins, outs, binnings = cfg.create_binning_list(self.name, binning)
diff --git a/tests/summary/test_binned_dataframe.py b/tests/summary/test_binned_dataframe.py
index 87e5df9..b49a821 100644
--- a/tests/summary/test_binned_dataframe.py
+++ b/tests/summary/test_binned_dataframe.py
@@ -115,7 +115,7 @@ def test_BinnedDataframe_run_twice(binned_df_1, tmpdir, infile):
     assert totals["EventWeight:sumw"] == pytest.approx(231.91339 * 2)
 
 
-@pytest.fixture #(scope="function")
+@pytest.fixture
 def run_twice_data_mc(config_1, infile, observed):
     chunk_mc = FakeBEEvent(infile, "mc")
     chunk_data = FakeBEEvent(infile, "data")

From 1afbc865f2ebdb98c6e89666f20e9a044668aac4 Mon Sep 17 00:00:00 2001
From: Ben Krikler
Date: Fri, 3 Apr 2020 19:35:36 +0200
Subject: [PATCH 5/5] Pin mantichora as a dependency of alphatwirl

---
 CHANGELOG.md | 5 ++++-
 setup.py     | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f14d471..cabf4e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ## [0.17.5] - 2020-04-03
-### Changed
+### Added
 - Add `observed` option to BinnedDataframe for speed boost with many bins, PR #118 [@BenKrikler](https://github.com/benkrikler)
 
+### Changed
+- Pin the version of the Mantichora package that AlphaTwirl depends on
+
 ## [0.17.4] - 2020-03-12
 ### Changed
 - `pad_missing` was replacing bin contents when set to True, PR #116 [@BenKrikler](https://github.com/benkrikler)
diff --git a/setup.py b/setup.py
index a9c3a89..1595b6d 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,8 @@ def get_version():
     return _globals["__version__"]
 
 
-requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'fast-flow', 'fast-curator', 'awkward',
+requirements = ['atuproot==0.1.13', 'atsge==0.2.1', 'mantichora==0.9.7',
+                'fast-flow', 'fast-curator', 'awkward',
                 'pandas', 'numpy', 'numba', 'numexpr', 'uproot>=3']
 
 repositories = []
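
A note on the pandas behaviour these patches expose: the speed boost comes from the
`observed` keyword of `DataFrame.groupby`, which, for categorical groupers such as
the binned dimensions built by this stage, skips empty category combinations instead
of materialising every bin. The standalone sketch below illustrates the effect; it is
not part of the patch series, and the column names and binning are invented purely
for illustration.

    import numpy as np
    import pandas as pd

    # Toy data clustered near zero, binned over a much wider range so that
    # most of the 100 bins are empty.
    rng = np.random.default_rng(seed=1)
    data = pd.DataFrame({"x": rng.normal(0, 1, size=1000),
                         "weight": rng.uniform(size=1000)})
    data["x_bin"] = pd.cut(data["x"], bins=np.linspace(-50, 50, 101))

    # observed=False materialises every categorical bin, populated or not.
    dense = data.groupby("x_bin", observed=False)["weight"].sum()

    # observed=True keeps only bins that contain at least one row, which is
    # what BinnedDataframe now forwards to groupby when `observed` is enabled.
    sparse = data.groupby("x_bin", observed=True)["weight"].sum()

    print(len(dense), len(sparse))  # 100 bins vs. only the handful that are populated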