From a9219e12c03d89f0d3719f9f41a169771015e2b6 Mon Sep 17 00:00:00 2001 From: Peter Yoachim Date: Tue, 22 Aug 2023 16:40:20 -0700 Subject: [PATCH 1/2] bug fix on oned slicer --- rubin_sim/maf/slicers/one_d_slicer.py | 30 +++++++++++++-------------- tests/maf/test_onedslicer.py | 25 +++------------------- 2 files changed, 18 insertions(+), 37 deletions(-) diff --git a/rubin_sim/maf/slicers/one_d_slicer.py b/rubin_sim/maf/slicers/one_d_slicer.py index 7d454d34b..54841817a 100644 --- a/rubin_sim/maf/slicers/one_d_slicer.py +++ b/rubin_sim/maf/slicers/one_d_slicer.py @@ -1,5 +1,3 @@ -# oneDSlicer - slices based on values in one data column in sim_data. - __all__ = ("OneDSlicer",) import warnings @@ -102,14 +100,14 @@ def setup_slicer(self, sim_data, maps=None): "A safer choice is to use a separate OneDSlicer for each MetricBundle." ) warnings.warn(warning_msg) - sliceCol = sim_data[self.slice_col_name] + slice_col = sim_data[self.slice_col_name] # Set bins from data or specified values, if they were previously defined. if self.bins is None: # Set bin min/max values (could have been set in __init__) if self.bin_min is None: - self.bin_min = np.nanmin(sliceCol) + self.bin_min = np.nanmin(slice_col) if self.bin_max is None: - self.bin_max = np.nanmax(sliceCol) + self.bin_max = np.nanmax(slice_col) # Give warning if bin_min = bin_max, and do something at least slightly reasonable. if self.bin_min == self.bin_max: warnings.warn( @@ -121,7 +119,7 @@ def setup_slicer(self, sim_data, maps=None): else: self.bin_max = self.bin_max + 1 if self.bin_size is None: - bins = optimal_bins(sliceCol, self.bin_min, self.bin_max) + bins = optimal_bins(slice_col, self.bin_min, self.bin_max) nbins = np.round(bins) self.bin_size = (self.bin_max - self.bin_min) / float(nbins) # Set bins @@ -136,20 +134,22 @@ def setup_slicer(self, sim_data, maps=None): self.slice_points["bins"] = self.bins # Add metadata from map if needed. self._run_maps(maps) - # Set up data slicing. - self.sim_idxs = np.argsort(sim_data[self.slice_col_name]) - simFieldsSorted = np.sort(sim_data[self.slice_col_name]) - # "left" values are location where simdata[i-1] < bins <= simdata[i] - self.left = np.searchsorted(simFieldsSorted, self.bins, "left") - # "right" values are locations where simdata[i-1] <= bins < simdata[i] - # This lets us have a closed final bin - self.right = np.searchsorted(simFieldsSorted, self.bins, "right") + + indxs = np.argsort(sim_data[self.slice_col_name]) + data_sorted = sim_data[self.slice_col_name][indxs] + + # Setting up slices such that left_edge <= data < right_edge + # in each slice. + left = np.searchsorted(data_sorted, self.bins[0:-1], "left") + right = np.searchsorted(data_sorted, self.bins[1:], "left") + + self.sim_idxs = [indxs[le:ri] for le, ri in zip(left, right)] # Set up _slice_sim_data method for this class. @wraps(self._slice_sim_data) def _slice_sim_data(islice): """Slice sim_data on oneD sliceCol, to return relevant indexes for slice_point.""" - idxs = self.sim_idxs[self.left[islice] : self.right[islice + 1]] + idxs = self.sim_idxs[islice] bin_left = self.bins[islice] bin_right = self.bins[islice + 1] return { diff --git a/tests/maf/test_onedslicer.py b/tests/maf/test_onedslicer.py index 0addfb3a4..48a8d3f20 100644 --- a/tests/maf/test_onedslicer.py +++ b/tests/maf/test_onedslicer.py @@ -9,11 +9,11 @@ from rubin_sim.maf.slicers import OneDSlicer, UniSlicer -def make_data_values(size=100, min=0.0, max=1.0, random=-1): +def make_data_values(size=100, min_val=0.0, max_val=1.0, random=-1): """Generate a simple array of numbers, evenly arranged between min/max, but (optional) random order.""" datavalues = np.arange(0, size, dtype="float") - datavalues *= (float(max) - float(min)) / (datavalues.max() - datavalues.min()) - datavalues += min + datavalues *= (float(max_val) - float(min_val)) / (datavalues.max() - datavalues.min()) + datavalues += min_val if random > 0: rng = np.random.RandomState(random) randorder = rng.rand(size) @@ -209,25 +209,6 @@ def test_slicing(self): nbins = len(bins) - 1 # Test that testbinner raises appropriate error before it's set up (first time) self.assertRaises(NotImplementedError, self.testslicer._slice_sim_data, 0) - # test that random values are split equally into all bins (when bins and data cover same range) - for nvalues in (1000, 10000, 100000): - dv = make_data_values(nvalues, dvmin, dvmax, random=560) - self.testslicer = OneDSlicer(slice_col_name="testdata", bins=bins) - self.testslicer.setup_slicer(dv) - sum = 0 - for i, s in enumerate(self.testslicer): - idxs = s["idxs"] - data_slice = dv["testdata"][idxs] - sum += len(idxs) - if len(data_slice) > 0: - self.assertEqual(len(data_slice), nvalues / float(nbins)) - else: - self.assertGreater( - len(data_slice), - 0, - msg="Data in test case expected to always be > 0 len after slicing", - ) - self.assertTrue(sum, nvalues) # test that we're not dumping extra visits into the 'last bin' (that it's closed both sides) nvalues = 1000 dv = make_data_values(nvalues, 0, 2, random=560) From f1e73d803d14f126624d30717acca8237aa19aec Mon Sep 17 00:00:00 2001 From: Lynne Jones Date: Tue, 22 Aug 2023 20:18:13 -0700 Subject: [PATCH 2/2] add specific 'night' column test --- rubin_sim/maf/slicers/one_d_slicer.py | 2 +- tests/maf/test_onedslicer.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/rubin_sim/maf/slicers/one_d_slicer.py b/rubin_sim/maf/slicers/one_d_slicer.py index 54841817a..b42316dd9 100644 --- a/rubin_sim/maf/slicers/one_d_slicer.py +++ b/rubin_sim/maf/slicers/one_d_slicer.py @@ -139,7 +139,7 @@ def setup_slicer(self, sim_data, maps=None): data_sorted = sim_data[self.slice_col_name][indxs] # Setting up slices such that left_edge <= data < right_edge - # in each slice. + # in each slice. left = np.searchsorted(data_sorted, self.bins[0:-1], "left") right = np.searchsorted(data_sorted, self.bins[1:], "left") diff --git a/tests/maf/test_onedslicer.py b/tests/maf/test_onedslicer.py index 48a8d3f20..588c29197 100644 --- a/tests/maf/test_onedslicer.py +++ b/tests/maf/test_onedslicer.py @@ -206,7 +206,6 @@ def test_slicing(self): dvmax = 1 stepsize = 0.01 bins = np.arange(dvmin, dvmax + stepsize / 2, stepsize) - nbins = len(bins) - 1 # Test that testbinner raises appropriate error before it's set up (first time) self.assertRaises(NotImplementedError, self.testslicer._slice_sim_data, 0) # test that we're not dumping extra visits into the 'last bin' (that it's closed both sides) @@ -219,6 +218,24 @@ def test_slicing(self): for i, s in enumerate(self.testslicer): self.assertEqual(len(s["idxs"]), nvalues / float(nbins) / 2.0) + def test_night_slicing(self): + """Test slicing with 'night' column""" + numval = 1000 + vals = np.arange(0, numval, 1) / (numval / 50) + nights = np.floor(vals) + np.random.shuffle(nights) + # If you were doing by-hand comparison, this can be useful + # bins = np.arange(0, 50.5, 1) + # vals, bb = np.histogram(nights, bins=bins) + vals = make_data_values(len(nights), random=10) + vals = np.array(list(zip(nights, vals["testdata"])), dtype=[("night", "int"), ("testdata", "float")]) + test_slicer = OneDSlicer(slice_col_name="night", bins=np.arange(0, 50.5, 1)) + test_slicer.setup_slicer(vals) + for s in test_slicer: + self.assertTrue(vals[s["idxs"]]["night"].min() >= s["slice_point"]["bin_left"]) + self.assertTrue(vals[s["idxs"]]["night"].max() < s["slice_point"]["bin_right"]) + self.assertEqual(len(s["idxs"]), numval / 50) + if __name__ == "__main__": unittest.main()