From ec55d3c4d9476e2a5dd50963857f221ba0607175 Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Mon, 6 Nov 2023 10:45:49 -0800
Subject: [PATCH 01/12] improve performance of read_object with idx by reading
 in whole object, then slicing

---
 src/lgdo/lh5_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index dfdfa87f..2fe80f36 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -741,7 +741,7 @@ def read_object(
                 tmp_shape = (0,) + h5f[name].shape[1:]
                 nda = np.empty(tmp_shape, h5f[name].dtype)
             else:
-                nda = h5f[name][source_sel]
+                nda = h5f[name][...][source_sel]
 
         # special handling for bools
        # (c and Julia store as uint8 so cast to bool)
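The one-line change above is the core idea of the whole series: h5py fancy indexing (``dset[idx]``) turns a point selection into many small HDF5 reads, while ``dset[...]`` performs one bulk read that NumPy can then index in memory. A minimal, self-contained sketch of the comparison (file name, dataset name and sizes are made up for illustration, not taken from the patches):

    import time

    import h5py
    import numpy as np

    # create a throwaway file with one large dataset
    with h5py.File("example.lh5", "w") as f:
        f.create_dataset("data", data=np.random.rand(1_000_000))

    # h5py point selections must be unique and increasing
    idx = np.sort(np.random.choice(1_000_000, size=10_000, replace=False))

    with h5py.File("example.lh5", "r") as f:
        t0 = time.time()
        slow = f["data"][idx]        # fancy indexing done by HDF5: many small reads
        t1 = time.time()
        fast = f["data"][...][idx]   # one bulk read, then NumPy indexing in memory
        t2 = time.time()

    assert np.array_equal(slow, fast)
    print(f"h5py idx: {t1 - t0:.3f} s, bulk read + NumPy idx: {t2 - t1:.3f} s")

The trade-off, which the later patches in this series address, is that the bulk read temporarily holds the whole dataset in memory.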
From 11ea312d1b30920733bc7a85c0a44fc1051cc1cc Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Mon, 6 Nov 2023 14:53:42 -0800
Subject: [PATCH 02/12] add idx read features

---
 src/lgdo/lh5_store.py | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 2fe80f36..16765d6d 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -169,13 +169,23 @@ def read_object(
         start_row: int = 0,
         n_rows: int = sys.maxsize,
         idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+        use_h5idx: bool = False,
         field_mask: dict[str, bool] | list[str] | tuple[str] = None,
         obj_buf: LGDO = None,
         obj_buf_start: int = 0,
-        decompress: bool = True,
+        decompress: bool = True, 
     ) -> tuple[LGDO, int]:
         """Read LH5 object data from a file.
 
+        Individual rows of data may be read by passing the ``idx`` parameter.
+        However, reading individual rows is often much slower than reading the whole
+        object and then indexing the desired rows. This behavior may be controlled with
+        the ``use_h5idx`` flag, where the default behavior is to use more memory for a
+        much faster read. Note that passing an ``obj_buf`` object will ignore the
+        ``use_h5idx`` flag and suffer a speed penalty. See
+        [legend-pydataobj #29](https://github.com/legend-exp/legend-pydataobj/issues/29) 
+        for additional information.
+
         Parameters
         ----------
         name
@@ -201,7 +211,17 @@ def read_object(
             identical read). If used in conjunction with `start_row` and
             `n_rows`, will be sliced to obey those constraints, where
             `n_rows` is interpreted as the (max) number of *selected* values (in `idx`) to be
-            read out.
+            read out. See
+        use_h5idx
+            ``True`` will directly pass the ``idx`` parameter to the underlying
+            ``h5py`` call such that only the selected rows are read into memory,
+            which conserves memory at the cost of speed.
+            ``False`` (default) will read the entire object into memory before
+            performing the indexing. The default is much faster (1-2 orders of
+            magnitude) but requires additional memory, though a relatively small
+            amount in the typical use case. Note that this option is ignored if
+            ``obj_buf`` is passed, which will read directly and therefore suffer
+            a speed penalty.
         field_mask
             For tables and structs, determines which fields get written out.
             Only applies to immediate fields of the requested objects. If a dict
@@ -214,7 +234,9 @@ def read_object(
         obj_buf
             Read directly into memory provided in `obj_buf`. Note: the buffer
             will be expanded to accommodate the data requested. To maintain the
-            buffer length, send in ``n_rows = len(obj_buf)``.
+            buffer length, send in ``n_rows = len(obj_buf)``. Note that passing
+            this parameter will ignore the ``use_h5idx`` flag and suffer a speed
+            penalty if also passing ``idx``.
         obj_buf_start
             Start location in ``obj_buf`` for read. For concatenating data to
             array-like objects.
@@ -223,6 +245,7 @@ def read_object(
             after reading. The option has no effect on data encoded with HDF5
             built-in filters, which is always decompressed upstream by HDF5.
 
+
         Returns
         -------
         (object, n_rows_read)
@@ -741,7 +764,10 @@ def read_object(
                 tmp_shape = (0,) + h5f[name].shape[1:]
                 nda = np.empty(tmp_shape, h5f[name].dtype)
             else:
-                nda = h5f[name][...][source_sel]
+                if (use_h5idx):
+                    nda = h5f[name][source_sel]
+                else:
+                    nda = h5f[name][...][source_sel]
 
         # special handling for bools
         # (c and Julia store as uint8 so cast to bool)
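How the new ``use_h5idx`` flag routes the read, reduced to a standalone helper (the function and its name are illustrative, not part of the library):

    import h5py
    import numpy as np

    def read_rows(dset: h5py.Dataset, idx, use_h5idx: bool = False) -> np.ndarray:
        """Read selected rows of an HDF5 dataset."""
        if use_h5idx:
            # memory-frugal: HDF5 itself selects the rows on disk
            return dset[idx]
        # fast default: one bulk read, then NumPy fancy indexing in memory
        return dset[...][idx]

With ``use_h5idx=True`` the peak memory footprint is only the selection; with the default ``use_h5idx=False`` it is the full dataset plus the selection, in exchange for a much faster read.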
From de4574b793546610d88ca549009a826b46fb6f33 Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Tue, 7 Nov 2023 21:49:58 -0800
Subject: [PATCH 03/12] add np.copy to fix multiple file reads

---
 src/lgdo/lh5_store.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 16765d6d..f722acdb 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -284,6 +284,7 @@ def read_object(
                 start_row=start_row,
                 n_rows=n_rows_i,
                 idx=idx_i,
+                use_h5idx=use_h5idx,
                 field_mask=field_mask,
                 obj_buf=obj_buf,
                 obj_buf_start=obj_buf_start,
@@ -381,6 +382,7 @@ def read_object(
                     start_row=start_row,
                     n_rows=n_rows,
                     idx=idx,
+                    use_h5idx=use_h5idx,
                     decompress=decompress,
                 )
                 # modify datatype in attrs if a field_mask was used
@@ -427,6 +429,7 @@ def read_object(
                     start_row=start_row,
                     n_rows=n_rows,
                     idx=idx,
+                    use_h5idx=use_h5idx,
                     obj_buf=fld_buf,
                     obj_buf_start=obj_buf_start,
                     decompress=decompress,
@@ -596,6 +599,7 @@ def read_object(
                 start_row=start_row,
                 n_rows=n_rows,
                 idx=idx,
+                use_h5idx=use_h5idx,
                 obj_buf=cumulen_buf,
                 obj_buf_start=obj_buf_start,
             )
@@ -620,6 +624,7 @@ def read_object(
                 start_row=start_row,
                 n_rows=n_rows,
                 idx=idx2,
+                use_h5idx=use_h5idx,
             )
             fd_starts = fd_starts.nda  # we just need the nda
             if fd_start is None:
@@ -702,6 +707,7 @@ def read_object(
                 start_row=fd_start,
                 n_rows=fd_n_rows,
                 idx=fd_idx,
+                use_h5idx=use_h5idx,
                 obj_buf=fd_buf,
                 obj_buf_start=fd_buf_start,
             )
@@ -757,7 +763,13 @@ def read_object(
             if len(obj_buf) < buf_size:
                 obj_buf.resize(buf_size)
             dest_sel = np.s_[obj_buf_start:buf_size]
-            h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+
+            # until better solution found
+            if (use_h5idx):
+                h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+            else:
+                obj_buf.nda[dest_sel] = np.copy(h5f[name][...][source_sel])
+
             nda = obj_buf.nda
         else:
             if n_rows == 0:
@@ -767,7 +779,7 @@ def read_object(
                 if (use_h5idx):
                     nda = h5f[name][source_sel]
                 else:
-                    nda = h5f[name][...][source_sel]
+                    nda = np.copy(h5f[name][...][source_sel])
 
         # special handling for bools
         # (c and Julia store as uint8 so cast to bool)
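The ``np.copy`` calls added above guard against a NumPy pitfall: basic slicing returns a view, and a view cannot be resized in place, which is exactly what happens when the buffer is later grown to append rows from a second file. A minimal demonstration of the failure mode (run as a script; ``ndarray.resize`` also refuses when other references to the array exist):

    import numpy as np

    base = np.arange(10)
    view = base[2:8]  # basic slicing returns a view into base's buffer

    try:
        view.resize(12)  # in-place resize of a view is illegal
    except ValueError as err:
        print(err)  # "cannot resize this array: it does not own its data"

    owned = np.copy(base[2:8])
    owned.resize(12, refcheck=False)  # fine: the copy owns its buffer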
From 3677539fdcfeaf834e67665f9286e8bfdd2c68ec Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Tue, 7 Nov 2023 21:51:37 -0800
Subject: [PATCH 04/12] add some comments

---
 src/lgdo/lh5_store.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index f722acdb..e9201902 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -764,7 +764,8 @@ def read_object(
                 obj_buf.resize(buf_size)
             dest_sel = np.s_[obj_buf_start:buf_size]
 
-            # until better solution found
+            # this is required to make the read of multiple files faster
+            # until better solution found.
             if (use_h5idx):
                 h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
             else:
@@ -779,6 +780,10 @@ def read_object(
                 if (use_h5idx):
                     nda = h5f[name][source_sel]
                 else:
+                    # a copy is made in case this is given to an obj_buf that
+                    # then needs to be resized. A view is returned by the
+                    # source_sel indexing, which cannot be resized by ndarray.resize().
+                    # This occurs in particular when multiple files are being read.
                     nda = np.copy(h5f[name][...][source_sel])
 
         # special handling for bools
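For reference, the ``read_direct`` branch kept above avoids any intermediate array by letting HDF5 write straight into a preallocated NumPy buffer. A hedged sketch of that call pattern (file and dataset names reuse the earlier made-up illustration):

    import h5py
    import numpy as np

    with h5py.File("example.lh5", "w") as f:
        f.create_dataset("data", data=np.arange(10_000.0))

    with h5py.File("example.lh5", "r") as f:
        dset = f["data"]
        buf = np.empty(1000, dtype=dset.dtype)
        # copy rows 100..1099 of the dataset into buf[0:1000], no temporary array
        dset.read_direct(buf, np.s_[100:1100], np.s_[0:1000])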
From 18d5455bfc9258dea6538a2c49b2bce8659e2282 Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Wed, 8 Nov 2023 15:54:13 -0800
Subject: [PATCH 05/12] improve implementation w/ idx to restore speed

---
 src/lgdo/lh5_store.py | 84 +++++++++++++++++++++++++++++--------------
 1 file changed, 57 insertions(+), 27 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index e9201902..5ba35ca1 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -177,12 +177,13 @@ def read_object(
     ) -> tuple[LGDO, int]:
         """Read LH5 object data from a file.
 
-        Individual rows of data may be read by passing the ``idx`` parameter.
-        However, reading individual rows is often much slower than reading the whole
-        object and then indexing the desired rows. This behavior may be controlled with
-        the ``use_h5idx`` flag, where the default behavior is to use more memory for a
-        much faster read. Note that passing an ``obj_buf`` object will ignore the
-        ``use_h5idx`` flag and suffer a speed penalty. See
+        Use the``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+        controls whether *only* those rows are read from disk or if the rows are indexed after reading 
+        the entire object. Reading individual rows can be orders of magnitude slower than reading 
+        the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
+        is to use slightly more memory for a much faster read. Note that there is approximately a 2x 
+        penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
+        the whole object in without the ``idx`` parameter. See 
         [legend-pydataobj #29](https://github.com/legend-exp/legend-pydataobj/issues/29) 
         for additional information.
 
@@ -202,26 +203,30 @@ def read_object(
             actual number of rows read will be returned as one of the return
             values (see below).
         idx
-            For NumPy-style "fancying indexing" for the read. Used to read out
-            rows that pass some selection criteria. Only selection along the first
-            axis is supported, so tuple arguments must be one-tuples. If `n_rows`
-            is not false, `idx` will be truncated to `n_rows` before reading. To use
-            with a list of files, can pass in a list of `idx`'s (one for each
-            file) or use a long contiguous list (e.g. built from a previous
+            For NumPy-style "fancying indexing" for the read to select only some
+            rows, e.g. after applying some cuts to particular columns. 
+            Only selection along the first axis is supported, so tuple arguments 
+            must be one-tuples. If `n_rows` is not false, `idx` will be truncated to 
+            `n_rows` before reading. To use with a list of files, can pass in a list of 
+            `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
             identical read). If used in conjunction with `start_row` and
             `n_rows`, will be sliced to obey those constraints, where
             `n_rows` is interpreted as the (max) number of *selected* values (in `idx`) to be
-            read out. See
+            read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
+            read and that the default behavior (``use_h5idx=False``) prioritizes speed over
+            a small memory penalty. Note also that there is approximately a 2x 
+            penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
+            the whole object in without the ``idx`` parameter.
         use_h5idx
-            ``True`` will directly pass the ``idx`` parameter to the underlying
-            ``h5py`` call such that only the selected rows are read into memory,
-            which conserves memory at the cost of speed.
+            ``True`` will directly pass the ``idx`` parameter to the underlying 
+            ``h5py`` call such that only the selected rows are read directly into memory,
+            which conserves memory at the cost of speed. There can be a significant penalty
+            to speed for larger files (1 - 2 orders of magnitude longer time).
             ``False`` (default) will read the entire object into memory before
             performing the indexing. The default is much faster (1-2 orders of
-            magnitude) but requires additional memory, though a relatively small
-            amount in the typical use case. Note that this option is ignored if
-            ``obj_buf`` is passed, which will read directly and therefore suffer
-            a speed penalty.
+            magnitude) but requires additional memory, though a relatively small 
+            amount in the typical use case. It is recommended to leave this parameter as
+            its default.
         field_mask
             For tables and structs, determines which fields get written out.
             Only applies to immediate fields of the requested objects. If a dict
@@ -259,6 +264,14 @@ def read_object(
         if not isinstance(lh5_file, (str, h5py.File)):
             lh5_file = list(lh5_file)
             n_rows_read = 0
+
+            # to know whether we are reading in a list of files.
+            # this is part of the fix for reading data by idx
+            # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
+            # so that we only make a copy of the data if absolutely necessary 
+            # or if we can read the data from file without having to make a copy
+            self.in_file_loop = True
+
            for i, h5f in enumerate(lh5_file):
                 if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
                     # a list of lists: must be one per file
@@ -286,6 +299,11 @@ def read_object(
                 else:
                     idx_i = None
                 n_rows_i = n_rows - n_rows_read
+                
+                # maybe someone passed in a list of len==1?
+                if i == (len(lh5_file) - 1):
+                    self.in_file_loop = False
+
                 obj_buf, n_rows_read_i = self.read_object(
                     name,
                     lh5_file[i],
@@ -301,11 +319,15 @@ def read_object(
                     obj_buf_start=obj_buf_start,
                     decompress=decompress,
                 )
+
                 n_rows_read += n_rows_read_i
                 if n_rows_read >= n_rows or obj_buf is None:
                     return obj_buf, n_rows_read
                 start_row = 0
                 obj_buf_start += n_rows_read_i
+
+            self.in_file_loop = False
+
             return obj_buf, n_rows_read
 
         # get the file from the store
@@ -523,6 +545,7 @@ def read_object(
                 start_row=start_row,
                 n_rows=n_rows,
                 idx=idx,
+                use_h5idx=use_h5idx,
                 obj_buf=None if decompress else decoded_size_buf,
                 obj_buf_start=0 if decompress else obj_buf_start,
             )
@@ -534,6 +557,7 @@ def read_object(
                 start_row=start_row,
                 n_rows=n_rows,
                 idx=idx,
+                use_h5idx=use_h5idx,
                 obj_buf=None if decompress else encoded_data_buf,
                 obj_buf_start=0 if decompress else obj_buf_start,
             )
@@ -765,26 +789,32 @@ def read_object(
             dest_sel = np.s_[obj_buf_start:buf_size]
 
             # this is required to make the read of multiple files faster
-            # until better solution found.
-            if (use_h5idx):
+            # until a better solution found. 
+            if idx is None or use_h5idx:
                 h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
             else:
-                obj_buf.nda[dest_sel] = np.copy(h5f[name][...][source_sel])
+                # it is faster to read the whole object and then do fancy indexing
+                obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
 
             nda = obj_buf.nda
         else:
             if n_rows == 0:
                 tmp_shape = (0,) + h5f[name].shape[1:]
                 nda = np.empty(tmp_shape, h5f[name].dtype)
             else:
-                if (use_h5idx):
+                if idx is None or use_h5idx:
                     nda = h5f[name][source_sel]
                 else:
-                    # a copy is made in case this is given to an obj_buf that
-                    # then needs to be resized. A view is returned by the
-                    # source_sel indexing, which cannot be resized by ndarray.resize().
-                    # This occurs in particular when multiple files are being read.
-                    nda = np.copy(h5f[name][...][source_sel])
+                    # it is faster to read the whole object and then do fancy indexing
+                    nda = h5f[name][...][source_sel]
+
+                    # if reading a list of files recursively, this is given to obj_buf on
+                    # the first file read. obj_buf needs to be resized and therefore 
+                    # it needs to hold the data itself (not a view of the data). 
+                    # a view is returned by the source_sel indexing, which cannot be resized 
+                    # by ndarray.resize().
+                    if hasattr(self, 'in_file_loop') and self.in_file_loop:
+                        nda = np.copy(nda) 
 
         # special handling for bools
         # (c and Julia store as uint8 so cast to bool)
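Usage sketch for the multi-file case this patch optimizes: reading a row selection across several files with one index list per file. The file and object names below are placeholders, and ``LH5Store`` is assumed to be the class these ``read_object`` methods belong to:

    import numpy as np
    from lgdo.lh5_store import LH5Store

    store = LH5Store()
    files = ["run0.lh5", "run1.lh5"]               # placeholder file names
    idx = [np.array([0, 5, 9]), np.array([2, 3])]  # one selection per file

    # fast default path: whole-object reads plus in-memory indexing,
    # with copies made only while looping over the file list
    obj, n_rows_read = store.read_object("geds/raw", files, idx=idx)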
From 47cb717d55cef18b776885889f7242f30f109a26 Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Wed, 8 Nov 2023 15:56:26 -0800
Subject: [PATCH 06/12] 2x -> x2

---
 src/lgdo/lh5_store.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 5ba35ca1..91c6ac45 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -181,7 +181,7 @@ def read_object(
         controls whether *only* those rows are read from disk or if the rows are indexed after reading 
         the entire object. Reading individual rows can be orders of magnitude slower than reading 
         the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
-        is to use slightly more memory for a much faster read. Note that there is approximately a 2x 
+        is to use slightly more memory for a much faster read. Note that there is approximately a x2 
        penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
         the whole object in without the ``idx`` parameter. See 
         [legend-pydataobj #29](https://github.com/legend-exp/legend-pydataobj/issues/29) 
@@ -214,7 +214,7 @@ def read_object(
             `n_rows` is interpreted as the (max) number of *selected* values (in `idx`) to be
             read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
             read and that the default behavior (``use_h5idx=False``) prioritizes speed over
-            a small memory penalty. Note also that there is approximately a 2x 
+            a small memory penalty. Note also that there is approximately a x2 
             penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
             the whole object in without the ``idx`` parameter.
         use_h5idx
From f5026ecebfb64cdaf26b30557dbfcbcae00bf2fd Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Wed, 8 Nov 2023 15:57:02 -0800
Subject: [PATCH 07/12] update obj_buf doc

---
 src/lgdo/lh5_store.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 91c6ac45..5e3e8782 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -239,9 +239,7 @@ def read_object(
         obj_buf
             Read directly into memory provided in `obj_buf`. Note: the buffer
             will be expanded to accommodate the data requested. To maintain the
-            buffer length, send in ``n_rows = len(obj_buf)``. Note that passing
-            this parameter will ignore the ``use_h5idx`` flag and suffer a speed
-            penalty if also passing ``idx``.
+            buffer length, send in ``n_rows = len(obj_buf)``.
         obj_buf_start
             Start location in ``obj_buf`` for read. For concatenating data to
             array-like objects.
From d4a8975d346c2d93bd13d004d7bb6c33398a4770 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 9 Nov 2023 00:59:18 +0000
Subject: [PATCH 08/12] style: pre-commit fixes

---
 src/lgdo/lh5_store.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 5e3e8782..9d87694a 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -173,18 +173,18 @@ def read_object(
         field_mask: dict[str, bool] | list[str] | tuple[str] = None,
         obj_buf: LGDO = None,
         obj_buf_start: int = 0,
-        decompress: bool = True, 
+        decompress: bool = True,
     ) -> tuple[LGDO, int]:
         """Read LH5 object data from a file.
 
         Use the``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
-        controls whether *only* those rows are read from disk or if the rows are indexed after reading 
-        the entire object. Reading individual rows can be orders of magnitude slower than reading 
+        controls whether *only* those rows are read from disk or if the rows are indexed after reading
+        the entire object. Reading individual rows can be orders of magnitude slower than reading
         the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
-        is to use slightly more memory for a much faster read. Note that there is approximately a x2 
+        is to use slightly more memory for a much faster read. Note that there is approximately a x2
         penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
-        the whole object in without the ``idx`` parameter. See 
-        [legend-pydataobj #29](https://github.com/legend-exp/legend-pydataobj/issues/29) 
+        the whole object in without the ``idx`` parameter. See
+        [legend-pydataobj #29](https://github.com/legend-exp/legend-pydataobj/issues/29)
         for additional information.
 
         Parameters
@@ -203,27 +203,27 @@ def read_object(
             actual number of rows read will be returned as one of the return
             values (see below).
         idx
             For NumPy-style "fancying indexing" for the read to select only some
-            rows, e.g. after applying some cuts to particular columns. 
-            Only selection along the first axis is supported, so tuple arguments 
-            must be one-tuples. If `n_rows` is not false, `idx` will be truncated to 
-            `n_rows` before reading. To use with a list of files, can pass in a list of 
+            rows, e.g. after applying some cuts to particular columns.
+            Only selection along the first axis is supported, so tuple arguments
+            must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
+            `n_rows` before reading. To use with a list of files, can pass in a list of
             `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
             identical read). If used in conjunction with `start_row` and
             `n_rows`, will be sliced to obey those constraints, where
             `n_rows` is interpreted as the (max) number of *selected* values (in `idx`) to be
             read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
             read and that the default behavior (``use_h5idx=False``) prioritizes speed over
-            a small memory penalty. Note also that there is approximately a x2 
+            a small memory penalty. Note also that there is approximately a x2
             penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
             the whole object in without the ``idx`` parameter.
         use_h5idx
-            ``True`` will directly pass the ``idx`` parameter to the underlying 
+            ``True`` will directly pass the ``idx`` parameter to the underlying
             ``h5py`` call such that only the selected rows are read directly into memory,
             which conserves memory at the cost of speed. There can be a significant penalty
             to speed for larger files (1 - 2 orders of magnitude longer time).
             ``False`` (default) will read the entire object into memory before
             performing the indexing. The default is much faster (1-2 orders of
-            magnitude) but requires additional memory, though a relatively small 
+            magnitude) but requires additional memory, though a relatively small
             amount in the typical use case. It is recommended to leave this parameter as
             its default.
@@ -266,7 +266,7 @@ def read_object(
             # to know whether we are reading in a list of files.
             # this is part of the fix for reading data by idx
             # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
-            # so that we only make a copy of the data if absolutely necessary 
+            # so that we only make a copy of the data if absolutely necessary
             # or if we can read the data from file without having to make a copy
             self.in_file_loop = True
 
@@ -289,7 +289,7 @@ def read_object(
                 else:
                     idx_i = None
                 n_rows_i = n_rows - n_rows_read
-                
+
                 # maybe someone passed in a list of len==1?
                 if i == (len(lh5_file) - 1):
                     self.in_file_loop = False
@@ -785,7 +785,7 @@ def read_object(
             if len(obj_buf) < buf_size:
                 obj_buf.resize(buf_size)
             dest_sel = np.s_[obj_buf_start:buf_size]
-            
+
             # this is required to make the read of multiple files faster
             # until a better solution found.
             if idx is None or use_h5idx:
@@ -807,12 +807,12 @@ def read_object(
                     nda = h5f[name][...][source_sel]
 
                     # if reading a list of files recursively, this is given to obj_buf on
-                    # the first file read. obj_buf needs to be resized and therefore 
-                    # it needs to hold the data itself (not a view of the data). 
-                    # a view is returned by the source_sel indexing, which cannot be resized 
+                    # the first file read. obj_buf needs to be resized and therefore
+                    # it needs to hold the data itself (not a view of the data).
+                    # a view is returned by the source_sel indexing, which cannot be resized
                     # by ndarray.resize().
-                    if hasattr(self, 'in_file_loop') and self.in_file_loop:
-                        nda = np.copy(nda) 
+                    if hasattr(self, "in_file_loop") and self.in_file_loop:
+                        nda = np.copy(nda)
 
         # special handling for bools
         # (c and Julia store as uint8 so cast to bool)
From b8e5c6a32be5e5639283ea9b4b88f519434787ca Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Wed, 8 Nov 2023 17:06:35 -0800
Subject: [PATCH 09/12] fix docstring

---
 src/lgdo/lh5_store.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 9d87694a..1f0ab7df 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -177,14 +177,14 @@ def read_object(
     ) -> tuple[LGDO, int]:
         """Read LH5 object data from a file.
 
-        Use the``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+        Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
         controls whether *only* those rows are read from disk or if the rows are indexed after reading
         the entire object. Reading individual rows can be orders of magnitude slower than reading
         the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
         is to use slightly more memory for a much faster read. Note that there is approximately a x2
         penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
-        the whole object in without the ``idx`` parameter. See
-        [legend-pydataobj #29](https://github.com/legend-exp/legend-pydataobj/issues/29)
+        the whole object in without the ``idx`` parameter. See 
+        `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
         for additional information.
 
         Parameters

From 0963d99d8fed7a5b983a068e5be98960b94fcafe Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 9 Nov 2023 01:06:53 +0000
Subject: [PATCH 10/12] style: pre-commit fixes

---
 src/lgdo/lh5_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 1f0ab7df..2d5644e0 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -183,7 +183,7 @@ def read_object(
         the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
         is to use slightly more memory for a much faster read. Note that there is approximately a x2
         penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
-        the whole object in without the ``idx`` parameter. See 
+        the whole object in without the ``idx`` parameter. See
         `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
         for additional information.
 
From 7ca1e69220105f69f133759f6975d0b3df76d17e Mon Sep 17 00:00:00 2001
From: lvarriano
Date: Thu, 9 Nov 2023 08:02:17 -0800
Subject: [PATCH 11/12] convert idx to slice if possible

---
 src/lgdo/lh5_store.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 2d5644e0..8eeaaf1f 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -181,9 +181,7 @@ def read_object(
         controls whether *only* those rows are read from disk or if the rows are indexed after reading
         the entire object. Reading individual rows can be orders of magnitude slower than reading
         the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
-        is to use slightly more memory for a much faster read. Note that there is approximately a x2
-        penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
-        the whole object in without the ``idx`` parameter. See
+        is to use slightly more memory for a much faster read. See
         `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
         for additional information.
 
@@ -212,19 +210,16 @@ def read_object(
             `n_rows` is interpreted as the (max) number of *selected* values (in `idx`) to be
             read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
             read and that the default behavior (``use_h5idx=False``) prioritizes speed over
-            a small memory penalty. Note also that there is approximately a x2
-            penalty to speed if ``idx`` contains all of the rows of the object, as opposed to just reading
-            the whole object in without the ``idx`` parameter.
+            a small memory penalty.
         use_h5idx
             ``True`` will directly pass the ``idx`` parameter to the underlying
             ``h5py`` call such that only the selected rows are read directly into memory,
             which conserves memory at the cost of speed. There can be a significant penalty
             to speed for larger files (1 - 2 orders of magnitude longer time).
             ``False`` (default) will read the entire object into memory before
-            performing the indexing. The default is much faster (1-2 orders of
-            magnitude) but requires additional memory, though a relatively small
-            amount in the typical use case. It is recommended to leave this parameter as
-            its default.
+            performing the indexing. The default is much faster but requires additional memory, 
+            though a relatively small amount in the typical use case. It is recommended to 
+            leave this parameter as its default.
         field_mask
             For tables and structs, determines which fields get written out.
             Only applies to immediate fields of the requested objects. If a dict
@@ -768,11 +763,24 @@ def read_object(
             if n_rows_to_read > n_rows:
                 n_rows_to_read = n_rows
 
+            # if idx is passed, check if we can make it a slice instead (faster)
+            change_idx_to_slice = False
+
             # prepare the selection for the read. Use idx if available
             if idx is not None:
-                source_sel = idx
+                # check if idx is empty and convert to slice instead
+                if len(idx[0]) == 0:
+                    source_sel = np.s_[0 : 0]
+                    change_idx_to_slice = True
+                # check if idx is contiguous and increasing
+                # if so, convert it to a slice instead (faster)
+                elif np.all(np.diff(idx[0]) == 1):
+                    source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+                    change_idx_to_slice = True
+                else:
+                    source_sel = idx
             else:
                 source_sel = np.s_[start_row : start_row + n_rows_to_read]
-
+            
             # Now read the array
             if obj_buf is not None and n_rows_to_read > 0:
                 buf_size = obj_buf_start + n_rows_to_read
@@ -793,7 +801,7 @@ def read_object(
 
             # this is required to make the read of multiple files faster
             # until a better solution found.
-            if idx is None or use_h5idx:
+            if change_idx_to_slice or idx is None or use_h5idx:
                 h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
             else:
                 # it is faster to read the whole object and then do fancy indexing
@@ -805,7 +813,7 @@ def read_object(
                 tmp_shape = (0,) + h5f[name].shape[1:]
                 nda = np.empty(tmp_shape, h5f[name].dtype)
             else:
-                if idx is None or use_h5idx:
+                if change_idx_to_slice or idx is None or use_h5idx:
                     nda = h5f[name][source_sel]
                 else:
                     # it is faster to read the whole object and then do fancy indexing

From 9d43392f175d930f573181f21b3627171970e691 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 9 Nov 2023 16:02:39 +0000
Subject: [PATCH 12/12] style: pre-commit fixes

---
 src/lgdo/lh5_store.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 8eeaaf1f..d0fef051 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -219,8 +219,8 @@ def read_object(
             which conserves memory at the cost of speed. There can be a significant penalty
             to speed for larger files (1 - 2 orders of magnitude longer time).
             ``False`` (default) will read the entire object into memory before
-            performing the indexing. The default is much faster but requires additional memory, 
-            though a relatively small amount in the typical use case. It is recommended to 
+            performing the indexing. The default is much faster but requires additional memory,
+            though a relatively small amount in the typical use case. It is recommended to
             leave this parameter as its default.
         field_mask
             For tables and structs, determines which fields get written out.
@@ -775,7 +775,7 @@ def read_object(
             if idx is not None:
                 # check if idx is empty and convert to slice instead
                 if len(idx[0]) == 0:
-                    source_sel = np.s_[0 : 0]
+                    source_sel = np.s_[0:0]
                     change_idx_to_slice = True
                 # check if idx is contiguous and increasing
                 # if so, convert it to a slice instead (faster)
@@ -786,7 +786,7 @@ def read_object(
                 else:
                     source_sel = idx
             else:
                 source_sel = np.s_[start_row : start_row + n_rows_to_read]
-            
+
             # Now read the array
             if obj_buf is not None and n_rows_to_read > 0:
                 buf_size = obj_buf_start + n_rows_to_read
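The contiguity check introduced by the last two patches, isolated as a small helper. This is a simplified sketch: here ``idx`` is a plain array rather than the one-tuple the library handles, and the helper name is made up:

    import numpy as np

    def idx_to_source_sel(idx: np.ndarray):
        """Return a slice equivalent to idx when possible, else idx itself."""
        if len(idx) == 0:
            return np.s_[0:0]  # empty selection
        if np.all(np.diff(idx) == 1):
            # contiguous and increasing -> replace with a slice
            return np.s_[int(idx[0]) : int(idx[-1]) + 1]
        return idx  # fall back to fancy indexing

    print(idx_to_source_sel(np.array([4, 5, 6, 7])))  # slice(4, 8, None)
    print(idx_to_source_sel(np.array([1, 3, 8])))     # [1 3 8]

Slices map onto HDF5 hyperslab selections, which is why they read far faster than a point selection covering the same rows.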