Externallinkfix (#153)
* Show linked chunk counts

* support hdf5:// prefix in external link refs

* updates for numpy deprecation errors
jreadey authored Sep 28, 2023
1 parent ca1f8ae commit 0f10aa1
Showing 24 changed files with 165 additions and 344 deletions.
50 changes: 39 additions & 11 deletions h5pyd/_apps/hsls.py
@@ -145,32 +145,60 @@ def dump(name, obj, visited=None):
         if isinstance(obj.id.layout, dict):
             # H5D_CHUNKED_REF layout
             chunk_dims = obj.id.layout["dims"]
-            storage_desc = "Storage " + obj.id.layout["class"]
+            obj_layout = obj.id.layout["class"]
         else:
             chunk_dims = obj.chunks
-            storage_desc = "Storage H5D_CHUNKED"
-        for chunk_dim in chunk_dims:
+            obj_layout = "H5D_CHUNKED"
+        storage_desc = f"Storage {obj_layout}"
+        max_chunk_count = 1
+        rank = len(obj.shape)
+        for i in range(rank):
+            extent = obj.shape[i]
+            chunk_dim = chunk_dims[i]
             chunk_size *= chunk_dim
+            max_chunk_count *= -(-extent // chunk_dim)
         dset_size = obj.dtype.itemsize
         for dim_extent in obj.shape:
             dset_size *= dim_extent

-        num_chunks = obj.num_chunks
-        allocated_size = obj.allocated_size
+        if obj_layout == "H5D_CHUNKED_REF_INDIRECT":
+            chunk_table_id = obj.id.layout["chunk_table"]
+            chunk_table = obj.file[f"datasets/{chunk_table_id}"]
+            num_chunks = int(np.prod(chunk_table.shape))
+            chunk_table_elements = chunk_table[...].reshape((num_chunks,))
+            num_linked_chunks = 0
+            allocated_size = 0
+            for e in chunk_table_elements:
+                chunk_offset = e[0]
+                chunk_size = e[1]
+                if chunk_offset > 0 and chunk_size > 0:
+                    num_linked_chunks += 1
+                    allocated_size += chunk_size
+            num_chunks = num_linked_chunks
+            chunk_type = "linked"
+
+        else:
+            num_chunks = obj.num_chunks
+            allocated_size = obj.allocated_size
+            chunk_type = "allocated"
+
         if num_chunks is not None and allocated_size is not None:
-            fstr = " {0:>32}: {1} {2} bytes, {3} allocated chunks"
-            print(fstr.format("Chunks", chunk_dims, intToStr(chunk_size),
-                  intToStr(num_chunks)))
+            fstr = " {0:>32}: {1} {2} bytes, {3}/{4} {5} chunks"
+
+            s = fstr.format("Chunks", chunk_dims, intToStr(chunk_size), intToStr(num_chunks),
+                            intToStr(max_chunk_count), chunk_type)
+            print(s)
             if dset_size > 0:
                 utilization = allocated_size / dset_size
-                fstr = " {0:>32}: {1} logical bytes, {2} allocated bytes, {3:.2f}% utilization"
+                fstr = " {0:>32}: {1} logical bytes, {2} {3} bytes, {4:.2f}% utilization"
                 print(fstr.format(storage_desc, intToStr(dset_size),
                       intToStr(allocated_size),
+                      chunk_type,
                       utilization * 100.0))
             else:
-                fstr = " {0:>32}: {1} logical bytes, {2} allocated bytes"
+                fstr = " {0:>32}: {1} logical bytes, {2} {3} bytes"
                 print(fstr.format(storage_desc, intToStr(dset_size),
-                      intToStr(allocated_size)))
+                      intToStr(allocated_size), chunk_type))

         else:
             # verbose info not available, just show the chunk layout
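Note (not part of the commit): a small standalone sketch of the chunk arithmetic the new hsls report relies on. The ceiling division -(-extent // chunk_dim) gives the chunk count along one axis, their product is the maximum chunk count, and linked chunks are counted from (offset, size) records in the same spirit as the H5D_CHUNKED_REF_INDIRECT branch above; the table values here are made up.

    import numpy as np

    def max_chunk_count(shape, chunk_dims):
        """Upper bound on the number of chunks a dataset can have."""
        count = 1
        for extent, chunk_dim in zip(shape, chunk_dims):
            count *= -(-extent // chunk_dim)  # ceil(extent / chunk_dim) in integer math
        return count

    print(max_chunk_count((10, 11), (3, 5)))  # 4 * 3 = 12

    # Count "linked" chunks from a chunk table of (offset, size) records (made-up values).
    chunk_table = np.array([(0, 0), (512, 4096), (8192, 4096)],
                           dtype=[("offset", "<i8"), ("size", "<i8")])
    linked = [int(e["size"]) for e in chunk_table if e["offset"] > 0 and e["size"] > 0]
    print(len(linked), sum(linked))  # 2 linked chunks, 8192 bytes allocated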
4 changes: 0 additions & 4 deletions h5pyd/_apps/hstouch.py
@@ -69,10 +69,6 @@ def touchDomain(domain):
     parent_domain = getParentDomain(domain)

     if parent_domain == "/":
-        if not domain.endswith("/"):
-            msg = "Only folders can be created as a top-level domain"
-            logging.error(msg)
-            sys.exit(msg)
         if len(domain) < 4:
             msg = "Top-level folders must be at least three characters"
             logging.error(msg)
4 changes: 2 additions & 2 deletions h5pyd/_hl/attrs.py
@@ -252,7 +252,7 @@ def create(self, name, data, shape=None, dtype=None):
         # Not an array type; make sure to check the number of elements
         # is compatible, and reshape if needed.
         else:
-            if numpy.product(shape) != numpy.product(data.shape):
+            if numpy.prod(shape) != numpy.prod(data.shape):
                 raise ValueError("Shape of new attribute conflicts with shape of data")

             if shape != data.shape:
@@ -321,7 +321,7 @@ def modify(self, name, value):
             # Allow the case of () <-> (1,)
             if (value.shape != attr.shape) and not \
-               (numpy.product(value.shape) == 1 and numpy.product(attr.shape) == 1):
+               (numpy.prod(value.shape) == 1 and numpy.prod(attr.shape) == 1):
                 raise TypeError("Shape of data is incompatible with existing attribute")
             attr.write(value)
         """
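Note (not part of the commit): numpy.product is a deprecated alias that newer NumPy releases reject; numpy.prod is the supported spelling and returns the same element count, as in this minimal sketch.

    import numpy as np

    data = np.arange(6)
    shape = (2, 3)
    # np.prod(shape) counts elements just as np.product did
    if np.prod(shape) != np.prod(data.shape):
        raise ValueError("Shape of new attribute conflicts with shape of data")
    print(data.reshape(shape).shape)  # (2, 3)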
5 changes: 3 additions & 2 deletions h5pyd/_hl/base.py
@@ -464,9 +464,10 @@ def getElementCount(buffer, offset):
     count_bytes = bytes(buffer[offset:(offset+4)])

     try:
-        count = int(np.frombuffer(count_bytes, dtype="<i4"))
+        arr = np.frombuffer(count_bytes, dtype="<i4")
+        count = int(arr[0])
     except TypeError as e:
-        msg = "Unexpected error reading count value for variable length elemennt: {}".format(e)
+        msg = f"Unexpected error reading count value for variable length elemennt: {e}"
         raise TypeError(msg)
     if count < 0:
         # shouldn't be negative
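Note (not part of the commit): a sketch of why getElementCount now indexes the decoded array. np.frombuffer returns a one-element array here, and calling int() directly on an array is deprecated in recent NumPy, so the new code takes element 0 first. The buffer below is made up.

    import numpy as np

    buffer = bytes([7, 0, 0, 0]) + b"payload"      # 4-byte little-endian count, then data
    count_bytes = bytes(buffer[0:4])
    arr = np.frombuffer(count_bytes, dtype="<i4")  # one-element array
    count = int(arr[0])                            # index first; int(arr) on an array is deprecated
    print(count, buffer[4:4 + count])              # 7 b'payload'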
65 changes: 23 additions & 42 deletions h5pyd/_hl/dataset.py
@@ -104,8 +104,8 @@ def make_new_dset(
     else:
         shape = (shape,) if isinstance(shape, int) else tuple(shape)
         if data is not None and (
-            numpy.product(shape, dtype=numpy.ulonglong)
-            != numpy.product(data.shape, dtype=numpy.ulonglong)
+            numpy.prod(shape, dtype=numpy.ulonglong)
+            != numpy.prod(data.shape, dtype=numpy.ulonglong)
         ):
             raise ValueError("Shape tuple is incompatible with data")

@@ -399,7 +399,6 @@ def __init__(self, dset, source_sel=None):

         if not dset.chunks:
             # can only use with chunked datasets
-            # (currently all datasets are chunked, but check for future compat)
             raise TypeError("Chunked dataset required")

         if isinstance(dset.chunks, dict):
@@ -426,22 +425,15 @@ def __init__(self, dset, source_sel=None):
         for dim in range(rank):
             s = self._sel[dim]
             if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
-                raise ValueError(
-                    "Invalid selection - selection region must be within dataset space"
-                )
+                msg = "Invalid selection - selection region must be within dataset space"
+                raise ValueError(msg)
             index = s.start // self._layout[dim]
             self._chunk_index.append(index)

     def __iter__(self):
         return self

     def __next__(self):
-        def get_ret(item):
-            if len(item) == 1:
-                return item[0]
-            else:
-                return tuple(item)
-
         rank = len(self._shape)
         slices = []
         if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
@@ -475,7 +467,7 @@ def get_ret(item):
             # reset to the start and continue iterating with higher dimension
             self._chunk_index[dim] = 0
             dim -= 1
-        return get_ret(slices)
+        return tuple(slices)


 class Dataset(HLObject):
@@ -910,7 +902,7 @@ def _getQueryParam(self, start, stop, step=None):
             step = (1,) * rank
         param += "["
         for i in range(rank):
-            field = "{}:{}:{}".format(start[i], stop[i], step[i])
+            field = f"{start[i]}:{stop[i]}:{step[i]}"
             param += field
             if i != (rank - 1):
                 param += ","
@@ -973,7 +965,7 @@ def __getitem__(self, args, new_dtype=None):
             mshape = sel.guess_shape(sid)
             if mshape is None:
                 return numpy.array((0,), dtype=new_dtype)
-            if numpy.product(mshape) == 0:
+            if numpy.prod(mshape) == 0:
                 return numpy.array(mshape, dtype=new_dtype)
             out = numpy.empty(mshape, dtype=new_dtype)
             sid_out = h5s.create_simple(mshape)
@@ -993,7 +985,7 @@ def __getitem__(self, args, new_dtype=None):

         if self._shape == ():
             selection = sel.select(self, args)
-            self.log.info("selection.mshape: {}".format(selection.mshape))
+            self.log.info(f"selection.mshape: {selection.mshape}")

             # TBD - refactor the following with the code for the non-scalar case
             req = "/datasets/" + self.id.uuid + "/value"
@@ -1006,7 +998,7 @@ def __getitem__(self, args, new_dtype=None):
                 arr = bytesToArray(rsp, new_dtype, self._shape)

                 if not self.dtype.shape:
-                    self.log.debug("reshape arr to: {}".format(self._shape))
+                    self.log.debug(f"reshape arr to: {self._shape}")
                     arr = numpy.reshape(arr, self._shape)
             else:
                 # got JSON response
@@ -1024,11 +1016,8 @@ def __getitem__(self, args, new_dtype=None):
                 arr = numpy.empty((), dtype=new_dtype)
                 arr[()] = data
             if selection.mshape is None:
-                self.log.info(
-                    "return scalar selection of: {}, dtype: {}, shape: {}".format(
-                        arr, arr.dtype, arr.shape
-                    )
-                )
+                msg = f"return scalar selection of: {arr}, dtype: {arr.dtype}, shape: {arr.shape}"
+                self.log.info(msg)
                 val = arr[()]
                 if isinstance(val, str):
                     # h5py always returns bytes, so encode the str
@@ -1308,9 +1297,7 @@ def __getitem__(self, args, new_dtype=None):
                     points, dtype="u8"
                 ) # must use unsigned 64-bit int
                 body = arr_points.tobytes()
-                self.log.info(
-                    "point select binary request, num bytes: {}".format(len(body))
-                )
+                self.log.info(f"point select binary request, num bytes: {len(body)}")
             else:
                 if delistify:
                     self.log.info("delistifying point selection")
@@ -1324,7 +1311,7 @@ def __getitem__(self, args, new_dtype=None):
                 else:
                     # can just assign
                     body["points"] = points
-            self.log.info("sending point selection request: {}".format(body))
+            self.log.info(f"sending point selection request: {body}")
             rsp = self.POST(req, format=format, body=body)
             if type(rsp) in (bytes, bytearray):
                 if len(rsp) // mtype.itemsize != selection.mshape[0]:
@@ -1337,18 +1324,14 @@ def __getitem__(self, args, new_dtype=None):
             else:
                 data = rsp["value"]
                 if len(data) != selection.mshape[0]:
-                    raise IOError(
-                        "Expected {} elements, but got {}".format(
-                            selection.mshape[0], len(data)
-                        )
-                    )
-
+                    msg = f"Expected {selection.mshape[0]} elements, but got {len(data)}"
+                    raise IOError(msg)
                 arr = numpy.asarray(data, dtype=mtype, order="C")

         else:
             raise ValueError("selection type not supported")

-        self.log.info("got arr: {}, cleaning up shape!".format(arr.shape))
+        self.log.info(f"got arr: {arr.shape}, cleaning up shape!")
         # Patch up the output for NumPy
         if len(names) == 1:
             arr = arr[names[0]] # Single-field recarray convention
@@ -1368,7 +1351,7 @@ def __setitem__(self, args, val):
         (slices and integers). For advanced indexing, the shapes must
         match.
         """
-        self.log.info("Dataset __setitem__, args: {}".format(args))
+        self.log.info(f"Dataset __setitem__, args: {args}")
         use_base64 = True # may need to set this to false below for some types

         args = args if isinstance(args, tuple) else (args,)
@@ -1378,7 +1361,7 @@ def __setitem__(self, args, val):
             self.log.debug(
                 f"val dtype: {val.dtype}, shape: {val.shape} metadata: {val.dtype.metadata}"
             )
-            if numpy.product(val.shape) == 0:
+            if numpy.prod(val.shape) == 0:
                 self.log.info("no elements in numpy array, skipping write")
         except AttributeError:
             self.log.debug("val not ndarray")
@@ -1428,7 +1411,7 @@ def __setitem__(self, args, val):
                     i
                     for i in val.reshape(
                         (
-                            numpy.product(val.shape[:-1], dtype=numpy.ulonglong),
+                            numpy.prod(val.shape[:-1], dtype=numpy.ulonglong),
                             val.shape[-1],
                         )
                     )
@@ -1480,7 +1463,7 @@ def __setitem__(self, args, val):
             # TBD - need to handle cases where the type shape is different
             self.log.debug("got numpy array")
             if val.dtype != self.dtype and val.dtype.shape == self.dtype.shape:
-                self.log.info("converting {} to {}".format(val.dtype, self.dtype))
+                self.log.info(f"converting {val.dtype} to {self.dtype}")
                 # convert array
                 tmp = numpy.empty(val.shape, dtype=self.dtype)
                 tmp[...] = val[...]
@@ -1584,15 +1567,13 @@ def __setitem__(self, args, val):
                 data = val.tobytes()
                 data = base64.b64encode(data)
                 data = data.decode("ascii")
-                self.log.debug("data: {}".format(data))
                 body["value_base64"] = data
-                self.log.debug("writing base64 data, {} bytes".format(len(data)))
+                self.log.debug(f"writing base64 data, {len(data)} bytes")
             else:
                 if type(val) is not list:
                     val = val.tolist()
                 val = _decode(val)
-                self.log.debug("writing json data, {} elements".format(len(val)))
-                self.log.debug("data: {}".format(val))
+                self.log.debug(f"writing json data, {len(val)} elements")
                 body["value"] = val

             if selection.select_type != sel.H5S_SELECT_ALL:
@@ -1702,7 +1683,7 @@ def __array__(self, dtype=None):
         arr = numpy.empty(self._shape, dtype=self.dtype if dtype is None else dtype)

         # Special case for (0,)*-shape datasets
-        if self._shape is None or numpy.product(self._shape) == 0:
+        if self._shape is None or numpy.prod(self._shape) == 0:
             return arr

         self.read_direct(arr)
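Note (not part of the commit): with get_ret removed, the chunk iterator now always returns a tuple of slices, even for rank-1 datasets. A self-contained sketch of that style of chunk-aligned iteration, using plain NumPy rather than h5pyd:

    import numpy as np

    def iter_chunk_slices(shape, chunks):
        """Yield one tuple of slices per chunk, in C order."""
        grid = [-(-extent // c) for extent, c in zip(shape, chunks)]
        for idx in np.ndindex(*grid):
            yield tuple(slice(i * c, min((i + 1) * c, extent))
                        for i, c, extent in zip(idx, chunks, shape))

    for s in iter_chunk_slices((5,), (2,)):
        print(s)  # always a tuple, e.g. (slice(0, 2, None),), even for rank 1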
6 changes: 3 additions & 3 deletions h5pyd/_hl/filters.py
@@ -306,7 +306,7 @@ def guess_chunk(shape, maxshape, typesize):

     # Determine the optimal chunk size in bytes using a PyTables expression.
     # This is kept as a float.
-    dset_size = np.product(chunks) * typesize
+    dset_size = np.prod(chunks) * typesize
     target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))

     if target_size > CHUNK_MAX:
@@ -321,15 +321,15 @@ def guess_chunk(shape, maxshape, typesize):
         # 1b. We're within 50% of the target chunk size, AND
         # 2. The chunk is smaller than the maximum chunk size

-        chunk_bytes = np.product(chunks) * typesize
+        chunk_bytes = np.prod(chunks) * typesize

         if (
             chunk_bytes < target_size
             or abs(chunk_bytes - target_size) / target_size < 0.5
         ) and chunk_bytes < CHUNK_MAX:
             break

-        if np.product(chunks) == 1:
+        if np.prod(chunks) == 1:
             break # Element size larger than CHUNK_MAX

         chunks[idx % ndims] = np.ceil(chunks[idx % ndims] / 2.0)
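Note (not part of the commit): a simplified sketch of the chunk-guessing heuristic touched above, with assumed constants; it scales a base chunk size by the dataset size, then halves dimensions until the chunk lands near the target. Only the np.prod spelling is taken from the diff.

    import numpy as np

    CHUNK_BASE = 16 * 1024     # assumed base target: 16 KiB
    CHUNK_MIN = 8 * 1024       # assumed floor
    CHUNK_MAX = 1024 * 1024    # assumed ceiling

    def rough_guess_chunk(shape, typesize):
        chunks = np.array(shape, dtype="=f8")
        dset_size = np.prod(chunks) * typesize
        target = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))
        target = min(max(target, CHUNK_MIN), CHUNK_MAX)
        idx = 0
        # Halve one dimension at a time until the chunk byte size is at or below target.
        while np.prod(chunks) * typesize > target and np.prod(chunks) > 1:
            chunks[idx % len(shape)] = np.ceil(chunks[idx % len(shape)] / 2.0)
            idx += 1
        return tuple(int(c) for c in chunks)

    print(rough_guess_chunk((4000, 4000), 8))  # (63, 125) under these assumed constants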
3 changes: 2 additions & 1 deletion h5pyd/_hl/group.py
@@ -651,14 +651,15 @@ def __getitem__(self, name):
             # (and hince the httpconn socket won't be closed)
             from .files import File
             external_domain = link_json['h5domain']
-            if not op.isabs(external_domain):
+            if not external_domain.startswith("hdf5://") and not op.isabs(external_domain):
                 current_domain = self._id.http_conn.domain
                 external_domain = op.join(op.dirname(current_domain), external_domain)
                 external_domain = op.normpath(external_domain)
             try:
                 endpoint = self.id.http_conn.endpoint
                 username = self.id.http_conn.username
                 password = self.id.http_conn.password
+                print(external_domain)
                 f = File(external_domain, endpoint=endpoint, username=username, password=password, mode='r')
             except IOError:
                 # unable to find external link
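Note (not part of the commit): a sketch of the path rule the group.py change introduces. An external-link domain that already starts with hdf5:// (or is an absolute path) is used as given; only bare relative names are joined onto the current domain's folder. The helper and domain names below are hypothetical.

    import os.path as op

    def resolve_external_domain(current_domain, external_domain):
        if external_domain.startswith("hdf5://") or op.isabs(external_domain):
            return external_domain            # already absolute, use as-is
        joined = op.join(op.dirname(current_domain), external_domain)
        return op.normpath(joined)            # relative: resolve against the current folder

    print(resolve_external_domain("/home/test_user1/data/main.h5", "other.h5"))
    # /home/test_user1/data/other.h5
    print(resolve_external_domain("/home/test_user1/data/main.h5", "hdf5://shared/tall.h5"))
    # hdf5://shared/tall.h5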