Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++,python] Fix default reads on unwritten dense arrays #3136

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,18 @@ def read(
# all, in which case the best we can do is use the schema shape.
handle: clib.SOMADenseNDArray = self._handle._handle

data_shape = handle.shape
ned = self.non_empty_domain()
if ned is not None:
data_shape = tuple(slot[1] + 1 for slot in ned)
ned = []
for dim_name in handle.dimension_names:
dtype = np.dtype(self.schema.field(dim_name).type.to_pandas_dtype())
slot = handle.non_empty_domain_slot_opt(dim_name, dtype)
if slot is None:
use_shape = True
break
ned.append(slot[1] + 1)
else:
use_shape = False

data_shape = tuple(handle.shape if use_shape else ned)
target_shape = dense_indices_to_shape(coords, data_shape, result_order)

context = handle.context()
Expand Down
56 changes: 56 additions & 0 deletions apis/python/src/tiledbsoma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,62 @@ void load_soma_array(py::module& m) {
}
})

.def(
"non_empty_domain_slot_opt",
[](SOMAArray& array, std::string name, py::dtype dtype) {
switch (np_to_tdb_dtype(dtype)) {
case TILEDB_UINT64:
return py::cast(
array.non_empty_domain_slot_opt<uint64_t>(name));
case TILEDB_DATETIME_YEAR:
case TILEDB_DATETIME_MONTH:
case TILEDB_DATETIME_WEEK:
case TILEDB_DATETIME_DAY:
case TILEDB_DATETIME_HR:
case TILEDB_DATETIME_MIN:
case TILEDB_DATETIME_SEC:
case TILEDB_DATETIME_MS:
case TILEDB_DATETIME_US:
case TILEDB_DATETIME_NS:
case TILEDB_DATETIME_PS:
case TILEDB_DATETIME_FS:
case TILEDB_DATETIME_AS:
case TILEDB_INT64:
return py::cast(
array.non_empty_domain_slot_opt<int64_t>(name));
case TILEDB_UINT32:
return py::cast(
array.non_empty_domain_slot_opt<uint32_t>(name));
case TILEDB_INT32:
return py::cast(
array.non_empty_domain_slot_opt<int32_t>(name));
case TILEDB_UINT16:
return py::cast(
array.non_empty_domain_slot_opt<uint16_t>(name));
case TILEDB_INT16:
return py::cast(
array.non_empty_domain_slot_opt<int16_t>(name));
case TILEDB_UINT8:
return py::cast(
array.non_empty_domain_slot_opt<uint8_t>(name));
case TILEDB_INT8:
return py::cast(
array.non_empty_domain_slot_opt<int8_t>(name));
case TILEDB_FLOAT64:
return py::cast(
array.non_empty_domain_slot_opt<double>(name));
case TILEDB_FLOAT32:
return py::cast(
array.non_empty_domain_slot_opt<float>(name));
case TILEDB_STRING_UTF8:
case TILEDB_STRING_ASCII:
return py::cast(array.non_empty_domain_slot_var(name));
default:
throw TileDBSOMAError(
"Unsupported dtype for nonempty domain.");
}
})

.def(
"soma_domain_slot",
[](SOMAArray& array, std::string name, py::dtype dtype) {
Expand Down
8 changes: 2 additions & 6 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,8 @@ def test_import_anndata(conftest_pbmc_small, ingest_modes, X_kind):
assert X.used_shape() == tuple([(0, e - 1) for e in orig.shape])
else:
assert X.metadata.get(metakey) == "SOMADenseNDArray"
if have_ingested:
matrix = X.read(coords=all2d)
assert matrix.size == orig.X.size
else:
with pytest.raises(ValueError):
X.read(coords=all2d)
matrix = X.read(coords=all2d)
assert matrix.size == orig.X.size

# Check raw/X/data (sparse)
assert exp.ms["raw"].X["data"].metadata.get(metakey) == "SOMASparseNDArray"
Expand Down
15 changes: 15 additions & 0 deletions apis/python/tests/test_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,3 +491,18 @@ def test_fixed_timestamp(tmp_path: pathlib.Path):

with pytest.raises(soma.SOMAError):
soma.open(tmp_path.as_posix(), context=fixed_time, tiledb_timestamp=111)


@pytest.mark.parametrize("shape", [(10,), (10, 20), (10, 20, 2), (2, 4, 6, 8)])
def test_read_to_unwritten_array(tmp_path, shape):
uri = tmp_path.as_posix()

soma.DenseNDArray.create(uri, type=pa.uint8(), shape=shape)

with tiledb.open(uri, "r") as A:
expected = A[:]["soma_data"]

with soma.DenseNDArray.open(uri, "r") as A:
actual = A.read().to_numpy()

assert np.array_equal(expected, actual)
42 changes: 28 additions & 14 deletions libtiledbsoma/src/soma/managed_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,22 +100,36 @@ void ManagedQuery::setup_read() {
return;
}

auto schema = array_->schema();

// If the query is uninitialized, set the subarray for the query
if (status == Query::Status::UNINITIALIZED) {
// Dense array must have a subarray set. If the array is dense and no
// ranges have been set, add a range for the array's entire non-empty
// domain on dimension 0.
if (array_->schema().array_type() == TILEDB_DENSE &&
!subarray_range_set_) {
auto non_empty_domain = array_->non_empty_domain<int64_t>(0);
subarray_->add_range(
0, non_empty_domain.first, non_empty_domain.second);
// domain on dimension 0. In the case that the non-empty domain does not
// exist (when the array has not been written to yet), use dimension 0's
// full domain
if (schema.array_type() == TILEDB_DENSE && !subarray_range_set_) {
// Check if the array has been written to by using the C API as
// there is no way to to check for an empty domain using the current
// CPP API
int32_t is_empty;
int64_t ned[2];
ctx_->handle_error(tiledb_array_get_non_empty_domain_from_index(
ctx_->ptr().get(), array_->ptr().get(), 0, &ned, &is_empty));

std::pair<int64_t, int64_t> array_shape;
if (is_empty == 1) {
array_shape = schema.domain().dimension(0).domain<int64_t>();
} else {
array_shape = std::make_pair(ned[0], ned[1]);
}

subarray_->add_range(0, array_shape.first, array_shape.second);
LOG_DEBUG(fmt::format(
"[ManagedQuery] Add full NED range to dense subarray = (0, {}, "
"{})",
non_empty_domain.first,
non_empty_domain.second));
"[ManagedQuery] Add full range to dense subarray = (0, {}, {})",
array_shape.first,
array_shape.second));
}

// Set the subarray for range slicing
Expand All @@ -125,14 +139,14 @@ void ManagedQuery::setup_read() {
// If no columns were selected, select all columns.
// Add dims and attrs in the same order as specified in the schema
if (columns_.empty()) {
if (array_->schema().array_type() == TILEDB_SPARSE) {
for (const auto& dim : array_->schema().domain().dimensions()) {
if (schema.array_type() == TILEDB_SPARSE) {
for (const auto& dim : schema.domain().dimensions()) {
columns_.push_back(dim.name());
}
}
int attribute_num = array_->schema().attribute_num();
int attribute_num = schema.attribute_num();
for (int i = 0; i < attribute_num; i++) {
columns_.push_back(array_->schema().attribute(i).name());
columns_.push_back(schema.attribute(i).name());
}
}

Expand Down
38 changes: 37 additions & 1 deletion libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,8 @@ class SOMAArray : public SOMAObject {

/**
* Retrieves the non-empty domain from the array. This is the union of the
* non-empty domains of the array fragments.
* non-empty domains of the array fragments. Returns (0, 0) for empty
* domains.
*/
template <typename T>
std::pair<T, T> non_empty_domain_slot(const std::string& name) const {
Expand All @@ -733,6 +734,41 @@ class SOMAArray : public SOMAObject {
}
}

/**
* Retrieves the non-empty domain from the array. This is the union of the
* non-empty domains of the array fragments. Return std::nullopt for empty
* domains.
*/
template <typename T>
std::optional<std::pair<T, T>> non_empty_domain_slot_opt(
nguyenv marked this conversation as resolved.
Show resolved Hide resolved
const std::string& name) const {
try {
int32_t is_empty;
T ned[2];

// TODO currently we need to use the TileDB C API in order to check
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have a tracking task with @teo-tsirpanis ? If not please open one! :)

// if the domain is empty or not. The C++ API returns (0, 0)
// currently which could also represent a single point at coordinate
// 0. Replace this when the C++ API supports correct checking for
// empty domains
ctx_->tiledb_ctx()->handle_error(
tiledb_array_get_non_empty_domain_from_name(
ctx_->tiledb_ctx()->ptr().get(),
arr_->ptr().get(),
name.c_str(),
&ned,
&is_empty));

if (is_empty == 1) {
return std::nullopt;
} else {
return std::make_pair(ned[0], ned[1]);
}
} catch (const std::exception& e) {
throw TileDBSOMAError(e.what());
}
}

/**
* Retrieves the non-empty domain from the array on the given dimension.
* This is the union of the non-empty domains of the array fragments.
Expand Down
Loading