Skip to content

Commit

Permalink
Move all-features-deleted-diff code into rich_base_dataset
Browse files Browse the repository at this point in the history
A couple of different places need to generate a feature-diff
between the empty-set and all the features in a particular dataset.
This moves the code to do that inside the dataset itself.
  • Loading branch information
olsen232 committed Nov 10, 2021
1 parent 36380ea commit 20ce4fc
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 35 deletions.
53 changes: 33 additions & 20 deletions kart/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,44 +228,54 @@ def schema(self):
self._schema = super().schema
return self._schema

def get_blob_at(self, rel_path, missing_ok=False, tree=None):
    """
    Return the blob at the given relative path from within this dataset.

    rel_path - path of the blob relative to the tree being searched
        (it is converted with str() before the lookup).
    missing_ok - if true, return None instead of raising a KeyError for
        missing data.
    tree - if set, the caller can override the tree in which to look for the
        data; otherwise self.inner_tree is searched.

    Raises KeyError if no blob is found at rel_path and missing_ok is false.
    """
    # Compare against None instead of relying on truthiness, so that a
    # caller-supplied tree which happens to be falsy (e.g. an empty container)
    # is still the one that gets searched.
    search_tree = tree if tree is not None else self.inner_tree

    leaf = None
    key_error = None
    try:
        leaf = search_tree / str(rel_path)
    except KeyError as e:
        # Remember the lookup failure so its detail can be reported below.
        key_error = e
    except (AttributeError, TypeError):
        # The tree could not be traversed at all - treated as missing data.
        pass

    # getattr() with a default covers leaves that have no type_str attribute.
    if getattr(leaf, "type_str", None) == "blob":
        return leaf

    # If we got here, that means leaf wasn't a blob, or one of the above
    # exceptions happened...
    if missing_ok:
        return None

    detail = f": {key_error.args[0]}" if key_error is not None else ''
    raise KeyError(
        f"No data found at rel-path {rel_path}, type={type(leaf)}" + detail
    ) from key_error

def get_data_at(self, rel_path, as_memoryview=False, missing_ok=False, tree=None):
    """
    Fetch the contents of the blob found at rel_path within this dataset.

    The blob itself is located by get_blob_at(), which receives missing_ok and
    tree unchanged - so a missing blob yields None when missing_ok is true, and
    tree lets the caller choose which tree is searched.
    By default the blob's `data` attribute is returned (usually a bytestring).
    With as_memoryview=True, a memoryview of the blob is returned instead
    (this avoids a copy, so can make loops more efficient for many rows).
    """
    found = self.get_blob_at(rel_path, missing_ok=missing_ok, tree=tree)
    if found is None:
        return None
    if as_memoryview:
        return memoryview(found)
    return found.data

def get_json_data_at(self, rel_path, missing_ok=False):
data = self.get_data_at(rel_path, missing_ok=missing_ok)
Expand Down Expand Up @@ -481,6 +491,9 @@ def get_feature(self, pk_values=None, *, path=None, data=None):
"""
raise NotImplementedError()

def get_feature_from_blob(self, feature_blob):
    """
    Build a feature from a feature blob by delegating to get_feature().

    The blob's name is passed as the path, and a memoryview of the blob is
    passed as the data.
    """
    blob_path = feature_blob.name
    blob_view = memoryview(feature_blob)
    return self.get_feature(path=blob_path, data=blob_view)


class IntegrityError(ValueError):
    """A ValueError subclass that adds no behaviour of its own.

    NOTE(review): presumably raised when dataset contents violate an integrity
    constraint - the raising code is not visible here, confirm against callers.
    """
8 changes: 1 addition & 7 deletions kart/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,9 @@ def data_rm(ctx, message, output_format, datasets):
repo_diff = RepoDiff()
for ds_path in datasets:
dataset = repo.datasets()[ds_path]
pk_field = dataset.schema.pk_columns[0].name
feature_diff = DeltaDiff()
for feature in dataset.features():
delta = Delta.delete((feature[pk_field], feature))
feature_diff.add_delta(delta)

ds_diff = DatasetDiff()
ds_diff["meta"] = DeltaDiff.diff_dicts(dataset.meta_items(), {})
ds_diff["feature"] = feature_diff
ds_diff["feature"] = dataset.all_features_diff(delta_type=Delta.delete)
repo_diff[ds_path] = ds_diff

do_json = output_format == "json"
Expand Down
18 changes: 18 additions & 0 deletions kart/rich_base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,3 +502,21 @@ def apply_feature_diff(
"Patch does not apply",
exit_code=PATCH_DOES_NOT_APPLY,
)

def all_features_diff(
    self,
    feature_filter=FeatureKeyFilter.MATCH_ALL,
    delta_type=Delta.insert,
    flags=0,
):
    """
    Build a DeltaDiff holding one delta per feature blob in this dataset.

    feature_filter - only features whose primary key is `in` this filter are included.
    delta_type - the delta constructor to use: Delta.insert or Delta.delete.
    flags - value assigned to the `flags` attribute of every delta created.
    """
    assert delta_type in (Delta.insert, Delta.delete)
    result = DeltaDiff()
    for feature_blob in self.feature_blobs():
        pk_value = self.decode_path_to_1pk(feature_blob.name)
        if pk_value in feature_filter:
            # The partial is not called here - the delta carries a callable
            # that produces the feature, rather than the feature itself.
            lazy_feature = functools.partial(
                self.get_feature_from_blob, feature_blob
            )
            delta = delta_type((pk_value, lazy_feature))
            delta.flags = flags
            result.add_delta(delta)
    return result
1 change: 0 additions & 1 deletion kart/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import click
import pygit2

from .diff_util import get_repo_diff
from .exceptions import (
InvalidOperation,
NotFound,
Expand Down
12 changes: 5 additions & 7 deletions kart/working_copy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,13 +638,11 @@ def diff_db_to_tree_feature(

if schema_diff and schema_diff.type == "delete":
# The entire table has been deleted - add delete deltas for every feature.
feature_diff = DeltaDiff()
for feature in dataset.features():
if feature[pk_field] in feature_filter:
delta = Delta.delete((feature[pk_field], feature))
delta.flags = WORKING_COPY_EDIT
feature_diff.add_delta(delta)
return feature_diff
return dataset.all_features_diff(
feature_filter=feature_filter,
delta_type=Delta.delete,
flags=WORKING_COPY_EDIT,
)

find_renames = self.can_find_renames(meta_diff)

Expand Down

0 comments on commit 20ce4fc

Please sign in to comment.