Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Austenem/CAT-960 Fix EPIC redirects #3583

Merged
merged 10 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG-fix-epic-redirects.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Update support entity and processed/component dataset redirects to go only to the appropriate primary datasets.
43 changes: 22 additions & 21 deletions context/app/routes_browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from .utils import (
get_default_flask_data, make_blueprint, get_client,
get_url_base_from_request, entity_types)
get_url_base_from_request, entity_types, find_earliest_dataset_ancestor,
should_redirect_entity)


blueprint = make_blueprint(__name__)
Expand Down Expand Up @@ -48,39 +49,39 @@ def details(type, uuid):
entity = client.get_entity(uuid)
actual_type = entity['entity_type'].lower()

# Redirect to primary dataset if this is
# - a support entity (e.g. an image pyramid)
# - a processed or component dataset
is_support = actual_type == 'support'
is_processed = entity.get('processing') != 'raw' and actual_type == 'dataset'
is_component = entity.get('is_component', False) is True
if (is_support or is_processed or is_component):
supported_entity = client.get_entities(
if (should_redirect_entity(entity)):
earliest_uuid = find_earliest_dataset_ancestor(client, uuid)
earliest_dataset = client.get_entities(
'datasets',
query_override={
"bool": {
"must": {
"terms": {
"descendant_ids": [uuid]
"term": {
"uuid": earliest_uuid
}
}
}
},
non_metadata_fields=['hubmap_id', 'uuid']
non_metadata_fields=['uuid', 'processing', 'entity_type', 'is_component']
)

pipeline_anchor = entity.get('pipeline', entity.get('hubmap_id')).replace(' ', '')
anchor = quote(f'section-{pipeline_anchor}-{entity.get("status")}').lower()

if len(supported_entity) > 0:
return redirect(
url_for('routes_browse.details',
type='dataset',
uuid=supported_entity[0]['uuid'],
_anchor=anchor,
redirected=True,
redirectedFromId=entity.get('hubmap_id'),
redirectedFromPipeline=entity.get('pipeline')))
# Check whether the oldest found ancestor exists and is of an expected type. 404 is
# preferable to a page that shows only a support entity or processed/component dataset.
if not earliest_dataset or should_redirect_entity(earliest_dataset[0]):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to call should_redirect_entity again?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was originally to check that the retrieved entity was not actually a processed/support dataset - with the updated query, this shouldn't be necessary.

abort(404)

# Redirect to the primary dataset
return redirect(
url_for('routes_browse.details',
type='dataset',
uuid=earliest_uuid,
_anchor=anchor,
redirected=True,
redirectedFromId=entity.get('hubmap_id'),
redirectedFromPipeline=entity.get('pipeline')))

if type != actual_type:
return redirect(url_for('routes_browse.details', type=actual_type, uuid=uuid))
Expand Down
136 changes: 136 additions & 0 deletions context/app/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import pytest
from unittest.mock import Mock
from .utils import find_earliest_dataset_ancestor


@pytest.mark.parametrize(
"initial_uuid, client_responses, expected_result",
[
# Dataset with no ancestors
(
"uuid_initial",
[
[{
"hubmap_id": "initial_id",
"uuid": "uuid_initial",
"immediate_ancestors": [],
"entity_type": "Dataset"
}]
],
"uuid_initial"
),
# Dataset with a non-dataset parent
(
"uuid_initial",
[
[{
"hubmap_id": "initial_id",
"uuid": "uuid_initial",
"immediate_ancestors": [{"uuid": "uuid_ancestor", "entity_type": "Sample"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor_id",
"uuid": "uuid_ancestor",
"immediate_ancestors": [],
"entity_type": "Sample"
}]
],
"uuid_initial"
),
# Dataset with a dataset parent
(
"uuid_initial",
[
[{
"hubmap_id": "initial_id",
"uuid": "uuid_initial",
"immediate_ancestors": [{"uuid": "uuid_ancestor", "entity_type": "Dataset"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor_id",
"uuid": "uuid_ancestor",
"immediate_ancestors": [],
"entity_type": "Dataset"
}]
],
"uuid_ancestor"
),
# Dataset with a dataset grandparent
(
"uuid_initial",
[
[{
"hubmap_id": "initial_id",
"uuid": "uuid_initial",
"immediate_ancestors": [{"uuid": "uuid_ancestor1", "entity_type": "Dataset"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor1_id",
"uuid": "uuid_ancestor1",
"immediate_ancestors": [{"uuid": "uuid_ancestor2", "entity_type": "Dataset"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor2_id",
"uuid": "uuid_ancestor2",
"immediate_ancestors": [{"uuid": "uuid_ancestor3", "entity_type": "Donor"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor3_id",
"uuid": "uuid_ancestor3",
"immediate_ancestors": [],
"entity_type": "Donor"
}]
],
"uuid_ancestor2"
),
# Dataset with a dataset great-grandparent
(
"uuid_initial",
[
[{
"hubmap_id": "initial_id",
"uuid": "uuid_initial",
"immediate_ancestors": [{"uuid": "uuid_ancestor1", "entity_type": "Dataset"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor1_id",
"uuid": "uuid_ancestor1",
"immediate_ancestors": [{"uuid": "uuid_ancestor2", "entity_type": "Dataset"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor2_id",
"uuid": "uuid_ancestor2",
"immediate_ancestors": [{"uuid": "uuid_ancestor3", "entity_type": "Dataset"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor3_id",
"uuid": "uuid_ancestor3",
"immediate_ancestors": [{"uuid": "uuid_ancestor4", "entity_type": "Donor"}],
"entity_type": "Dataset"
}],
[{
"hubmap_id": "ancestor4_id",
"uuid": "uuid_ancestor4",
"immediate_ancestors": [],
"entity_type": "Donor"
}]
],
"uuid_ancestor3"
),

]
)
def test_find_earliest_dataset_ancestor(initial_uuid, client_responses, expected_result):
client = Mock()
client.get_entities.side_effect = client_responses

result = find_earliest_dataset_ancestor(client, initial_uuid)
assert result == expected_result
46 changes: 46 additions & 0 deletions context/app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,49 @@ def get_organs():
dir_path = Path(dirname(__file__) + '/organ')
organs = {p.stem: safe_load(p.read_text()) for p in dir_path.glob('*.yaml')}
return organs


# Redirect to primary dataset if this entity is
# - non-existent
# - a support entity (e.g. an image pyramid)
# - a processed or component dataset
def should_redirect_entity(entity):
if not entity:
return True

actual_type = entity.get('entity_type').lower()
is_support_type = actual_type == 'support'
is_component = entity.get('is_component', False) is True
is_not_raw_dataset = entity.get('processing') != 'raw' and actual_type == 'dataset'

if is_support_type or is_component or is_not_raw_dataset:
return True

return False


def find_earliest_dataset_ancestor(client, uuid):
dataset = client.get_entities(
'datasets',
query_override={
"bool": {
"must": {
"term": {
"uuid": uuid
}
}
}
},
non_metadata_fields=['hubmap_id', 'uuid', 'immediate_ancestors', 'entity_type']
)

# If no dataset is found or it has no ancestors, return the current dataset UUID
if not dataset or not dataset[0].get('immediate_ancestors'):
return uuid

# Traverse through immediate ancestors to find the earliest dataset ancestor
for ancestor in dataset[0]['immediate_ancestors']:
if ancestor.get('entity_type') == 'Dataset':
uuid = find_earliest_dataset_ancestor(client, ancestor.get('uuid'))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't depend on immediate_ancestors since it's somewhat deprecated and we should also avoid recursion and multiple requests if possible.

Can we use a query that returns the dataset which has processing === 'raw', does not have the field ancestor_counts.entity_type.Dataset, and its uuid is included in the given dataset's ancestors_ids? Let me know if you need help drafting the query.


return uuid
Loading