Always validate hashes when provided
Previously, when uploading datasets with associated hashes, these were
only validated if `validate_hashes=True` was also passed in the
request.

This commit drops `validate_hashes` completely and always validates
hashes when they are provided.
nsoranzo committed Nov 5, 2024
1 parent e5635dc commit f7c5769
Showing 16 changed files with 20 additions and 85 deletions.
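
For context, a minimal sketch of a fetch payload under the new behaviour. The target/item structure and all values below are illustrative (they are not copied from the tests in this diff); the point is that any hash supplied with an item is now verified unconditionally, so the former `validate_hashes` key is simply omitted.

payload = {
    "history_id": "0123456789abcdef",  # encoded id of an existing history (placeholder)
    "targets": [
        {
            "destination": {"type": "hdas"},
            "items": [
                {
                    "src": "url",
                    "url": "https://example.org/1.bed",  # hypothetical source URL
                    "MD5": "f568c29421792b1b1df4474dafae01f1",  # checked automatically now
                }
            ],
        }
    ],
    # "validate_hashes": True  <- no longer needed; the option has been removed
}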
21 changes: 1 addition & 20 deletions client/src/api/schema/schema.ts
@@ -14083,21 +14083,6 @@ export interface components {
* @description The source of the content. Can be other history element to be copied or library elements.
*/
source: components["schemas"]["DatasetSourceType"];
/**
* Validate hashes
* @description Set to true to enable dataset validation during materialization.
* @default false
*/
validate_hashes: boolean;
};
/** MaterializeDatasetOptions */
MaterializeDatasetOptions: {
/**
* Validate hashes
* @description Set to true to enable dataset validation during materialization.
* @default false
*/
validate_hashes: boolean;
};
/** MessageExceptionModel */
MessageExceptionModel: {
@@ -24415,11 +24400,7 @@ export interface operations {
};
cookie?: never;
};
requestBody?: {
content: {
"application/json": components["schemas"]["MaterializeDatasetOptions"] | null;
};
};
requestBody?: never;
responses: {
/** @description Successful Response */
200: {
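
The schema change above reflects that the materialize endpoint no longer accepts a request body at all. A sketch of a direct API call after this change, with a placeholder server URL, API key, and ids (the route matches the one used by the test populators further down in this diff):

import requests

galaxy_url = "https://galaxy.example.org"  # placeholder server
headers = {"x-api-key": "YOUR_API_KEY"}  # placeholder credentials
history_id = "abc123"  # encoded history id (placeholder)
dataset_id = "def456"  # encoded HDA id (placeholder)

# No JSON payload is sent; hashes recorded on the dataset source are now
# validated automatically while the dataset is being materialized.
response = requests.post(
    f"{galaxy_url}/api/histories/{history_id}/contents/datasets/{dataset_id}/materialize",
    headers=headers,
)
response.raise_for_status()
print(response.json())  # summary of the queued materialization task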
4 changes: 1 addition & 3 deletions lib/galaxy/managers/hdas.py
@@ -176,9 +176,7 @@ def materialize(self, request: MaterializeDatasetInstanceTaskRequest, in_place:
else:
dataset_instance = self.ldda_manager.get_accessible(request.content, user)
history = self.app.history_manager.by_id(request.history_id)
new_hda = materializer.ensure_materialized(
dataset_instance, target_history=history, validate_hashes=request.validate_hashes, in_place=in_place
)
new_hda = materializer.ensure_materialized(dataset_instance, target_history=history, in_place=in_place)
if not in_place:
history.add_dataset(new_hda, set_hid=True)
else:
17 changes: 5 additions & 12 deletions lib/galaxy/model/deferred.py
@@ -99,7 +99,6 @@ def ensure_materialized(
self,
dataset_instance: Union[HistoryDatasetAssociation, LibraryDatasetDatasetAssociation],
target_history: Optional[History] = None,
validate_hashes: bool = False,
in_place: bool = False,
) -> HistoryDatasetAssociation:
"""Create a new detached dataset instance from the supplied instance.
@@ -143,9 +142,7 @@ def ensure_materialized(
sa_session.commit()
object_store_populator.set_dataset_object_store_id(materialized_dataset)
try:
path = self._stream_source(
target_source, dataset_instance.datatype, validate_hashes, materialized_dataset_hashes
)
path = self._stream_source(target_source, dataset_instance.datatype, materialized_dataset_hashes)
object_store.update_from_file(materialized_dataset, file_name=path)
materialized_dataset.set_size()
except Exception as e:
@@ -157,9 +154,7 @@ def ensure_materialized(
# TODO: optimize this by streaming right to this path...
# TODO: take into acount transform and ensure we are and are not modifying the file as appropriate.
try:
path = self._stream_source(
target_source, dataset_instance.datatype, validate_hashes, materialized_dataset_hashes
)
path = self._stream_source(target_source, dataset_instance.datatype, materialized_dataset_hashes)
shutil.move(path, transient_paths.external_filename)
materialized_dataset.external_filename = transient_paths.external_filename
except Exception as e:
@@ -211,14 +206,12 @@ def ensure_materialized(
materialized_dataset_instance.metadata_deferred = False
return materialized_dataset_instance

def _stream_source(
self, target_source: DatasetSource, datatype, validate_hashes: bool, dataset_hashes: List[DatasetHash]
) -> str:
def _stream_source(self, target_source: DatasetSource, datatype, dataset_hashes: List[DatasetHash]) -> str:
source_uri = target_source.source_uri
if source_uri is None:
raise Exception("Cannot stream from dataset source without specified source_uri")
path = stream_url_to_file(source_uri, file_sources=self._file_sources)
if validate_hashes and target_source.hashes:
if target_source.hashes:
for source_hash in target_source.hashes:
_validate_hash(path, source_hash, "downloaded file")

@@ -244,7 +237,7 @@ def _stream_source(
if datatype_groom:
datatype.groom_dataset_content(path)

if validate_hashes and dataset_hashes:
if dataset_hashes:
for dataset_hash in dataset_hashes:
_validate_hash(path, dataset_hash, "dataset contents")

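
With `validate_hashes` gone from `ensure_materialized` and `_stream_source`, every hash attached to the dataset source or to the dataset itself is now checked during materialization. A minimal, self-contained sketch of that kind of check follows; it is not the actual `_validate_hash` implementation, only an illustration of the pattern.

import hashlib

def validate_file_hash(path: str, hash_function: str, expected_value: str, what: str = "downloaded file") -> None:
    # Compute the digest of the file on disk and fail loudly on mismatch;
    # hash_function is a name such as "MD5" or "SHA-256".
    digest = hashlib.new(hash_function.lower().replace("-", ""))
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            digest.update(chunk)
    actual = digest.hexdigest()
    if actual != expected_value:
        raise Exception(f"Hash mismatch for {what}: expected {expected_value}, got {actual}")

In the new code path the check runs once for `target_source.hashes` (the downloaded file) and again for `dataset_hashes` (the dataset contents, after any datatype grooming), without the caller having to opt in.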
3 changes: 2 additions & 1 deletion lib/galaxy/model/unittest_utils/store_fixtures.py
@@ -12,6 +12,7 @@
TEST_SOURCE_URI_BAM = "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/1.bam"
TEST_HASH_FUNCTION = "MD5"
TEST_HASH_VALUE = "f568c29421792b1b1df4474dafae01f1"
TEST_HASH_VALUE_BAM = "34693a376f9878cbe53166db083e0e26"
TEST_HISTORY_NAME = "My history in a model store"
TEST_EXTENSION = "bed"
TEST_LIBRARY_NAME = "My cool library"
@@ -339,7 +340,7 @@ def deferred_hda_model_store_dict_bam(
dataset_hash = dict(
model_class="DatasetHash",
hash_function=TEST_HASH_FUNCTION,
hash_value=TEST_HASH_VALUE,
hash_value=TEST_HASH_VALUE_BAM,
extra_files_path=None,
)
dataset_source: Dict[str, Any] = dict(
10 changes: 1 addition & 9 deletions lib/galaxy/schema/schema.py
@@ -3682,15 +3682,7 @@ class PageSummaryBase(Model):
)


class MaterializeDatasetOptions(Model):
validate_hashes: bool = Field(
False,
title="Validate hashes",
description="Set to true to enable dataset validation during materialization.",
)


class MaterializeDatasetInstanceAPIRequest(MaterializeDatasetOptions):
class MaterializeDatasetInstanceAPIRequest(Model):
source: DatasetSourceType = Field(
title="Source",
description="The source of the content. Can be other history element to be copied or library elements.",
5 changes: 0 additions & 5 deletions lib/galaxy/schema/tasks.py
@@ -108,11 +108,6 @@ class MaterializeDatasetInstanceTaskRequest(Model):
"- The decoded id of the HDA\n"
),
)
validate_hashes: bool = Field(
False,
title="Validate hashes",
description="Set to true to enable dataset validation during materialization.",
)


class ComputeDatasetHashTaskRequest(Model):
12 changes: 4 additions & 8 deletions lib/galaxy/tools/data_fetch.py
@@ -260,7 +260,7 @@ def _resolve_item_with_primary(item):
hash_function = hash_dict.get("hash_function")
hash_value = hash_dict.get("hash_value")
try:
_handle_hash_validation(upload_config, hash_function, hash_value, path)
_handle_hash_validation(hash_function, hash_value, path)
except Exception as e:
error_message = str(e)
item["error_message"] = error_message
@@ -501,7 +501,7 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
for hash_function in HASH_NAMES:
hash_value = item.get(hash_function)
if hash_value:
_handle_hash_validation(upload_config, hash_function, hash_value, path)
_handle_hash_validation(hash_function, hash_value, path)
if name is None:
name = url.split("/")[-1]
elif src == "pasted":
@@ -516,11 +516,8 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
return name, path


def _handle_hash_validation(
upload_config: "UploadConfig", hash_function: HashFunctionNameEnum, hash_value: str, path: str
):
if upload_config.validate_hashes:
verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")
def _handle_hash_validation(hash_function: HashFunctionNameEnum, hash_value: str, path: str):
verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")


def _arg_parser():
@@ -566,7 +563,6 @@ def __init__(
self.to_posix_lines = request.get("to_posix_lines", False)
self.space_to_tab = request.get("space_to_tab", False)
self.auto_decompress = request.get("auto_decompress", False)
self.validate_hashes = request.get("validate_hashes", False)
self.deferred = request.get("deferred", False)
self.link_data_only = _link_data_only(request)
self.file_sources_dict = file_sources_dict
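
On the fetch/upload side, `_handle_hash_validation` is reduced to a direct `verify_hash` call. A rough sketch of the resulting behaviour (function and argument names mirror the hunks above, but the concrete hash-key list is an assumption and this is not the exact Galaxy code): every hash key present on an item is checked, and a mismatch is recorded as an error on that item rather than being silently skipped.

def check_item_hashes(item: dict, path: str, verify_hash, hash_names=("MD5", "SHA-1", "SHA-256", "SHA-512")) -> None:
    # verify_hash is expected to raise on mismatch, mirroring the
    # verify_hash(path, hash_func_name=..., hash_value=..., what="upload") call above.
    for hash_function in hash_names:
        hash_value = item.get(hash_function)
        if hash_value:
            try:
                verify_hash(path, hash_func_name=hash_function, hash_value=hash_value, what="upload")
            except Exception as e:
                item["error_message"] = str(e)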
6 changes: 0 additions & 6 deletions lib/galaxy/webapps/galaxy/api/history_contents.py
@@ -54,7 +54,6 @@
HistoryContentType,
MaterializeDatasetInstanceAPIRequest,
MaterializeDatasetInstanceRequest,
MaterializeDatasetOptions,
StoreExportPayload,
UpdateHistoryContentsBatchPayload,
UpdateHistoryContentsPayload,
@@ -1073,17 +1072,12 @@ def materialize_dataset(
history_id: HistoryIDPathParam,
id: HistoryItemIDPathParam,
trans: ProvidesHistoryContext = DependsOnTrans,
materialize_api_payload: Optional[MaterializeDatasetOptions] = Body(None),
) -> AsyncTaskResultSummary:
validate_hashes: bool = (
materialize_api_payload.validate_hashes if materialize_api_payload is not None else False
)
# values are already validated, use model_construct
materialize_request = MaterializeDatasetInstanceRequest.model_construct(
history_id=history_id,
source=DatasetSourceType.hda,
content=id,
validate_hashes=validate_hashes,
)
rval = self.service.materialize(trans, materialize_request)
return rval
1 change: 0 additions & 1 deletion lib/galaxy/webapps/galaxy/services/history_contents.py
@@ -573,7 +573,6 @@ def materialize(
history_id=request.history_id,
source=request.source,
content=request.content,
validate_hashes=request.validate_hashes,
user=trans.async_request_user,
)
results = materialize_task.delay(request=task_request)
1 change: 0 additions & 1 deletion lib/galaxy/workflow/scheduling_manager.py
@@ -349,7 +349,6 @@ def __attempt_materialize(self, workflow_invocation, session) -> bool:
history_id=workflow_invocation.history.id,
source="hda",
content=hda.id,
validate_hashes=True,
)
materialized_okay = self.app.hda_manager.materialize(task_request, in_place=True)
if not materialized_okay:
1 change: 0 additions & 1 deletion lib/galaxy_test/api/test_libraries.py
@@ -340,7 +340,6 @@ def test_fetch_failed_validation(self):
payload = {
"history_id": history_id, # TODO: Shouldn't be needed :(
"targets": targets,
"validate_hashes": True,
}
tool_response = self.dataset_populator.fetch(payload, assert_ok=False)
job = self.dataset_populator.check_run(tool_response)
2 changes: 0 additions & 2 deletions lib/galaxy_test/api/test_tools_upload.py
@@ -1025,7 +1025,6 @@ def test_upload_and_validate_hash_valid(self):
payload = {
"history_id": history_id,
"targets": targets,
"validate_hashes": True,
}
fetch_response = self.dataset_populator.fetch(payload)
self._assert_status_code_is(fetch_response, 200)
@@ -1050,7 +1049,6 @@ def test_upload_and_validate_hash_invalid(self):
payload = {
"history_id": history_id,
"targets": targets,
"validate_hashes": True,
}
fetch_response = self.dataset_populator.fetch(payload, assert_ok=True, wait=False)
self._assert_status_code_is(fetch_response, 200)
7 changes: 1 addition & 6 deletions lib/galaxy_test/base/populators.py
@@ -1023,9 +1023,7 @@ def tools_post(self, payload: dict, url="tools") -> Response:
def describe_tool_execution(self, tool_id: str) -> "DescribeToolExecution":
return DescribeToolExecution(self, tool_id)

def materialize_dataset_instance(
self, history_id: str, id: str, source: str = "hda", validate_hashes: bool = False
):
def materialize_dataset_instance(self, history_id: str, id: str, source: str = "hda"):
payload: Dict[str, Any]
if source == "ldda":
url = f"histories/{history_id}/materialize"
@@ -1036,8 +1034,6 @@ def materialize_dataset_instance(self, history_id: str, id: str, source: str = "hda"):
else:
url = f"histories/{history_id}/contents/datasets/{id}/materialize"
payload = {}
if validate_hashes:
payload["validate_hashes"] = True
create_response = self._post(url, payload, json=True)
api_asserts.assert_status_code_is_ok(create_response)
create_response_json = create_response.json()
@@ -2780,7 +2776,6 @@ def fetch_single_url_to_folder(self, file_type="auto", assert_ok=True):
payload = {
"history_id": history_id, # TODO: Shouldn't be needed :(
"targets": targets,
"validate_hashes": True,
}
return library, self.dataset_populator.fetch(payload, assert_ok=assert_ok)

@@ -89,7 +89,7 @@ def test_materialize_hash_failure(self, history_id: str):
assert deferred_hda["state"] == "deferred"
assert not deferred_hda["deleted"]

self.dataset_populator.materialize_dataset_instance(history_id, deferred_hda["id"], validate_hashes=True)
self.dataset_populator.materialize_dataset_instance(history_id, deferred_hda["id"])
self.dataset_populator.wait_on_history_length(history_id, 2)
new_hda_details = self.dataset_populator.get_history_dataset_details(
history_id, hid=2, assert_ok=False, wait=False
5 changes: 0 additions & 5 deletions test/unit/app/tools/test_data_fetch.py
@@ -52,7 +52,6 @@ def test_simple_path_get(hash_value: str, error_message: Optional[str]):
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -111,7 +110,6 @@ def test_correct_md5():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -142,7 +140,6 @@ def test_incorrect_md5():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -175,7 +172,6 @@ def test_correct_sha1():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
@@ -206,7 +202,6 @@ def test_incorrect_sha1():
],
}
],
"validate_hashes": True,
}
execute_context.execute_request(request)
output = _unnamed_output(execute_context)
8 changes: 4 additions & 4 deletions test/unit/data/test_dataset_materialization.py
@@ -71,7 +71,7 @@ def test_hash_validate():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "ok"

@@ -86,7 +86,7 @@ def test_hash_invalid():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "error"

@@ -103,7 +103,7 @@ def test_hash_validate_source_of_download():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "ok", materialized_hda.info

@@ -120,7 +120,7 @@ def test_hash_invalid_source_of_download():
_assert_2_bed_metadata(deferred_hda)
assert deferred_hda.dataset.state == "deferred"
materializer = materializer_factory(True, object_store=fixture_context.app.object_store)
materialized_hda = materializer.ensure_materialized(deferred_hda, validate_hashes=True)
materialized_hda = materializer.ensure_materialized(deferred_hda)
materialized_dataset = materialized_hda.dataset
assert materialized_dataset.state == "error", materialized_hda.info

