From 316c606008dde764a72114e3d5046b33a4264866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20M=C3=BCller?=
Date: Wed, 6 Dec 2023 15:14:50 +0100
Subject: [PATCH] enh: compute the hash of the input file

---
 CHANGELOG               |  3 +++
 mpl_data_cast/recipe.py | 29 ++++++++++++++++++++++-------
 mpl_data_cast/util.py   | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8408370..752ba05 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,6 @@
+0.6.1
+ - enh: compute the hash of the input file a second time while waiting
+   for the hash of the target file
 0.6.0
  - feat: generalize GUI to use all recipes
  - enh: prevent GUI from locking when transferring large file
diff --git a/mpl_data_cast/recipe.py b/mpl_data_cast/recipe.py
index 1db6489..7ca7280 100644
--- a/mpl_data_cast/recipe.py
+++ b/mpl_data_cast/recipe.py
@@ -8,7 +8,7 @@ import uuid
 
 from typing import Type, Callable, List
 
-from .util import hashfile, copyhashfile
+from .util import HasherThread, hashfile, copyhashfile
 
 
 #: Files that are not copied (unless specified explicitly by a recipe)
@@ -222,14 +222,29 @@ def transfer_to_target_path(temp_path: pathlib.Path,
         success = True
     else:
         # transfer to target_path
+        hash_input_verify = copyhashfile(temp_path, target_path)
+
+        # Compute the hash of the target path *and* the hash of the
+        # input path (you never know) again. We save some time here
+        # by computing the hash in two parallel threads (assuming
+        # disk/network speed is the bottleneck, not the CPU).
+        thr_out = HasherThread(target_path)
+        thr_out.start()
         if hash_input is None:
-            hash_input = copyhashfile(temp_path, target_path)
-        else:
-            shutil.copy2(temp_path, target_path)
-        # compute md5 hash of target path
-        hash_cp = hashfile(target_path)
+            thr_in = HasherThread(temp_path)
+            thr_in.start()
+            thr_in.join()
+            hash_input = thr_in.hash
+        thr_out.join()
+        hash_target = thr_out.hash
+
+        # sanity check
+        assert len(hash_target) == 32
+        assert len(hash_input) == 32
+        assert len(hash_input_verify) == 32
+
         # compare md5 hashes (verification)
-        success = hash_input == hash_cp
+        success = hash_input == hash_target == hash_input_verify
         if not success:
             # Since we copied the wrong file, we are responsible for
             # deleting it.
diff --git a/mpl_data_cast/util.py b/mpl_data_cast/util.py
index d645fdc..faab1f9 100644
--- a/mpl_data_cast/util.py
+++ b/mpl_data_cast/util.py
@@ -3,12 +3,53 @@
 import hashlib
 import pathlib
 import shutil
+import threading
 from typing import Callable
 
 
 DEFAULT_BLOCK_SIZE = 4 * (1024 ** 2)
 
 
+class HasherThread(threading.Thread):
+    """Thread for hashing files
+
+    The result is stored in `hash`. Errors raised while hashing are
+    kept in `error` and re-raised by `join`.
+    """
+    def __init__(self, path, copy_to=None, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        path: pathlib.Path
+            Path to hash
+        copy_to: pathlib.Path
+            Write data to this file while hashing
+        """
+        super().__init__(*args, **kwargs)
+        self.path = path
+        self.copy_to = copy_to
+        self.hash = None
+        self.error = None
+
+    def run(self):
+        """Hash `path`, copying it to `copy_to` if that is set"""
+        try:
+            if self.copy_to:
+                self.hash = copyhashfile(path_in=self.path,
+                                         path_out=self.copy_to)
+            else:
+                self.hash = hashfile(self.path)
+        except Exception as exc:
+            # keep the error for whoever joins this thread
+            self.error = exc
+
+    def join(self, timeout=None):
+        """Wait for the thread; re-raise an error caught in `run`"""
+        super().join(timeout)
+        if self.error is not None:
+            raise self.error
+
+
 def copyhashfile(path_in: str | pathlib.Path,
                  path_out: str | pathlib.Path,
                  blocksize: int = DEFAULT_BLOCK_SIZE,