From 316c606008dde764a72114e3d5046b33a4264866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20M=C3=BCller?= <dev@craban.de>
Date: Wed, 6 Dec 2023 15:14:50 +0100
Subject: [PATCH] enh: compute the hash of the input file

---
 CHANGELOG               |  3 +++
 mpl_data_cast/recipe.py | 29 ++++++++++++++++++++++-------
 mpl_data_cast/util.py   | 25 +++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8408370..752ba05 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,6 @@
+0.6.1
+ - enh: compute the hash of the input file a second time while waiting
+   for the hash of the target file
 0.6.0
  - feat: generalize GUI to use all recipes
  - enh: prevent GUI from locking when transferring large file
diff --git a/mpl_data_cast/recipe.py b/mpl_data_cast/recipe.py
index 1db6489..7ca7280 100644
--- a/mpl_data_cast/recipe.py
+++ b/mpl_data_cast/recipe.py
@@ -8,7 +8,7 @@
 import uuid
 from typing import Type, Callable, List
 
-from .util import hashfile, copyhashfile
+from .util import HasherThread, hashfile, copyhashfile
 
 
 #: Files that are not copied (unless specified explicitly by a recipe)
@@ -222,14 +222,29 @@ def transfer_to_target_path(temp_path: pathlib.Path,
                 success = True
         else:
             # transfer to target_path
+            hash_input_verify = copyhashfile(temp_path, target_path)
+
+            # Hash the target file *and*, unless `hash_input` is known,
+            # the input file once more (you never know). The two hashes
+            # are computed in parallel threads to save time (assuming
+            # disk/network speed is the bottleneck, not the CPU).
+            thr_out = HasherThread(target_path)
+            thr_out.start()
             if hash_input is None:
-                hash_input = copyhashfile(temp_path, target_path)
-            else:
-                shutil.copy2(temp_path, target_path)
-            # compute md5 hash of target path
-            hash_cp = hashfile(target_path)
+                thr_in = HasherThread(temp_path)
+                thr_in.start()
+                thr_in.join()
+                hash_input = thr_in.hash
+            thr_out.join()
+            hash_target = thr_out.hash
+
+            # sanity check
+            assert len(hash_target) == 32
+            assert len(hash_input) == 32
+            assert len(hash_input_verify) == 32
+
             # compare md5 hashes (verification)
-            success = hash_input == hash_cp
+            success = hash_input == hash_target == hash_input_verify
             if not success:
                 # Since we copied the wrong file, we are responsible for
                 # deleting it.
diff --git a/mpl_data_cast/util.py b/mpl_data_cast/util.py
index d645fdc..faab1f9 100644
--- a/mpl_data_cast/util.py
+++ b/mpl_data_cast/util.py
@@ -3,12 +3,37 @@
 import hashlib
 import pathlib
 import shutil
+import threading
 from typing import Callable
 
 
 DEFAULT_BLOCK_SIZE = 4 * (1024 ** 2)
 
 
+class HasherThread(threading.Thread):
+    def __init__(self, path, copy_to=None, *args, **kwargs):
+        """Thread that hashes a file and optionally writes a copy of it
+
+        Parameters
+        ----------
+        path: pathlib.Path
+            Path to hash; `run` stores the result in `self.hash`
+        copy_to: pathlib.Path or None
+            If set, write the data to this file while hashing
+        """
+        super(HasherThread, self).__init__(*args, **kwargs)
+        self.path = path
+        self.copy_to = copy_to
+        self.hash = None  # set by `run`
+
+    def run(self):
+        if self.copy_to:  # hash while writing to `copy_to`
+            self.hash = copyhashfile(path_in=self.path,
+                                     path_out=self.copy_to)
+        else:  # hash only
+            self.hash = hashfile(self.path)
+
+
 def copyhashfile(path_in: str | pathlib.Path,
                  path_out: str | pathlib.Path,
                  blocksize: int = DEFAULT_BLOCK_SIZE,