Merge pull request #11 from MusicalNinjaRandInt/working

0.3.0 - speed improvements
MusicalNinjaDad · Dec 13, 2023 · b3e7c1e · b3e7c1e
2 parents f078ce5 + bab3ff6
commit b3e7c1e
Show file tree

Hide file tree

Showing 12 changed files with 115 additions and 36 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # CHANGELOG - Link Duplicates
 
+## 0.3.0 - speed improvements
+
+- speed up reprocessing of files already handled by a previous run
+- add totals to info output
+
 ## v0.2.3
 
 - add timestamps to status output

diff --git a/__version__ b/__version__
@@ -1 +1 @@
-0.2.3
+0.3.0
diff --git a/duplicates/bufferediofile.py b/duplicates/bufferediofile.py
@@ -55,7 +55,7 @@ def __repr__(self) -> str:
     def open(self):
         with open(self.path, 'rb') as self.__handle: 
             yield
-            self.__handle = None
+        self.__handle = None
 
     class _FileIterator():
         def __init__(self, handle: BufferedIOBase, chunksize: int) -> None:

diff --git a/duplicates/cli.py b/duplicates/cli.py
@@ -36,4 +36,5 @@ def dupes(rootdir, link, approved, _list, short):
             confirm('Link files?', abort=True, err=True) 
             print('', file=sys.stderr) #prompting to stderr doesn't echo input (including \n)
         _logger.info(f'Linking files in {os.fspath(rootdir)} ...')
-        duplicatefiles.link()
+        duplicatefiles.link()
+        _logger.info(f'Done')
diff --git a/duplicates/dupes.py b/duplicates/dupes.py
@@ -17,20 +17,22 @@ def frompath(cls, rootpath: Path):
         _logger.info(f'Initiating search of {rootpath}')
 
         samesizefiles = _filesofsamesize(rootpath)
-        _logger.info(f'Found {len(samesizefiles)} groups of same-sized files')
-
+        allfiles = {file for fileset in samesizefiles for file in fileset}
+        _logger.info(f'Found {len(samesizefiles)} groups of same-sized files, totalling {len(allfiles)} files')
+
         inoindex = _indexbyino(file for samesizeset in samesizefiles for file in samesizeset)
-        allfiles = {file for fileset in inoindex.values() for file in fileset}
         uniqueinos = frozenset(next(iter(files)) for files in inoindex.values())
-        _logger.info(f'Identified {len(allfiles)-len(uniqueinos)} pre-existing hard links')
+        _logger.info(f'Identified {len(allfiles)-len(uniqueinos)} pre-existing hard links, leaving {len(uniqueinos)} files for comparison')
         _logger.info(f'Will now begin comparing file contents, this may take some time')
 
         dupes = set()
         for fileset in samesizefiles:
             nohardlinks = fileset.intersection(uniqueinos)
-            with ExitStack() as stack:
-                _ = [stack.enter_context(file.open()) for file in nohardlinks]
-                dupes |= comparefilecontents({frozenset(nohardlinks)})
+            if len(nohardlinks) > 1:
+                with ExitStack() as stack:
+                    _ = [stack.enter_context(file.open()) for file in nohardlinks]
+                    dupes |= comparefilecontents({frozenset(nohardlinks)})
+
         alldupes = {file for fileset in dupes for file in fileset}
         totalsize = sum(file.stat.st_size for file in alldupes)
         futuresize = sum(next(iter(group)).stat.st_size for group in dupes)

diff --git a/test/conftest.py b/test/conftest.py
@@ -1,3 +1,4 @@
+import os
 import uuid
 from collections import defaultdict
 from contextlib import ExitStack
@@ -54,22 +55,34 @@ class Testfiles():
 
 @fixture
 def copiedtestfiles(request, tmp_path) -> Testfiles:
-    yield from copytestfiles(request, tmp_path)
+    filestocopy = request.node.get_closest_marker('copyfiles')
+    if filestocopy.args:
+        yield copytestfiles(request, tmp_path, filestocopy.args)
+    else:
+        def mktmp(id): 
+            dir = tmp_path / Path(id)
+            dir.mkdir()
+            return dir
+        yield {setname: copytestfiles(request, mktmp(setname), setoffiles) for setname, setoffiles in filestocopy.kwargs.items()}
+
 
 @fixture(scope='class')
 def classtestfiles(request, tmp_path_factory) -> Testfiles:
     tmp_dir = tmp_path_factory.mktemp(str(request.node.name))
-    yield from copytestfiles(request, tmp_dir)
+    filestocopy = request.node.get_closest_marker('copyfiles')
+    if filestocopy.args:
+        yield copytestfiles(request, tmp_dir, filestocopy.args)
+    else:
+        raise NotImplementedError
 
-def copytestfiles(request, tmp_path) -> Testfiles:
+def copytestfiles(request, tmp_path, filestocopy) -> Testfiles:
     tmp_files = Testfiles(
         root = tmp_path,
         paths = defaultdict(list),
         handles = defaultdict(list)
     )
 
-    filestocopy = request.node.get_closest_marker('copyfiles')
-    for file in filestocopy.args:
+    for file in filestocopy:
         fileid, numcopies = file
         for _ in range(numcopies):
             uniquedir = tmp_path / str(uuid.uuid1())
@@ -87,7 +100,7 @@ def copytestfiles(request, tmp_path) -> Testfiles:
                 newfile = tmp_path / uniquedir / sourcefiles.paths[fileid].name
                 newfile.hardlink_to(tmp_files.paths[fileid][0])
                 tmp_files.paths[fileid].append(newfile)
-    yield tmp_files
+    return tmp_files
 
 @fixture
 def filesopen(copiedtestfiles):

diff --git a/test/majorver/test_cli.py b/test/majorver/test_cli.py
@@ -16,12 +16,13 @@ def test_link(copiedtestfiles):
 
     output = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 2 groups of same-sized files',
-        f'Identified 0 pre-existing hard links',
+        f'Found 2 groups of same-sized files, totalling 5 files',
+        f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 2 sets of duplicate files, totalling 5 files',
         f'Current usage: 101, future usage: 39, saving: 62',
-        f'Linking files in {copiedtestfiles.root} ...'
+        f'Linking files in {copiedtestfiles.root} ...',
+        'Done'
     ]
 
     stderr = [removetimestamp(s.strip()) for s in completed.stderr.decode().strip().split('\n')]
@@ -52,8 +53,8 @@ def test_nolink(copiedtestfiles):
 
     output = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 2 groups of same-sized files',
-        f'Identified 0 pre-existing hard links',
+        f'Found 2 groups of same-sized files, totalling 5 files',
+        f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 2 sets of duplicate files, totalling 5 files',
         f'Current usage: 101, future usage: 39, saving: 62'

diff --git a/test/majorver/test_dupes_identification.py b/test/majorver/test_dupes_identification.py
@@ -87,4 +87,19 @@ def test_instantiate_dropsymlinks(copiedtestfiles):
     with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
         symlink.symlink_to(fileA)
     duplicatefiles = DuplicateFiles.frompath(copiedtestfiles.root)
-    assert duplicatefiles.duplicates == {frozenset(path for path in copiedtestfiles.paths['fileA'])}, f'Following files identified as duplicates: {duplicatefiles.duplicates}'
+    assert duplicatefiles.duplicates == {frozenset(path for path in copiedtestfiles.paths['fileA'])}, f'Following files identified as duplicates: {duplicatefiles.duplicates}'
+
+@mark.copyfiles(('fileA',1), ('fileB',2))
+@mark.linkfiles(('fileA',2))
+def test_somefilesalreadyprocessed(copiedtestfiles):
+    identicalfiles = DuplicateFiles.frompath(copiedtestfiles.root)
+    assert identicalfiles.duplicates == {
+        frozenset(BufferedIOFile(path) for path in copiedtestfiles.paths['fileB'])
+    }
+
+@mark.copyfiles(
+    set1 = (('fileA',1), ('fileB',2)),
+    set2 = (('fileA2',2), ('fileB',2))
+    )
+def test_nocommonroot(copiedtestfiles):
+    pass
diff --git a/test/majorver/test_dupes_linking.py b/test/majorver/test_dupes_linking.py
@@ -25,4 +25,21 @@ def test_link_duplicatefileswithmultiplegroupsoflinks(copiedtestfiles):
     inoscorrect = {(fileid, i): file.stat().st_ino == fileAino for fileid in ('fileA', 'fileA-copy') for i, file in enumerate(copiedtestfiles.paths[fileid])}
     assert all(
         inoscorrect.values()
-    ), f'{inoscorrect}'
+    ), f'{inoscorrect}'
+
+@mark.copyfiles(('fileA',1))
+@mark.linkfiles(('fileA',2))
+def test_donothingifonlylinks(copiedtestfiles, monkeypatch):
+    class InvalidCallToReplaceWithLinkError(Exception):
+            pass
+
+    @contextmanager
+    def _dontlink(keep, link):
+        raise InvalidCallToReplaceWithLinkError
+
+    from ...duplicates import dupes
+    monkeypatch.setattr(dupes, "_replacewithlink", _dontlink)
+    # Monkeypatch was validated by adding an extra set of files which did need linking
+
+    duplicatefiles = DuplicateFiles.frompath(copiedtestfiles.root)
+    duplicatefiles.link()
diff --git a/test/majorver/test_logging.py b/test/majorver/test_logging.py
@@ -12,8 +12,8 @@ def test_instantiatefrompath(copiedtestfiles, caplog):
 
     expectedmessages = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 1 groups of same-sized files',
-        f'Identified 1 pre-existing hard links',
+        f'Found 1 groups of same-sized files, totalling 4 files',
+        f'Identified 1 pre-existing hard links, leaving 3 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 1 sets of duplicate files, totalling 2 files',
         f'Current usage: 32, future usage: 16, saving: 16'

diff --git a/test/minorver/test_cli.py b/test/minorver/test_cli.py
@@ -16,12 +16,13 @@ def test_link(copiedtestfiles):
 
     output = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 2 groups of same-sized files',
-        f'Identified 0 pre-existing hard links',
+        f'Found 2 groups of same-sized files, totalling 5 files',
+        f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 2 sets of duplicate files, totalling 5 files',
         f'Current usage: 101, future usage: 39, saving: 62',
-        f'Linking files in {copiedtestfiles.root} ...'
+        f'Linking files in {copiedtestfiles.root} ...',
+        'Done'
     ]
 
     stderr = [removetimestamp(s.strip()) for s in result.stderr.strip().split('\n')]
@@ -49,13 +50,14 @@ def test_linkapproved(copiedtestfiles):
 
     output = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 2 groups of same-sized files',
-        f'Identified 0 pre-existing hard links',
+        f'Found 2 groups of same-sized files, totalling 5 files',
+        f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 2 sets of duplicate files, totalling 5 files',
         f'Current usage: 101, future usage: 39, saving: 62',
         'Link files? [y/N]:', #prompting to stderr doesn't echo input (including \n)
-        f'Linking files in {copiedtestfiles.root} ...'
+        f'Linking files in {copiedtestfiles.root} ...',
+        'Done'
     ]
 
     stderr = [removetimestamp(s.strip()) for s in result.stderr.strip().split('\n')]
@@ -84,8 +86,8 @@ def test_link_abort(copiedtestfiles):
 
     output = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 2 groups of same-sized files',
-        f'Identified 0 pre-existing hard links',
+        f'Found 2 groups of same-sized files, totalling 5 files',
+        f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 2 sets of duplicate files, totalling 5 files',
         f'Current usage: 101, future usage: 39, saving: 62',
@@ -115,8 +117,8 @@ def test_nolink(copiedtestfiles):
 
     output = [
         f'Initiating search of {copiedtestfiles.root}',
-        f'Found 2 groups of same-sized files',
-        f'Identified 0 pre-existing hard links',
+        f'Found 2 groups of same-sized files, totalling 5 files',
+        f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
         f'Will now begin comparing file contents, this may take some time',
         f'Identified 2 sets of duplicate files, totalling 5 files',
         f'Current usage: 101, future usage: 39, saving: 62'

diff --git a/test/minorver/test_filecomparison.py b/test/minorver/test_filecomparison.py
@@ -44,4 +44,27 @@ def test_indexbyino(copiedtestfiles):
     inoindex = _indexbyino(filestocompare)
     assert len(inoindex) == 2
     assert {copiedtestfiles.paths['fileA'][0], copiedtestfiles.paths['fileA'][2]} in inoindex.values()
-    assert {copiedtestfiles.paths['fileA'][1]} in inoindex.values()
+    assert {copiedtestfiles.paths['fileA'][1]} in inoindex.values()
+
+@mark.copyfiles(('fileA',1))
+@mark.linkfiles(('fileA',2))
+def test_dontscanoridentifyifonlylinks(copiedtestfiles, monkeypatch):
+    """Opening and scanning files can waste a lot of time if they have already been processed by a previous run of dupes and no new copies are present.
+    """
+    class InvalidCallToOpenError(Exception):
+            pass
+
+    @contextmanager
+    def _dontopen(self):
+        raise InvalidCallToOpenError
+
+    from ...duplicates.bufferediofile import BufferedIOFile
+    monkeypatch.setattr(BufferedIOFile, 'open', _dontopen)
+
+    # Validating monkeypatch worked
+    t = BufferedIOFile(copiedtestfiles.paths['fileA'][0])
+    with raises(InvalidCallToOpenError), t.open():
+        assert True
+
+    duplicatefiles = DuplicateFiles.frompath(copiedtestfiles.root)
+    assert not duplicatefiles.duplicates