Skip to content

Commit

Permalink
Merge pull request #11 from MusicalNinjaRandInt/working
Browse files Browse the repository at this point in the history
0.3.0 - speed improvements
  • Loading branch information
MusicalNinjaDad authored Dec 13, 2023
2 parents f078ce5 + bab3ff6 commit b3e7c1e
Show file tree
Hide file tree
Showing 12 changed files with 115 additions and 36 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# CHANGELOG - Link Duplicates

## 0.3.0 - speed improvements

- speed up reprocessing of files which were already linked by a previous run
- add totals to info output

## v0.2.3

- add timestamps to status output
Expand Down
2 changes: 1 addition & 1 deletion __version__
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.3
0.3.0
2 changes: 1 addition & 1 deletion duplicates/bufferediofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def __repr__(self) -> str:
def open(self):
with open(self.path, 'rb') as self.__handle:
yield
self.__handle = None
self.__handle = None

class _FileIterator():
def __init__(self, handle: BufferedIOBase, chunksize: int) -> None:
Expand Down
3 changes: 2 additions & 1 deletion duplicates/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def dupes(rootdir, link, approved, _list, short):
confirm('Link files?', abort=True, err=True)
print('', file=sys.stderr) #prompting to stderr doesn't echo input (including \n)
_logger.info(f'Linking files in {os.fspath(rootdir)} ...')
duplicatefiles.link()
duplicatefiles.link()
_logger.info(f'Done')
16 changes: 9 additions & 7 deletions duplicates/dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,22 @@ def frompath(cls, rootpath: Path):
_logger.info(f'Initiating search of {rootpath}')

samesizefiles = _filesofsamesize(rootpath)
_logger.info(f'Found {len(samesizefiles)} groups of same-sized files')

allfiles = {file for fileset in samesizefiles for file in fileset}
_logger.info(f'Found {len(samesizefiles)} groups of same-sized files, totalling {len(allfiles)} files')

inoindex = _indexbyino(file for samesizeset in samesizefiles for file in samesizeset)
allfiles = {file for fileset in inoindex.values() for file in fileset}
uniqueinos = frozenset(next(iter(files)) for files in inoindex.values())
_logger.info(f'Identified {len(allfiles)-len(uniqueinos)} pre-existing hard links')
_logger.info(f'Identified {len(allfiles)-len(uniqueinos)} pre-existing hard links, leaving {len(uniqueinos)} files for comparison')
_logger.info(f'Will now begin comparing file contents, this may take some time')

dupes = set()
for fileset in samesizefiles:
nohardlinks = fileset.intersection(uniqueinos)
with ExitStack() as stack:
_ = [stack.enter_context(file.open()) for file in nohardlinks]
dupes |= comparefilecontents({frozenset(nohardlinks)})
if len(nohardlinks) > 1:
with ExitStack() as stack:
_ = [stack.enter_context(file.open()) for file in nohardlinks]
dupes |= comparefilecontents({frozenset(nohardlinks)})

alldupes = {file for fileset in dupes for file in fileset}
totalsize = sum(file.stat.st_size for file in alldupes)
futuresize = sum(next(iter(group)).stat.st_size for group in dupes)
Expand Down
25 changes: 19 additions & 6 deletions test/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import uuid
from collections import defaultdict
from contextlib import ExitStack
Expand Down Expand Up @@ -54,22 +55,34 @@ class Testfiles():

@fixture
def copiedtestfiles(request, tmp_path) -> Testfiles:
yield from copytestfiles(request, tmp_path)
filestocopy = request.node.get_closest_marker('copyfiles')
if filestocopy.args:
yield copytestfiles(request, tmp_path, filestocopy.args)
else:
def mktmp(id):
dir = tmp_path / Path(id)
dir.mkdir()
return dir
yield {setname: copytestfiles(request, mktmp(setname), setoffiles) for setname, setoffiles in filestocopy.kwargs.items()}


@fixture(scope='class')
def classtestfiles(request, tmp_path_factory) -> Testfiles:
tmp_dir = tmp_path_factory.mktemp(str(request.node.name))
yield from copytestfiles(request, tmp_dir)
filestocopy = request.node.get_closest_marker('copyfiles')
if filestocopy.args:
yield copytestfiles(request, tmp_dir, filestocopy.args)
else:
raise NotImplementedError

def copytestfiles(request, tmp_path) -> Testfiles:
def copytestfiles(request, tmp_path, filestocopy) -> Testfiles:
tmp_files = Testfiles(
root = tmp_path,
paths = defaultdict(list),
handles = defaultdict(list)
)

filestocopy = request.node.get_closest_marker('copyfiles')
for file in filestocopy.args:
for file in filestocopy:
fileid, numcopies = file
for _ in range(numcopies):
uniquedir = tmp_path / str(uuid.uuid1())
Expand All @@ -87,7 +100,7 @@ def copytestfiles(request, tmp_path) -> Testfiles:
newfile = tmp_path / uniquedir / sourcefiles.paths[fileid].name
newfile.hardlink_to(tmp_files.paths[fileid][0])
tmp_files.paths[fileid].append(newfile)
yield tmp_files
return tmp_files

@fixture
def filesopen(copiedtestfiles):
Expand Down
11 changes: 6 additions & 5 deletions test/majorver/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ def test_link(copiedtestfiles):

output = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Found 2 groups of same-sized files, totalling 5 files',
f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
f'Linking files in {copiedtestfiles.root} ...'
f'Linking files in {copiedtestfiles.root} ...',
'Done'
]

stderr = [removetimestamp(s.strip()) for s in completed.stderr.decode().strip().split('\n')]
Expand Down Expand Up @@ -52,8 +53,8 @@ def test_nolink(copiedtestfiles):

output = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Found 2 groups of same-sized files, totalling 5 files',
f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62'
Expand Down
17 changes: 16 additions & 1 deletion test/majorver/test_dupes_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,19 @@ def test_instantiate_dropsymlinks(copiedtestfiles):
with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
symlink.symlink_to(fileA)
duplicatefiles = DuplicateFiles.frompath(copiedtestfiles.root)
assert duplicatefiles.duplicates == {frozenset(path for path in copiedtestfiles.paths['fileA'])}, f'Following files identified as duplicates: {duplicatefiles.duplicates}'
assert duplicatefiles.duplicates == {frozenset(path for path in copiedtestfiles.paths['fileA'])}, f'Following files identified as duplicates: {duplicatefiles.duplicates}'

@mark.copyfiles(('fileA',1), ('fileB',2))
@mark.linkfiles(('fileA',2))
def test_somefilesalreadyprocessed(copiedtestfiles):
    """Only the fileB copies are reported as duplicates.

    The fileA entries come from the copyfiles/linkfiles markers (presumably one
    copy plus hard links to it - confirm against conftest) and must not appear
    in the result.
    """
    found = DuplicateFiles.frompath(copiedtestfiles.root).duplicates
    fileB_copies = frozenset(map(BufferedIOFile, copiedtestfiles.paths['fileB']))
    assert found == {fileB_copies}

@mark.copyfiles(
    set1 = (('fileA',1), ('fileB',2)),
    set2 = (('fileA2',2), ('fileB',2))
)
def test_nocommonroot(copiedtestfiles):
    # NOTE(review): placeholder - this test contains no assertions yet. It only
    # requests the copiedtestfiles fixture using the keyword form of the
    # copyfiles marker (named sets `set1` / `set2`), so it passes whenever
    # fixture setup succeeds.
    # TODO: presumably meant to cover duplicates spread across directories
    # without a common root - confirm intent and add assertions.
    pass
19 changes: 18 additions & 1 deletion test/majorver/test_dupes_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,21 @@ def test_link_duplicatefileswithmultiplegroupsoflinks(copiedtestfiles):
inoscorrect = {(fileid, i): file.stat().st_ino == fileAino for fileid in ('fileA', 'fileA-copy') for i, file in enumerate(copiedtestfiles.paths[fileid])}
assert all(
inoscorrect.values()
), f'{inoscorrect}'
), f'{inoscorrect}'

@mark.copyfiles(('fileA',1))
@mark.linkfiles(('fileA',2))
def test_donothingifonlylinks(copiedtestfiles, monkeypatch):
    """link() must never reach _replacewithlink for the files set up here.

    dupes._replacewithlink is swapped for a stub that raises, so the test
    fails with that exception if the code under test tries to link anything.
    """
    from ...duplicates import dupes

    class InvalidCallToReplaceWithLinkError(Exception):
        """Raised by the stub when a link replacement is attempted."""

    @contextmanager
    def _failonlink(keep, link):
        raise InvalidCallToReplaceWithLinkError

    # Monkeypatch was validated by adding an extra set of files which did need linking
    monkeypatch.setattr(dupes, "_replacewithlink", _failonlink)

    DuplicateFiles.frompath(copiedtestfiles.root).link()
4 changes: 2 additions & 2 deletions test/majorver/test_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_instantiatefrompath(copiedtestfiles, caplog):

expectedmessages = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 1 groups of same-sized files',
f'Identified 1 pre-existing hard links',
f'Found 1 groups of same-sized files, totalling 4 files',
f'Identified 1 pre-existing hard links, leaving 3 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 1 sets of duplicate files, totalling 2 files',
f'Current usage: 32, future usage: 16, saving: 16'
Expand Down
22 changes: 12 additions & 10 deletions test/minorver/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ def test_link(copiedtestfiles):

output = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Found 2 groups of same-sized files, totalling 5 files',
f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
f'Linking files in {copiedtestfiles.root} ...'
f'Linking files in {copiedtestfiles.root} ...',
'Done'
]

stderr = [removetimestamp(s.strip()) for s in result.stderr.strip().split('\n')]
Expand Down Expand Up @@ -49,13 +50,14 @@ def test_linkapproved(copiedtestfiles):

output = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Found 2 groups of same-sized files, totalling 5 files',
f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
'Link files? [y/N]:', #prompting to stderr doesn't echo input (including \n)
f'Linking files in {copiedtestfiles.root} ...'
f'Linking files in {copiedtestfiles.root} ...',
'Done'
]

stderr = [removetimestamp(s.strip()) for s in result.stderr.strip().split('\n')]
Expand Down Expand Up @@ -84,8 +86,8 @@ def test_link_abort(copiedtestfiles):

output = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Found 2 groups of same-sized files, totalling 5 files',
f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
Expand Down Expand Up @@ -115,8 +117,8 @@ def test_nolink(copiedtestfiles):

output = [
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Found 2 groups of same-sized files, totalling 5 files',
f'Identified 0 pre-existing hard links, leaving 5 files for comparison',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62'
Expand Down
25 changes: 24 additions & 1 deletion test/minorver/test_filecomparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,27 @@ def test_indexbyino(copiedtestfiles):
inoindex = _indexbyino(filestocompare)
assert len(inoindex) == 2
assert {copiedtestfiles.paths['fileA'][0], copiedtestfiles.paths['fileA'][2]} in inoindex.values()
assert {copiedtestfiles.paths['fileA'][1]} in inoindex.values()
assert {copiedtestfiles.paths['fileA'][1]} in inoindex.values()

@mark.copyfiles(('fileA',1))
@mark.linkfiles(('fileA',2))
def test_dontscanoridentifyifonlylinks(copiedtestfiles, monkeypatch):
    """No file may be opened, and no duplicates reported, for the files set up here.

    Opening and comparing such files can waste a lot of time when they have
    already been processed by a previous run of dupes and no new copies are
    present.
    """
    from ...duplicates.bufferediofile import BufferedIOFile

    class InvalidCallToOpenError(Exception):
        """Raised by the patched BufferedIOFile.open."""

    @contextmanager
    def _failonopen(self):
        raise InvalidCallToOpenError

    monkeypatch.setattr(BufferedIOFile, 'open', _failonopen)

    # Validating monkeypatch worked
    probe = BufferedIOFile(copiedtestfiles.paths['fileA'][0])
    with raises(InvalidCallToOpenError):
        with probe.open():
            assert True

    found = DuplicateFiles.frompath(copiedtestfiles.root)
    assert not found.duplicates

0 comments on commit b3e7c1e

Please sign in to comment.