Skip to content

Commit

Permalink
Merge pull request #8 from MusicalNinjaRandInt/working
Browse files Browse the repository at this point in the history
v0.2.1
  • Loading branch information
MusicalNinjaDad authored Dec 12, 2023
2 parents 3ffd1e5 + 412695e commit f15b4d4
Show file tree
Hide file tree
Showing 12 changed files with 144 additions and 41 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# CHANGELOG - Link Duplicates

## v0.2.1

- improved output while running to give some kind of info on progress

## v0.2.0

- added info on storage usage and savings
Expand Down
2 changes: 1 addition & 1 deletion __version__
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.0
0.2.1
6 changes: 5 additions & 1 deletion duplicates.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,9 @@
"path": "."
}
],
"settings": {}
"settings": {
"black-formatter.args": [
"-S"
]
}
}
2 changes: 2 additions & 0 deletions duplicates/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
LOGROOT = 'dupes'

from .dupes import *
from .bufferediofile import *
25 changes: 14 additions & 11 deletions duplicates/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import logging
import os
from pathlib import Path
import sys
from click import argument, command, confirm, option

from . import DuplicateFiles
from . import DuplicateFiles, LOGROOT

_logger = logging.getLogger(LOGROOT)

@command()
@argument('rootdir')
Expand All @@ -12,17 +15,17 @@
@option('--list', '_list', is_flag=True)
@option('--short', is_flag=True)
def dupes(rootdir, link, approved, _list, short):
roodir = Path(rootdir)
duplicatefiles = DuplicateFiles.frompath(roodir)

sets = len(duplicatefiles.duplicates)
totalfiles = len([file for group in duplicatefiles.duplicates for file in group])
print(f'{sets} sets of duplicates found, totalling {totalfiles} files')
_logger.setLevel(logging.INFO)
consoleoutput = logging.StreamHandler()
consoleoutput.setLevel(logging.INFO)
consoleoutput.setStream(sys.stdout)
outputformat = logging.Formatter('%(message)s')
consoleoutput.setFormatter(outputformat)
_logger.addHandler(consoleoutput)

rootdir = Path(rootdir)
duplicatefiles = DuplicateFiles.frompath(rootdir)

totalsize = sum(file.stat.st_size for group in duplicatefiles.duplicates for file in group)
futuresize = sum(next(iter(group)).stat.st_size for group in duplicatefiles.duplicates)
print(f'current usage: {totalsize}, potential usage: {futuresize}, saving: {totalsize-futuresize}')

if short:
print(duplicatefiles.printout(ignoresamenames=True))
elif _list:
Expand Down
17 changes: 17 additions & 0 deletions duplicates/dupes.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,42 @@
from collections import defaultdict
from contextlib import ExitStack
import logging
import os
from pathlib import Path
from typing import Any, Callable, Iterable
from uuid import uuid1

from .bufferediofile import BufferedIOFile, IsASymlinkError
from . import LOGROOT

class DuplicateFiles:

@classmethod
def frompath(cls, rootpath: Path):
    """Alternate constructor: search `rootpath` and collect sets of duplicate files.

    Progress is reported at INFO level on the logger `{LOGROOT}.frompath`.

    Arguments:
    - rootpath: root of the search, handed to `_filesofsamesize`

    Returns an instance of `cls` built from the sets of files found to have
    identical contents (`duplicates`) and the index returned by `_indexbyino`
    (`inoindex`).
    """
    _logger = logging.getLogger(f'{LOGROOT}.frompath')
    _logger.info(f'Initiating search of {rootpath}')

    samesizefiles = _filesofsamesize(rootpath)
    _logger.info(f'Found {len(samesizefiles)} groups of same-sized files')

    inoindex = _indexbyino(file for samesizeset in samesizefiles for file in samesizeset)
    allfiles = {file for fileset in inoindex.values() for file in fileset}
    # Keep one representative file per index entry; any further files in the
    # same entry are counted as pre-existing hard links and are not compared.
    uniqueinos = frozenset(next(iter(files)) for files in inoindex.values())
    _logger.info(f'Identified {len(allfiles)-len(uniqueinos)} pre-existing hard links')
    _logger.info('Will now begin comparing file contents, this may take some time')

    dupes = set()
    for fileset in samesizefiles:
        nohardlinks = fileset.intersection(uniqueinos)
        # ExitStack closes every file opened for this group once the
        # comparison is done, even if the comparison raises.
        with ExitStack() as stack:
            for file in nohardlinks:
                stack.enter_context(file.open())
            dupes |= comparefilecontents({frozenset(nohardlinks)})

    alldupes = {file for fileset in dupes for file in fileset}
    totalsize = sum(file.stat.st_size for file in alldupes)
    # After linking, each duplicate set only needs the space of a single file.
    futuresize = sum(next(iter(group)).stat.st_size for group in dupes)
    _logger.info(f'Identified {len(dupes)} sets of duplicate files, totalling {len(alldupes)} files')
    _logger.info(f'Current usage: {totalsize}, future usage: {futuresize}, saving: {totalsize-futuresize}')

    # Instantiate via `cls` so that subclasses get an instance of their own type.
    return cls(duplicates=dupes, inoindex=inoindex)

def __init__(self, duplicates: set[frozenset[BufferedIOFile]], inoindex: dict[int: frozenset[Path]]) -> None:
Expand Down
26 changes: 25 additions & 1 deletion test/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,26 @@
from .. import *
from pytest import mark, raises, skip
from pytest import mark, raises, skip

@contextmanager
def skipon(exceptiontype: type[Exception], check: callable = lambda x: True, reason: str = ''):
    """Skip test on Exception of type exceptiontype.
    Optionally run additional validation of exception before skipping.
    Arguments:
    - exceptiontype: the type of exception to skip
    - check (optional): a `callable` which takes the caught exception and returns a single `bool` value.
    Test will only be skipped if this check returns `True`
    - reason (optional): reason to be passed to `skip` and included in the test logs
    Raises:
    - the original exception, unchanged, if it is of type exceptiontype but the check returns `False`
    Example:
    ```
    with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
        ...
    ```
    will skip test if Windows throws an OSError on missing permissions to create a symlink
    """

    try:
        yield
    except exceptiontype as e:
        if check(e):
            skip(reason=reason)
        # The exception is not the one we want to skip on: let it propagate
        # rather than silently swallowing it and hiding a real failure.
        raise
12 changes: 3 additions & 9 deletions test/majorver/test_BufferedIOFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,8 @@ def test_equal_relativepathsgiven():
def test_equal_pathsresolved(copiedtestfiles):
fileA = copiedtestfiles.paths['fileA'][0]
symlink = copiedtestfiles.root / Path('linktoA.txt')
try:
with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
symlink.symlink_to(fileA)
except OSError as e:
if e.winerror == 1314: skip(reason='SymLinks not available on Windows without DevMode enabled')
assert fileA != symlink, 'Something when wrong in the test setup'
assert fileA == symlink.resolve(), 'Something when wrong in the test setup'
fileA = BufferedIOFile(fileA)
Expand All @@ -58,21 +56,17 @@ def test_equal_pathsresolved(copiedtestfiles):
def test_symlink_raiseserror(copiedtestfiles):
fileA = copiedtestfiles.paths['fileA'][0]
symlink = copiedtestfiles.root / Path('linktoA.txt')
try:
with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
symlink.symlink_to(fileA)
except OSError as e:
if e.winerror == 1314: skip(reason='SymLinks not available on Windows without DevMode enabled')
with raises(IsASymlinkError):
symlink = BufferedIOFile(symlink)

@mark.copyfiles(('fileA',1))
def test_followsymlinks_notimplemented(copiedtestfiles):
fileA = copiedtestfiles.paths['fileA'][0]
symlink = copiedtestfiles.root / Path('linktoA.txt')
try:
with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
symlink.symlink_to(fileA)
except OSError as e:
if e.winerror == 1314: skip(reason='SymLinks not available on Windows without DevMode enabled')
with raises(NotImplementedError):
symlink = BufferedIOFile(symlink, follow_symlinks=True)

Expand Down
26 changes: 20 additions & 6 deletions test/majorver/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,19 @@ def test_link(copiedtestfiles):
completed = run(command, capture_output=True)

output = [
'2 sets of duplicates found, totalling 5 files',
'current usage: 101, potential usage: 39, saving: 62',
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
f'Linking files in {copiedtestfiles.root} ...'
]

assert [s.strip() for s in completed.stdout.decode().strip().split('\n')] == output
stdout = [s.strip() for s in completed.stdout.decode().strip().split('\n')]
assert (
stdout == output
), f'\nOutput: {stdout}\nExpected: {output}'

fileAino = copiedtestfiles.paths['fileA'][0].stat().st_ino
fileBino = copiedtestfiles.paths['fileB'][0].stat().st_ino
Expand All @@ -44,11 +51,18 @@ def test_nolink(copiedtestfiles):
completed = run(command, capture_output=True)

output = [
'2 sets of duplicates found, totalling 5 files',
'current usage: 101, potential usage: 39, saving: 62'
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62'
]

assert [s.strip() for s in completed.stdout.decode().strip().split('\n')] == output
stdout = [s.strip() for s in completed.stdout.decode().strip().split('\n')]
assert (
stdout == output
), f'\nOutput: {stdout}\nExpected: {output}'

newinos = {file.stat().st_ino for copies in copiedtestfiles.paths.values() for file in copies}

Expand Down
4 changes: 1 addition & 3 deletions test/majorver/test_dupes_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ def test_multiplezerosizefiles(copiedtestfiles):
def test_instantiate_dropsymlinks(copiedtestfiles):
fileA = copiedtestfiles.paths['fileA'][0]
symlink = copiedtestfiles.root / Path('linktoA.txt')
try:
with skipon(OSError, lambda e: e.winerror == 1314, 'SymLinks not available on Windows without DevMode enabled'):
symlink.symlink_to(fileA)
except OSError as e:
if e.winerror == 1314: skip(reason='SymLinks not available on Windows without DevMode enabled')
duplicatefiles = DuplicateFiles.frompath(copiedtestfiles.root)
assert duplicatefiles.duplicates == {frozenset(path for path in copiedtestfiles.paths['fileA'])}, f'Following files identified as duplicates: {duplicatefiles.duplicates}'
24 changes: 24 additions & 0 deletions test/majorver/test_logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from . import *
import logging


@mark.copyfiles(('fileA', 2), ('fileB', 1), ('fileA2', 1))
@mark.linkfiles(('fileA', 1))
def test_instantiatefrompath(copiedtestfiles, caplog):
    """Check the INFO messages logged by `DuplicateFiles.frompath`, in order."""
    caplog.set_level(logging.INFO, logger='dupes')
    _ = DuplicateFiles.frompath(copiedtestfiles.root)

    # Only consider records emitted by loggers whose name starts with LOGROOT.
    received = [
        entry.message
        for entry in caplog.records
        if entry.name.startswith(LOGROOT)
    ]

    expected = [
        f'Initiating search of {copiedtestfiles.root}',
        'Found 1 groups of same-sized files',
        'Identified 1 pre-existing hard links',
        'Will now begin comparing file contents, this may take some time',
        'Identified 1 sets of duplicate files, totalling 2 files',
        'Current usage: 32, future usage: 16, saving: 16',
    ]

    assert (
        received == expected
    ), f'\nReceived log: {received}\nExpected: {expected}\nMissing: {set(expected).difference(received)}\nExtra: {set(received).difference(expected)}'
37 changes: 28 additions & 9 deletions test/minorver/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,19 @@ def test_link(copiedtestfiles):
result = clirunner.invoke(dupes, command)

output = [
'2 sets of duplicates found, totalling 5 files',
'current usage: 101, potential usage: 39, saving: 62',
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
f'Linking files in {copiedtestfiles.root} ...'
]

assert [s.strip() for s in result.output.strip().split('\n')] == output
stdout = [s.strip() for s in result.output.strip().split('\n')]
assert (
stdout == output
), f'\nOutput: {stdout}\nExpected: {output}'

fileAino = copiedtestfiles.paths['fileA'][0].stat().st_ino
fileBino = copiedtestfiles.paths['fileB'][0].stat().st_ino
Expand All @@ -40,8 +47,12 @@ def test_linkapproved(copiedtestfiles):
result = clirunner.invoke(dupes, command, input='y')

output = [
'2 sets of duplicates found, totalling 5 files',
'current usage: 101, potential usage: 39, saving: 62',
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
'Link files? [y/N]: y',
f'Linking files in {copiedtestfiles.root} ...'
]
Expand Down Expand Up @@ -70,8 +81,12 @@ def test_link_abort(copiedtestfiles):
result = clirunner.invoke(dupes, command, input='n')

output = [
'2 sets of duplicates found, totalling 5 files',
'current usage: 101, potential usage: 39, saving: 62',
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62',
'Link files? [y/N]: n',
'Aborted!'
]
Expand All @@ -97,8 +112,12 @@ def test_nolink(copiedtestfiles):
result = clirunner.invoke(dupes, command)

output = [
'2 sets of duplicates found, totalling 5 files',
'current usage: 101, potential usage: 39, saving: 62'
f'Initiating search of {copiedtestfiles.root}',
f'Found 2 groups of same-sized files',
f'Identified 0 pre-existing hard links',
f'Will now begin comparing file contents, this may take some time',
f'Identified 2 sets of duplicate files, totalling 5 files',
f'Current usage: 101, future usage: 39, saving: 62'
]

assert [s.strip() for s in result.output.strip().split('\n')] == output
Expand Down

0 comments on commit f15b4d4

Please sign in to comment.