forked from borgbackup/borg
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
analyze: changed chunks per directory
- Loading branch information
1 parent
1700c7a
commit ff3b331
Showing
3 changed files
with
160 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import argparse | ||
from collections import defaultdict | ||
import os | ||
|
||
from ._common import with_repository, define_archive_filters_group | ||
from ..archive import Archive | ||
from ..constants import * # NOQA | ||
from ..helpers import bin_to_hex, Error | ||
from ..helpers import ProgressIndicatorPercent | ||
from ..manifest import Manifest | ||
from ..remote import RemoteRepository | ||
from ..repository import Repository | ||
|
||
from ..logger import create_logger | ||
|
||
logger = create_logger() | ||
|
||
|
||
class ArchiveAnalyzer:
    """Tally, per directory path, the chunks added or removed between consecutive archives."""

    def __init__(self, args, repository, manifest):
        """Remember the inputs and start with an empty per-directory tally.

        :param args: parsed arguments, handed to ``manifest.archives.list_considering``.
        :param repository: must be a ``Repository`` or ``RemoteRepository`` instance.
        :param manifest: manifest used to list the archives and to open each of them.
        """
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        self.difference_by_path = defaultdict(int)  # directory path -> count of chunks changed

    def analyze(self):
        """Run the analysis over the selected archives, then print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria.

        Each archive is compared against the one listed immediately before it.

        :raises Error: if fewer than 2 archives match.
        """
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        base = None  # chunk sets of the previously processed archive; None before the first one
        for i, info in enumerate(archive_infos):
            pi.show(i)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
            new = self.analyze_archive(info.id)
            if base is not None:
                self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the set of chunk IDs for each directory in this archive.

        :param id: ID of the archive to open.
        :return: mapping of directory path -> set of chunk IDs of the items directly inside it.
        """
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(set)  # collect all chunk IDs generated from files in this directory path
        for item in archive.iter_items():
            if "chunks" in item:  # items without a chunk list contribute nothing
                item_chunks = {chunk_id for chunk_id, _size in item.chunks}
                directory_path = os.path.dirname(item.path)
                chunks_by_path[directory_path].update(item_chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, count the chunks changed (removed or added) between base and new.

        The counts are added to ``self.difference_by_path``. Neither mapping is modified.

        :param base: mapping of directory path -> set of chunk IDs for the older archive.
        :param new: mapping of directory path -> set of chunk IDs for the newer archive.
        """
        no_chunks = frozenset()

        def analyze_path_change(path):
            # .get() rather than indexing: indexing a defaultdict would insert empty sets into
            # the caller's mappings (and ``new`` is reused as the next ``base``).
            base_chunks = base.get(path, no_chunks)
            new_chunks = new.get(path, no_chunks)
            different_chunks = base_chunks.symmetric_difference(new_chunks)  # removed or added chunks
            self.difference_by_path[path] += len(different_chunks)

        # Paths of base first, then paths only present in new: every path is visited exactly once.
        for directory_path in base:
            analyze_path_change(directory_path)
        for directory_path in new:
            if directory_path not in base:
                analyze_path_change(directory_path)

    def report(self):
        """Print the directory paths with a non-zero tally, highest count first."""
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        ranked = sorted(self.difference_by_path.items(), key=lambda entry: entry[1], reverse=True)
        for directory_path, difference in ranked:
            if difference > 0:
                print(f"{directory_path}: {difference}")
|
||
|
||
class AnalyzeMixIn:
    # Mix-in contributing the "analyze" command: the handler (do_analyze) and its
    # argparse registration (build_parser_analyze).

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        # NOTE: the docstring above is reused verbatim as the subcommand's description below.
        ArchiveAnalyzer(args, repository, manifest).analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        # Registers the "analyze" subcommand on ``subparsers`` with ``common_parser`` as parent.
        # ``mid_common_parser`` is accepted but not used in this method.
        from ._common import process_epilog

        # NOTE(review): the literal's inner lines are flush-left, as they appear in the
        # reviewed text, so the string content is unchanged.
        analyze_epilog = process_epilog(
            """
Analyze archives.
"""
        )
        subparser = subparsers.add_parser(
            "analyze",
            parents=[common_parser],
            add_help=False,
            description=self.do_analyze.__doc__,
            epilog=analyze_epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="analyze archives",
        )
        # Dispatch to do_analyze when this subcommand is selected.
        subparser.set_defaults(func=self.do_analyze)
        # Adds the archive filter options to this subcommand.
        define_archive_filters_group(subparser)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import pathlib | ||
|
||
from ...constants import * # NOQA | ||
from . import cmd, generate_archiver_tests, RK_ENCRYPTION | ||
|
||
# pytest collection hook: passes each test's metafunc on to generate_archiver_tests with kinds="local".
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local")  # NOQA
|
||
|
||
def test_analyze(archivers, request):
    """Check the per-directory chunk counts that ``analyze`` reports as archives accumulate."""
    archiver = request.getfixturevalue(archivers)
    cmd(archiver, "repo-create", RK_ENCRYPTION)
    source_dir = pathlib.Path(archiver.input_path)

    def snapshot():
        # Create one more archive, named "archive", from the input directory.
        cmd(archiver, "create", "archive", archiver.input_path)

    def run_analyze():
        # Run the analyze command with "-a archive" and hand back its output.
        output = cmd(archiver, "analyze", "-a", "archive")
        return output

    # 1st archive
    source_dir.joinpath("file1").write_text("foo")
    snapshot()

    # 2nd archive
    source_dir.joinpath("file2").write_text("bar")
    snapshot()

    # 2nd archive added 1 chunk for input path
    report = run_analyze()
    assert "/input: 1" in report

    # 3rd archive
    source_dir.joinpath("file3").write_text("baz")
    snapshot()

    # 2nd/3rd archives added 2 chunks for input path
    report = run_analyze()
    assert "/input: 2" in report

    # 4th archive
    source_dir.joinpath("file2").unlink()
    snapshot()

    # 2nd/3rd archives added 2, 4th archive removed 1
    report = run_analyze()
    assert "/input: 3" in report