From bd22d1fbbb6969bd362c93b8f0e897ea7afe7cf9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 8 Oct 2024 23:43:40 +0200 Subject: [PATCH 1/3] check: clarify what is meant by remote repository --- src/borg/archiver/check_cmd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver/check_cmd.py b/src/borg/archiver/check_cmd.py index 6f075e8f4b..27b8fd6c03 100644 --- a/src/borg/archiver/check_cmd.py +++ b/src/borg/archiver/check_cmd.py @@ -77,8 +77,8 @@ def build_parser_check(self, subparsers, common_parser, mid_common_parser): the repository. The read data is checked by size and hash. Bit rot and other types of accidental damage can be detected this way. Running the repository check can be split into multiple partial checks using ``--max-duration``. - When checking a remote repository, please note that the checks run on the - server and do not cause significant network traffic. + When checking an ssh:// remote repository, please note that the checks run on + the server and do not cause significant network traffic. 2. Checking consistency and correctness of the archive metadata and optionally archive data (requires ``--verify-data``). 
This includes ensuring that the From bc24d01928798e9fbbf32fa9715d17402e8ea130 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 9 Oct 2024 01:28:14 +0200 Subject: [PATCH 2/3] check (repository part): build and cache a ChunkIndex --- src/borg/repository.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/borg/repository.py b/src/borg/repository.py index a6a5c7c73c..fc671861fc 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -8,6 +8,7 @@ from .checksums import xxh64 from .constants import * # NOQA +from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Error, ErrorWithTraceback, IntegrityError from .helpers import Location from .helpers import bin_to_hex, hex_to_bin @@ -306,6 +307,12 @@ def check_object(obj): t_start = time.monotonic() t_last_checkpoint = t_start objs_checked = objs_errors = 0 + chunks = ChunkIndex() + # we don't do refcounting anymore, nor can we know here whether any archive + # is using this object, but we assume that this is the case and set refcount to + # MAX_VALUE. As we don't do garbage collection here, this is not a problem. + # We also don't know the plaintext size, so we set it to 0. + init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0) infos = self.store.list("data") try: for info in infos: @@ -338,6 +345,12 @@ def check_object(obj): self.store.delete(key) else: log_error("reloading did help, inconsistent behaviour detected!") + if not (obj_corrupted and repair): + # add all existing objects to the index. + # borg check: the index may have corrupted objects (we did not delete them) + # borg check --repair: the index will only have non-corrupted objects. 
+ id = hex_to_bin(info.name) + chunks[id] = init_entry now = time.monotonic() if now > t_last_checkpoint + 300: # checkpoint every 5 mins t_last_checkpoint = now @@ -353,6 +366,11 @@ def check_object(obj): self.store.delete("config/last-key-checked") except StoreObjectNotFound: pass + if not partial: + # if we did a full pass in one go, we built a complete, up-to-date ChunkIndex, cache it! + from .cache import write_chunkindex_to_repo_cache + + write_chunkindex_to_repo_cache(self, chunks, compact=True, clear=True, force_write=True) except StoreObjectNotFound: # it can be that there is no "data/" at all, then it crashes when iterating infos. pass From 7288f4f94d99356f9f68001c638b83474938ad44 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 9 Oct 2024 01:58:26 +0200 Subject: [PATCH 3/3] check (archives part): use cached ChunkIndex from check (repository part) --- src/borg/archive.py | 4 +++- src/borg/archiver/check_cmd.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 2aac5afbe5..847f6992d5 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1649,7 +1649,9 @@ def check( self.check_all = not any((first, last, match, older, newer, oldest, newest)) self.repair = repair self.repository = repository - self.chunks = build_chunkindex_from_repo(self.repository, disable_caches=True, cache_immediately=not repair) + # Repository.check already did a full repository-level check and has built and cached a fresh chunkindex - + # we can use that here, so we don't disable the caches (also no need to cache immediately, again): + self.chunks = build_chunkindex_from_repo(self.repository, disable_caches=False, cache_immediately=False) self.key = self.make_key(repository) self.repo_objs = RepoObj(self.key) if verify_data: diff --git a/src/borg/archiver/check_cmd.py b/src/borg/archiver/check_cmd.py index 27b8fd6c03..a7d0ea9900 100644 --- a/src/borg/archiver/check_cmd.py +++ 
b/src/borg/archiver/check_cmd.py @@ -41,6 +41,7 @@ def do_check(self, args, repository): raise CommandError("--undelete-archives requires --repair argument.") if args.max_duration and not args.repo_only: # when doing a partial repo check, we can only check xxh64 hashes in repository files. + # archives check requires that a full repo check was done before and has built/cached a ChunkIndex. # also, there is no max_duration support in the archives check code anyway. raise CommandError("--repository-only is required for --max-duration support.") if not args.archives_only: