Skip to content

Commit

Permalink
build ad-hoc files cache based on previous archive in series
Browse files: browse the repository at this point in the history
Also: fix the tests so they use a series.
  • Loading branch information
ThomasWaldmann committed Sep 18, 2024
1 parent 2de032c commit f2e2850
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 24 deletions.
2 changes: 1 addition & 1 deletion src/borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,7 +1345,7 @@ def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal,
item.chunks.append(chunk_entry)
else: # normal case, no "2nd+" hardlink
if not is_special_file:
hashed_path = safe_encode(os.path.join(self.cwd, path))
hashed_path = safe_encode(item.path) # path as in archive item!
started_hashing = time.monotonic()
path_hash = self.key.id_hash(hashed_path)
self.stats.hashing_time += time.monotonic() - started_hashing
Expand Down
1 change: 1 addition & 0 deletions src/borg/archiver/create_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def create_inner(archive, cache, fso):
lock_wait=self.lock_wait,
cache_mode=args.files_cache_mode,
iec=args.iec,
archive_name=args.name,
) as cache:
archive = Archive(
manifest,
Expand Down
31 changes: 22 additions & 9 deletions src/borg/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from .hashindex import ChunkIndex, ChunkIndexEntry
from .helpers import Error
from .helpers import get_cache_dir, get_security_dir
from .helpers import hex_to_bin, parse_stringified_list
from .helpers import format_file_size
from .helpers import hex_to_bin, bin_to_hex, parse_stringified_list
from .helpers import format_file_size, safe_encode
from .helpers import yes
from .helpers import ProgressIndicatorMessage
from .helpers import msgpack
Expand Down Expand Up @@ -347,6 +347,7 @@ def __new__(
lock_wait=None,
cache_mode=FILES_CACHE_MODE_DISABLED,
iec=False,
archive_name=None,
):
return AdHocWithFilesCache(
manifest=manifest,
Expand All @@ -356,6 +357,7 @@ def __new__(
iec=iec,
lock_wait=lock_wait,
cache_mode=cache_mode,
archive_name=archive_name,
)


Expand All @@ -369,8 +371,8 @@ class FilesCacheMixin:
If so, we use the cached chunks list and skip reading/chunking the file contents.
"""

def __init__(self, cache_mode, previous_archive_id=None):
self.previous_archive_id = previous_archive_id
def __init__(self, cache_mode, archive_name=None):
self.archive_name = archive_name # ideally a SERIES name
assert not ("c" in cache_mode and "m" in cache_mode)
assert "d" in cache_mode or "c" in cache_mode or "m" in cache_mode
self.cache_mode = cache_mode
Expand All @@ -387,20 +389,30 @@ def files(self):
def _build_files_cache(self):
if "d" in self.cache_mode: # d(isabled)
return {}
if self.previous_archive_id is None:

if not self.archive_name:
return {}

from .archive import Archive

# get the latest archive with the IDENTICAL name, supporting archive series:
archives = self.manifest.archives.list(match=self.archive_name, sort_by=["ts"], last=1)
if not archives:
# nothing found
return {}
prev_archive = archives[0]

files = {}
logger.debug("Building files cache ...")
logger.debug(
f"Building files cache from {prev_archive.name} {prev_archive.ts} {bin_to_hex(prev_archive.id)} ..."
)
files_cache_logger.debug("FILES-CACHE-BUILD: starting...")
archive = Archive(self.manifest, self.previous_archive_id)
archive = Archive(self.manifest, prev_archive.id)
for item in archive.iter_items(preload=False):
# only put regular files' infos into the files cache:
if stat.S_ISREG(item.mode):
assert "chunks" in item # TODO: is item.chunks ready?
path_hash = self.key.id_hash(item.path) # TODO: NOT the full absolute path as it used to be!
path_hash = self.key.id_hash(safe_encode(item.path))
# keep track of the key(s) for the most recent timestamp(s):
ctime_ns = item.ctime
if ctime_ns > self._newest_cmtime:
Expand Down Expand Up @@ -585,13 +597,14 @@ def __init__(
lock_wait=None,
cache_mode=FILES_CACHE_MODE_DISABLED,
iec=False,
archive_name=None,
):
"""
:param warn_if_unencrypted: print warning if accessing unknown unencrypted repository
:param lock_wait: timeout for lock acquisition (int [s] or None [wait forever])
:param cache_mode: what shall be compared in the file stat infos vs. cached stat infos comparison
"""
FilesCacheMixin.__init__(self, cache_mode) # TODO: give previous_archive_id
FilesCacheMixin.__init__(self, cache_mode, archive_name)
ChunksMixin.__init__(self)
assert isinstance(manifest, Manifest)
self.manifest = manifest
Expand Down
28 changes: 14 additions & 14 deletions src/borg/testsuite/archiver/create_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,7 @@ def test_file_status(archivers, request):
assert "A input/file1" in output
assert "A input/file2" in output
# should find first file as unmodified
output = cmd(archiver, "create", "--list", "test2", "input")
output = cmd(archiver, "create", "--list", "test", "input")
assert "U input/file1" in output
# although surprising, this is expected. For why, see:
# https://borgbackup.readthedocs.org/en/latest/faq.html#i-am-seeing-a-added-status-for-a-unchanged-file
Expand All @@ -674,13 +674,13 @@ def test_file_status_cs_cache_mode(archivers, request):
time.sleep(1) # file2 must have newer timestamps than file1
create_regular_file(archiver.input_path, "file2", size=10)
cmd(archiver, "repo-create", RK_ENCRYPTION)
cmd(archiver, "create", "test1", "input", "--list", "--files-cache=ctime,size")
cmd(archiver, "create", "test", "input", "--list", "--files-cache=ctime,size")
# modify file1, but cheat with the mtime (and atime) and also keep same size:
st = os.stat("input/file1")
create_regular_file(archiver.input_path, "file1", contents=b"321")
os.utime("input/file1", ns=(st.st_atime_ns, st.st_mtime_ns))
# this mode uses ctime for change detection, so it should find file1 as modified
output = cmd(archiver, "create", "test2", "input", "--list", "--files-cache=ctime,size")
output = cmd(archiver, "create", "test", "input", "--list", "--files-cache=ctime,size")
assert "M input/file1" in output


Expand All @@ -691,12 +691,12 @@ def test_file_status_ms_cache_mode(archivers, request):
time.sleep(1) # file2 must have newer timestamps than file1
create_regular_file(archiver.input_path, "file2", size=10)
cmd(archiver, "repo-create", RK_ENCRYPTION)
cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test1", "input")
cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test", "input")
# change mode of file1, no content change:
st = os.stat("input/file1")
os.chmod("input/file1", st.st_mode ^ stat.S_IRWXO) # this triggers a ctime change, but mtime is unchanged
# this mode uses mtime for change detection, so it should find file1 as unmodified
output = cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test2", "input")
output = cmd(archiver, "create", "--list", "--files-cache=mtime,size", "test", "input")
assert "U input/file1" in output


Expand All @@ -707,9 +707,9 @@ def test_file_status_rc_cache_mode(archivers, request):
time.sleep(1) # file2 must have newer timestamps than file1
create_regular_file(archiver.input_path, "file2", size=10)
cmd(archiver, "repo-create", RK_ENCRYPTION)
cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test1", "input")
cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test", "input")
# no changes here, but this mode rechunks unconditionally
output = cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test2", "input")
output = cmd(archiver, "create", "--list", "--files-cache=rechunk,ctime", "test", "input")
assert "A input/file1" in output


Expand All @@ -729,7 +729,7 @@ def test_file_status_excluded(archivers, request):
if has_lchflags:
assert "- input/file3" in output
# should find second file as excluded
output = cmd(archiver, "create", "test1", "input", "--list", "--exclude-nodump", "--exclude", "*/file2")
output = cmd(archiver, "create", "test", "input", "--list", "--exclude-nodump", "--exclude", "*/file2")
assert "U input/file1" in output
assert "- input/file2" in output
if has_lchflags:
Expand Down Expand Up @@ -762,14 +762,14 @@ def to_dict(borg_create_output):
create_regular_file(archiver.input_path, "testfile1", contents=b"test1")
time.sleep(1.0 if is_darwin else 0.01) # testfile2 must have newer timestamps than testfile1
create_regular_file(archiver.input_path, "testfile2", contents=b"test2")
result = cmd(archiver, "create", "--stats", "test_archive2", archiver.input_path)
result = cmd(archiver, "create", "--stats", "test_archive", archiver.input_path)
result = to_dict(result)
assert result["Added files"] == 2
assert result["Unchanged files"] == 0
assert result["Modified files"] == 0
# Archive a dir with 1 unmodified file and 1 modified
create_regular_file(archiver.input_path, "testfile1", contents=b"new data")
result = cmd(archiver, "create", "--stats", "test_archive3", archiver.input_path)
result = cmd(archiver, "create", "--stats", "test_archive", archiver.input_path)
result = to_dict(result)
# Should process testfile2 as added because of
# https://borgbackup.readthedocs.io/en/stable/faq.html#i-am-seeing-a-added-status-for-an-unchanged-file
Expand Down Expand Up @@ -807,18 +807,18 @@ def test_create_topical(archivers, request):
output = cmd(archiver, "create", "test", "input")
assert "file1" not in output
# shouldn't be listed even if unchanged
output = cmd(archiver, "create", "test0", "input")
output = cmd(archiver, "create", "test", "input")
assert "file1" not in output
# should list the file as unchanged
output = cmd(archiver, "create", "test1", "input", "--list", "--filter=U")
output = cmd(archiver, "create", "test", "input", "--list", "--filter=U")
assert "file1" in output
# should *not* list the file as changed
output = cmd(archiver, "create", "test2", "input", "--list", "--filter=AM")
output = cmd(archiver, "create", "test", "input", "--list", "--filter=AM")
assert "file1" not in output
# change the file
create_regular_file(archiver.input_path, "file1", size=1024 * 100)
# should list the file as changed
output = cmd(archiver, "create", "test3", "input", "--list", "--filter=AM")
output = cmd(archiver, "create", "test", "input", "--list", "--filter=AM")
assert "file1" in output


Expand Down

0 comments on commit f2e2850

Please sign in to comment.