From f3594cf8d5a7b483824ac0e5392dfdd5e08450f8 Mon Sep 17 00:00:00 2001 From: Brian Scholer <1260690+briantist@users.noreply.github.com> Date: Sun, 4 Sep 2022 14:51:09 -0400 Subject: [PATCH] Cache configurability - expiry time, read/write controls (#13) * stop printing url_map * add a custom argparser action for bool options * add cache control options * use context manager on StringIO * make HashedTempFile a native context manager with optional closing * cache rules everything around me * update README --- README.md | 41 ++++++++++++++++++++------------ galactory/__main__.py | 29 +++++++++++++++++++++-- galactory/api/v2/collections.py | 15 +++++++++--- galactory/download/download.py | 10 ++++++-- galactory/upstream.py | 42 +++++++++++++++++++++++---------- galactory/utilities.py | 39 +++++++++++++++++++----------- 6 files changed, 127 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 0fe3624..0c704d8 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ usage: python -m galactory [-h] [-c CONFIG] [--listen-addr LISTEN_ADDR] [--prefer-configured-key] [--log-file LOG_FILE] [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--log-headers] [--log-body] [--proxy-upstream PROXY_UPSTREAM] - [-npns NO_PROXY_NAMESPACE] + [-npns NO_PROXY_NAMESPACE] [--cache-minutes CACHE_MINUTES] + [--cache-read CACHE_READ] [--cache-write CACHE_WRITE] galactory is a partial Ansible Galaxy proxy that uploads and downloads collections, using an Artifactory generic repository as its backend. @@ -48,30 +49,40 @@ optional arguments: The host name and port of the server, as seen from clients. Used for generating links. [env var: GALACTORY_SERVER_NAME] --artifactory-path ARTIFACTORY_PATH - The URL of the path in Artifactory where collections are stored. [env var: - GALACTORY_ARTIFACTORY_PATH] + The URL of the path in Artifactory where collections are stored. + [env var: GALACTORY_ARTIFACTORY_PATH] --artifactory-api-key ARTIFACTORY_API_KEY - If set, is the API key used to access Artifactory. [env var: - GALACTORY_ARTIFACTORY_API_KEY] - --use-galaxy-key If set, uses the Galaxy token as the Artifactory API key. [env var: - GALACTORY_USE_GALAXY_KEY] + If set, is the API key used to access Artifactory. + [env var: GALACTORY_ARTIFACTORY_API_KEY] + --use-galaxy-key If set, uses the Galaxy token as the Artifactory API key. + [env var: GALACTORY_USE_GALAXY_KEY] --prefer-configured-key - If set, prefer the confgured Artifactory key over the Galaxy token. [env - var: GALACTORY_PREFER_CONFIGURED_KEY] - --log-file LOG_FILE If set, logging will go to this file instead of the console. [env var: - GALACTORY_LOG_FILE] + If set, prefer the confgured Artifactory key over the Galaxy token. + [env var: GALACTORY_PREFER_CONFIGURED_KEY] + --log-file LOG_FILE If set, logging will go to this file instead of the console. + [env var: GALACTORY_LOG_FILE] --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL} The desired logging level. [env var: GALACTORY_LOG_LEVEL] - --log-headers Log the headers of every request (DEBUG level only). [env var: - GALACTORY_LOG_HEADERS] - --log-body Log the body of every request (DEBUG level only). [env var: - GALACTORY_LOG_BODY] + --log-headers Log the headers of every request (DEBUG level only). + [env var: GALACTORY_LOG_HEADERS] + --log-body Log the body of every request (DEBUG level only). + [env var: GALACTORY_LOG_BODY] --proxy-upstream PROXY_UPSTREAM If set, then find, pull and cache results from the specified galaxy server in addition to local. [env var: GALACTORY_PROXY_UPSTREAM] -npns NO_PROXY_NAMESPACE, --no-proxy-namespace NO_PROXY_NAMESPACE Requests for this namespace should never be proxied. Can be specified multiple times. [env var: GALACTORY_NO_PROXY_NAMESPACE] + --cache-minutes CACHE_MINUTES + The time period that a cache entry should be considered valid. + [env var: GALACTORY_CACHE_MINUTES] + --cache-read CACHE_READ + Look for upsteam caches and use their values. + [env var: GALACTORY_CACHE_READ] + --cache-write CACHE_WRITE + Populate the upstream cache in Artifactory. Should be false when no API key is + provided or the key has no permission to write. + [env var: GALACTORY_CACHE_WRITE] Args that start with '--' (eg. --listen-addr) can also be set in a config file (/etc/galactory.d/*.conf or ~/.galactory/*.conf or specified via -c). Config file syntax allows: diff --git a/galactory/__main__.py b/galactory/__main__.py index 8f3e6b4..d96affd 100644 --- a/galactory/__main__.py +++ b/galactory/__main__.py @@ -2,12 +2,32 @@ # (c) 2022 Brian Scholer (@briantist) import logging -from configargparse import ArgParser +from configargparse import ArgParser, ArgumentError, Action from artifactory import ArtifactoryPath from . import create_app +# TODO: when py3.8 support is dropped, switch to using argparse.BooleanOptionalAction +class _StrBool(Action): + FALSES = {'false', '0', 'no'} + TRUES = {'true', '1', 'yes'} + + def _booler(self, value): + if isinstance(value, bool): + return value + + if value.lower() in self.FALSES: + return False + if value.lower() in self.TRUES: + return True + + raise ArgumentError(self, f"Expecting 'true', 'false', 'yes', 'no', '1' or '0', but got '{value}'") + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, self._booler(values)) + + if __name__ == '__main__': parser = ArgParser( prog='python -m galactory', @@ -37,6 +57,9 @@ parser.add_argument('--log-body', action='store_true', env_var='GALACTORY_LOG_BODY', help='Log the body of every request (DEBUG level only).') parser.add_argument('--proxy-upstream', type=lambda x: str(x).rstrip('/') + '/', env_var='GALACTORY_PROXY_UPSTREAM', help='If set, then find, pull and cache results from the specified galaxy server in addition to local.') parser.add_argument('-npns', '--no-proxy-namespace', action='append', default=[], env_var='GALACTORY_NO_PROXY_NAMESPACE', help='Requests for this namespace should never be proxied. Can be specified multiple times.') + parser.add_argument('--cache-minutes', default=60, type=int, env_var='GALACTORY_CACHE_MINUTES', help='The time period that a cache entry should be considered valid.') + parser.add_argument('--cache-read', action=_StrBool, default=True, env_var='GALACTORY_CACHE_READ', help='Look for upsteam caches and use their values.') + parser.add_argument('--cache-write', action=_StrBool, default=True, env_var='GALACTORY_CACHE_WRITE', help='Populate the upstream cache in Artifactory. Should be false when no API key is provided or the key has no permission to write.') args = parser.parse_args() logging.basicConfig(filename=args.log_file, level=args.log_level) @@ -51,7 +74,9 @@ USE_GALAXY_KEY=args.use_galaxy_key, PREFER_CONFIGURED_KEY=args.prefer_configured_key, SERVER_NAME=args.server_name, + CACHE_MINUTES=args.cache_minutes, + CACHE_READ=args.cache_read, + CACHE_WRITE=args.cache_write, ) - print(app.url_map) app.run(args.listen_addr, args.listen_port, threaded=True) diff --git a/galactory/api/v2/collections.py b/galactory/api/v2/collections.py index 925758b..7f368d8 100644 --- a/galactory/api/v2/collections.py +++ b/galactory/api/v2/collections.py @@ -33,10 +33,13 @@ def collection(namespace, collection): repository = authorize(request, current_app.config['ARTIFACTORY_PATH']) upstream = current_app.config['PROXY_UPSTREAM'] no_proxy = current_app.config['NO_PROXY_NAMESPACES'] + cache_minutes = current_app.config['CACHE_MINUTES'] + cache_read = current_app.config['CACHE_READ'] + cache_write = current_app.config['CACHE_WRITE'] upstream_result = None if upstream and (not no_proxy or namespace not in no_proxy): - proxy = ProxyUpstream(repository, upstream) + proxy = ProxyUpstream(repository, upstream, cache_read, cache_write, cache_minutes) upstream_result = proxy.proxy(request) results = _collection_listing(repository, namespace, collection) @@ -66,10 +69,13 @@ def versions(namespace, collection): repository = authorize(request, current_app.config['ARTIFACTORY_PATH']) upstream = current_app.config['PROXY_UPSTREAM'] no_proxy = current_app.config['NO_PROXY_NAMESPACES'] + cache_minutes = current_app.config['CACHE_MINUTES'] + cache_read = current_app.config['CACHE_READ'] + cache_write = current_app.config['CACHE_WRITE'] upstream_result = None if upstream and (not no_proxy or namespace not in no_proxy): - proxy = ProxyUpstream(repository, upstream) + proxy = ProxyUpstream(repository, upstream, cache_read, cache_write, cache_minutes) upstream_result = proxy.proxy(request) collections = collected_collections(repository, namespace=namespace, name=collection) @@ -117,12 +123,15 @@ def version(namespace, collection, version): repository = authorize(request, current_app.config['ARTIFACTORY_PATH']) upstream = current_app.config['PROXY_UPSTREAM'] no_proxy = current_app.config['NO_PROXY_NAMESPACES'] + cache_minutes = current_app.config['CACHE_MINUTES'] + cache_read = current_app.config['CACHE_READ'] + cache_write = current_app.config['CACHE_WRITE'] try: info = next(discover_collections(repository, namespace=namespace, name=collection, version=version)) except StopIteration: if upstream and (not no_proxy or namespace not in no_proxy): - proxy = ProxyUpstream(repository, upstream) + proxy = ProxyUpstream(repository, upstream, cache_read, cache_write, cache_minutes) upstream_result = proxy.proxy(request) return jsonify(upstream_result) else: diff --git a/galactory/download/download.py b/galactory/download/download.py index 3f9f145..be048ed 100644 --- a/galactory/download/download.py +++ b/galactory/download/download.py @@ -17,6 +17,9 @@ def download(filename): artifact = authorize(request, current_app.config['ARTIFACTORY_PATH'] / filename) upstream = current_app.config['PROXY_UPSTREAM'] # no_proxy = current_app.config['NO_PROXY_NAMESPACES'] + cache_minutes = current_app.config['CACHE_MINUTES'] + cache_read = current_app.config['CACHE_READ'] + cache_write = current_app.config['CACHE_WRITE'] try: stat = artifact.stat() @@ -28,9 +31,12 @@ def download(filename): if not upstream: # or not (not no_proxy or namespace not in no_proxy): abort(C.HTTP_NOT_FOUND) - proxy = ProxyUpstream(artifact, upstream) + proxy = ProxyUpstream(artifact, upstream, cache_read, cache_write, cache_minutes) + + with proxy.proxy_download(request) as resp, _chunk_to_temp(None, iterator=resp.iter_content, close=cache_write) as tmp: + if not cache_write: + return send_file(tmp.handle, as_attachment=True, download_name=filename, etag=False) - with proxy.proxy_download(request) as resp, _chunk_to_temp(None, iterator=resp.iter_content) as tmp: try: artifact.deploy(tmp.handle, md5=tmp.md5, sha1=tmp.sha1, sha256=tmp.sha256) except ArtifactoryException as exc: diff --git a/galactory/upstream.py b/galactory/upstream.py index 75d6a3d..d3afe5d 100644 --- a/galactory/upstream.py +++ b/galactory/upstream.py @@ -34,7 +34,7 @@ def _time_decoder(pairs): loaded = json.load(f, object_pairs_hook=_time_decoder) return cls(data=loaded['data'], metadata=loaded['metadata'], **kwargs) - def __init__(self, data=None, metadata=None, expiry_delta=timedelta(hours=1), calculate_expiry_on_read=False) -> None: + def __init__(self, expiry_delta, data=None, metadata=None, calculate_expiry_on_read=True) -> None: raw = {'metadata': {}, 'data': {}} self._expiry_delta = expiry_delta self._calc_on_read = calculate_expiry_on_read @@ -113,29 +113,41 @@ def _to_serializable_dict(self): class ProxyUpstream: _cache_path = '_cache' - def __init__(self, repository, upstream_url) -> None: + def __init__(self, repository, upstream_url, read_cache, write_cache, cache_expiry_minutes) -> None: self._repository = repository self._upstream = upstream_url + self._read_cache = read_cache + self._write_cache = write_cache + self._cache_expiry_delta = timedelta(minutes=cache_expiry_minutes) - def _get_cache(self, request, **kwargs) -> _CacheEntry: + + def _get_cache(self, request, expiry_delta=None, **kwargs) -> _CacheEntry: path = self._repository / self._cache_path / request.path / 'data.json' + if expiry_delta is None: + expiry_delta = self._cache_expiry_delta + + if not self._read_cache: + return _CacheEntry(expiry_delta=expiry_delta, **kwargs) + try: with path.open() as f: - return _CacheEntry.from_file(f, **kwargs) + return _CacheEntry.from_file(f, expiry_delta=expiry_delta, **kwargs) except ArtifactoryException: - return _CacheEntry(**kwargs) + return _CacheEntry(expiry_delta=expiry_delta, **kwargs) def _set_cache(self, request, cache) -> None: + if not self._write_cache: + return + from . import DateTimeIsoFormatJSONEncoder path = self._repository / self._cache_path / request.path / 'data.json' - buffer = StringIO() - cache.update() - json.dump(cache._to_serializable_dict(), buffer, cls=DateTimeIsoFormatJSONEncoder) - buffer.seek(0) - path.deploy(buffer) - buffer.close() + with StringIO() as buffer: + cache.update() + json.dump(cache._to_serializable_dict(), buffer, cls=DateTimeIsoFormatJSONEncoder) + buffer.seek(0) + path.deploy(buffer) @contextmanager def proxy_download(self, request): @@ -171,10 +183,14 @@ def proxy(self, request): # abort(Response(resp.text, resp.status_code)) else: - current_app.logger.info(f"Cache miss: {request.url}") + if self._read_cache: + current_app.logger.info(f"Cache miss: {request.url}") + data = resp.json() cache.data = data - self._set_cache(request, cache) + + if self._write_cache: + self._set_cache(request, cache) else: current_app.logger.info(f"Cache hit: {request.url}") diff --git a/galactory/utilities.py b/galactory/utilities.py index b7016a3..f0e97a7 100644 --- a/galactory/utilities.py +++ b/galactory/utilities.py @@ -6,9 +6,7 @@ import math import hashlib -from collections import namedtuple from tempfile import SpooledTemporaryFile -from contextlib import contextmanager from urllib.request import urlopen from urllib3 import Retry from requests.adapters import HTTPAdapter @@ -188,11 +186,23 @@ def lcm(a, b, *more): return abs(a * z) // math.gcd(a, z) -HashedTempFile = namedtuple('HashedTempFile', ('handle', 'md5', 'sha1', 'sha256')) +class HashedTempFile(): + def __init__(self, handle, md5, sha1, sha256, close=True) -> None: + self.handle = handle + self.md5 = md5 + self.sha1 = sha1 + self.sha256 = sha256 + self._close = close + def __enter__(self): + return self -@contextmanager -def _chunk_to_temp(fsrc, iterator=None, spool_size=5*1024*1024, seek_to_zero=True, chunk_multiplier=64) -> HashedTempFile: + def __exit__(self, exc_type, exc_value, exc_tb): + if self._close: + self.handle.close() + + +def _chunk_to_temp(fsrc, iterator=None, spool_size=5*1024*1024, seek_to_zero=True, chunk_multiplier=64, close=True) -> HashedTempFile: md5sum = hashlib.md5() sha1sum = hashlib.sha1() sha256sum = hashlib.sha256() @@ -201,14 +211,15 @@ def _chunk_to_temp(fsrc, iterator=None, spool_size=5*1024*1024, seek_to_zero=Tru it = iter(lambda: fsrc.read(chunk_size), b'') if iterator is None else iterator(chunk_size) - with SpooledTemporaryFile(max_size=spool_size) as tmp: - for chunk in it: - md5sum.update(chunk) - sha1sum.update(chunk) - sha256sum.update(chunk) - tmp.write(chunk) + tmp = SpooledTemporaryFile(max_size=spool_size) + + for chunk in it: + md5sum.update(chunk) + sha1sum.update(chunk) + sha256sum.update(chunk) + tmp.write(chunk) - if seek_to_zero: - tmp.seek(0) + if seek_to_zero: + tmp.seek(0) - yield HashedTempFile(tmp, md5sum.hexdigest(), sha1sum.hexdigest(), sha256sum.hexdigest()) + return HashedTempFile(tmp, md5sum.hexdigest(), sha1sum.hexdigest(), sha256sum.hexdigest(), close=close)