From 8f6113a8b30b2bb84bee8ecd18f3c12e891f148f Mon Sep 17 00:00:00 2001
From: Will Ronchetti
Date: Thu, 25 Jul 2024 13:08:38 -0400
Subject: [PATCH] Fix Order Bug (#301)

* fix order bug

* beta

* more logging, new beta

* fix version

* fix xl order call

* add index to rid column

* version
---
 CHANGELOG.rst                                 |  7 +++++++
 pyproject.toml                                |  2 +-
 snovault/commands/create_mapping_on_deploy.py |  2 +-
 snovault/dev_servers.py                       |  2 ++
 snovault/elasticsearch/create_mapping.py      | 16 +++++++++-------
 snovault/storage.py                           |  3 ++-
 6 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5efa7ebd6..699504c09 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,13 @@ snovault
 Change Log
 ----------
 
+11.20.0
+=======
+
+* Bug fix: use loadxl_order() in staggered reindexing
+* Add B-tree index to rid column in propsheets to optimize revision history retrieval
+
+
 11.19.0
 =======
 
diff --git a/pyproject.toml b/pyproject.toml
index 4fdb2c1a3..0eccd7f7a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicsnovault"
-version = "11.19.0"
+version = "11.20.0"
 description = "Storage support for 4DN Data Portals."
 authors = ["4DN-DCIC Team "]
 license = "MIT"
diff --git a/snovault/commands/create_mapping_on_deploy.py b/snovault/commands/create_mapping_on_deploy.py
index 52e7b89fd..b85f4543e 100644
--- a/snovault/commands/create_mapping_on_deploy.py
+++ b/snovault/commands/create_mapping_on_deploy.py
@@ -66,7 +66,7 @@ def _run_create_mapping(app, args):
             log.info('Overriding deploy_cfg and wiping ES')
             deploy_cfg['WIPE_ES'] = True
         run_create_mapping(app, check_first=(not deploy_cfg['WIPE_ES']), purge_queue=args.clear_queue,
-                           item_order=loadxl_order)
+                           item_order=loadxl_order())
     except Exception as e:
         log.error("Exception encountered while gathering deployment information or running create_mapping")
         log.error(str(e))
diff --git a/snovault/dev_servers.py b/snovault/dev_servers.py
index 53d738322..82ee76065 100644
--- a/snovault/dev_servers.py
+++ b/snovault/dev_servers.py
@@ -213,6 +213,8 @@ def cleanup_process():
 
     # now clear the queues and queue items for indexing
     create_mapping.run(app, check_first=True, strict=True, purge_queue=False)
+    # To test prod setup:
+    # create_mapping.reindex_by_type_staggered(app)
 
     PRINT('Started. ^C to exit.')
 
diff --git a/snovault/elasticsearch/create_mapping.py b/snovault/elasticsearch/create_mapping.py
index 159e2d4e3..c968b3a62 100644
--- a/snovault/elasticsearch/create_mapping.py
+++ b/snovault/elasticsearch/create_mapping.py
@@ -33,6 +33,7 @@
 # from ..commands.es_index_data import run as run_index_data
 from ..schema_utils import combine_schemas
 from ..tools import make_indexer_testapp
+from ..project_app import app_project
 from ..util import (
     add_default_embeds, IndexSettings,
     NUM_SHARDS, NUM_REPLICAS, SEARCH_MAX, KW_IGNORE_ABOVE, MIN_NGRAM,
@@ -874,11 +875,11 @@ def build_index(app, es, index_name, in_type, mapping, uuids_to_index, dry_run,
     start = timer()
     coll_uuids = set(get_uuids_for_types(app.registry, types=[in_type]))
     end = timer()
-    log.info('Time to get collection uuids: %s' % str(end-start), cat='fetch time',
-             duration=str(end-start), collection=in_type)
+    log.warning('Time to get collection uuids: %s' % str(end-start), cat='fetch time',
+                duration=str(end-start), collection=in_type)
     uuids_to_index[in_type] = coll_uuids
-    log.info('MAPPING: will queue all %s items in the new index %s for reindexing' %
-             (len(coll_uuids), in_type), cat='items to queue', count=len(coll_uuids), collection=in_type)
+    log.warning('MAPPING: will queue all %s items in the new index %s for reindexing' %
+                (len(coll_uuids), in_type), cat='items to queue', count=len(coll_uuids), collection=in_type)
 
 
 def check_if_index_exists(es, in_type):
@@ -1383,7 +1384,7 @@ def reindex_by_type_staggered(app):
     registry = app.registry
     es = registry[ELASTIC_SEARCH]
     indexer_queue = registry[INDEXER_QUEUE]
-    all_types = registry[COLLECTIONS].by_item_type
+    all_types = app_project().loadxl_order()
     log.warning('Running staggered create_mapping command - wiping and reindexing indices sequentially'
                 ' to minimize downtime.')
 
@@ -1396,10 +1397,11 @@ def reindex_by_type_staggered(app):
         uuids = {}
         build_index(app, es, namespaced_index, i_type, mapping, uuids, False)
         mapping_end = timer()
-        to_index_list = flatten_and_sort_uuids(app.registry, uuids, None)
+        to_index_list = flatten_and_sort_uuids(app.registry, uuids, None)  # none here is fine since we pre-ordered
         indexer_queue.add_uuids(app.registry, to_index_list, strict=True,
                                 target_queue='secondary')
-        log.warning(f'Queued type {i_type} in {mapping_end - current_start}')
+        log.warning(f'Queued type {i_type} ({len(to_index_list)} total items) in {mapping_end - current_start}')
+        log.warning(f'First 10 items: {list(uuid for uuid in to_index_list[0:10])}')
         time.sleep(10)  # give queue some time to catch up
         while not indexer_queue.queue_is_empty():
             time.sleep(10)  # check every 10 seconds
diff --git a/snovault/storage.py b/snovault/storage.py
index a5eae7926..556e718c2 100644
--- a/snovault/storage.py
+++ b/snovault/storage.py
@@ -784,11 +784,12 @@ class PropertySheet(Base):
     )
     # The sid column also serves as the order.
    sid = Column(types.Integer, autoincrement=True, primary_key=True)
+    # B-tree index on rid here greatly optimizes retrieval of revision history
     rid = Column(UUID,
                  ForeignKey('resources.rid',
                             deferrable=True,
                             initially='DEFERRED'),
-                 nullable=False)
+                 nullable=False, index=True)
     name = Column(types.String, nullable=False)
     properties = Column(JSON)
     resource = orm.relationship('Resource')
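
For context on the snovault/storage.py change above, here is a minimal, self-contained sketch of how SQLAlchemy's index=True on a foreign-key column produces a B-tree index (the Postgres default index type) that speeds up revision-history lookups filtered by rid and ordered by sid. The toy classes, table layout, and 'some-uuid' value below are illustrative only, not snovault's actual model.

# Illustrative toy model mirroring the propsheets change; not snovault's actual classes.
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Resource(Base):
    __tablename__ = 'resources'
    rid = Column(String, primary_key=True)

class PropSheet(Base):
    __tablename__ = 'propsheets'
    # The sid column doubles as revision order, as in the patch.
    sid = Column(Integer, autoincrement=True, primary_key=True)
    # index=True emits a CREATE INDEX on rid (a B-tree by default in Postgres),
    # so fetching every revision of one resource avoids a full table scan.
    rid = Column(String, ForeignKey('resources.rid'), nullable=False, index=True)
    name = Column(String, nullable=False)

engine = create_engine('sqlite:///:memory:')
Base.metadata.create_all(engine)  # also creates the ix_propsheets_rid index

with Session(engine) as session:
    # Revision-history style query: all property sheets for one resource, in sid order.
    history = (session.query(PropSheet)
               .filter(PropSheet.rid == 'some-uuid')
               .order_by(PropSheet.sid)
               .all())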